diff --git a/CONTRIBUTING_MAC.md b/CONTRIBUTING_MAC.md
new file mode 100644
index 00000000..1fcf6f38
--- /dev/null
+++ b/CONTRIBUTING_MAC.md
@@ -0,0 +1,17 @@
+# Contributing as a Mac User
+
+## Installation
+
+For M1 Macs (Apple Silicon), various [online sources](https://github.com/scipy/scipy/issues/13409) recommend using OpenBLAS as a native (not Rosetta) linear algebra backend for dependencies like NumPy.
+```sh
+brew install openblas
+export OPENBLAS=$(/opt/homebrew/bin/brew --prefix openblas)
+```
+
+As [documented by Ray](https://docs.ray.io/en/latest/ray-overview/installation.html#m1-mac-apple-silicon-support), one must install miniforge, a community-driven Conda installer, to get packages built for `arm64`. In particular, the `grpcio` package from pip will not work with Ray, so uninstall it (`pip uninstall grpcio`) and replace it with the one from miniforge (`conda install grpcio`).
+
+
+## Testing
+
+`tests/core/backend/test_backend_init.py::test_head_detection`
+As [documented by Ray](https://github.com/ray-project/ray/issues/24130), the default behavior for fetching node IP addresses differs by platform: on Linux, a private address (e.g. `192.168.0.1`) is returned, while on Windows and macOS, `127.0.0.1` is returned. Because NumS is primarily released for Linux, this test will fail for Mac contributors, but it shouldn't affect development.
diff --git a/nums/core/array/application.py b/nums/core/array/application.py
index 48654617..9bc47aae 100644
--- a/nums/core/array/application.py
+++ b/nums/core/array/application.py
@@ -230,7 +230,11 @@ def loadtxt(
     def scalar(self, value):
         return BlockArray.from_scalar(value, self.km)
 
-    def array(self, array: Union[np.ndarray, sparse.COO, List[float]], block_shape: tuple = None):
+    def array(
+        self,
+        array: Union[np.ndarray, sparse.COO, List[float]],
+        block_shape: tuple = None,
+    ):
         if not isinstance(array, (np.ndarray, sparse.COO)):
             if array_utils.is_array_like(array):
                 array = np.array(array)
@@ -917,7 +921,7 @@ def map_uop(
         # TODO(hme): Faster to create ndarray first,
         # and instantiate block array on return
         # to avoid instantiating blocks on BlockArray initialization.
-        rarr.blocks[grid_entry] = arr.blocks[grid_entry].uop_map(
+        rarr.blocks[grid_entry] = arr.blocks[grid_entry].map_uop(
            op_name, args=args, kwargs=kwargs
        )
        return rarr
@@ -928,7 +932,7 @@ def matmul(self, arr_1: BlockArray, arr_2: BlockArray) -> BlockArray:
     def tensordot(
         self, arr_1: BlockArray, arr_2: BlockArray, axes: int = 2
     ) -> BlockArray:
-        return arr_1.tensordot(arr_2, axes)
+        return arr_1.tensordot(arr_1, arr_2, axes)
 
     def einsum(self, subscript, *operands):
         def _compute_syskwargs(blocks):
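Note on the `tensordot` change above: later in this diff, `BlockArray.tensordot` becomes a static method taking both operands, so calling it through `arr_1` with `arr_1` repeated as the first argument is intentional. A minimal, self-contained sketch of the pattern (toy classes, not NumS's actual implementation):

```python
# Illustrative sketch of static-method binary dispatch; ArrayBase/DenseArray
# are hypothetical stand-ins, not NumS classes.
class ArrayBase:
    @staticmethod
    def tensordot(a, b, axes=2):
        raise NotImplementedError()


class DenseArray(ArrayBase):
    @staticmethod
    def tensordot(a, b, axes=2):
        # Both operands are explicit parameters, so dense/sparse subclasses
        # can each own the full logic for either operand order.
        return ("tensordot", a, b, axes)


a, b = DenseArray(), DenseArray()
# Calling through the instance still passes both operands explicitly,
# which is why `arr_1.tensordot(arr_1, arr_2, axes)` above is well-formed.
assert a.tensordot(a, b, 1) == DenseArray.tensordot(a, b, 1)
```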
diff --git a/nums/core/array/base.py b/nums/core/array/base.py
index 9d0e34c5..a09bbbb1 100644
--- a/nums/core/array/base.py
+++ b/nums/core/array/base.py
@@ -13,9 +13,7 @@
 # limitations under the License.
 
 
-# pylint: disable = protected-access
-
-
+import warnings
 import numpy as np
 
 from nums.core.array import utils as array_utils
@@ -23,11 +21,11 @@
 from nums.core.grid.grid import ArrayGrid
 
 
-class Block:
-    # pylint: disable=redefined-builtin, global-statement
-    # TODO(hme): Create a base class, and move this concrete class into blockarray.py.
-    # Do this when we implement a SparseBlock object.
+# pylint: disable=protected-access, redefined-builtin
+# pylint: disable=too-many-lines
+
+class BlockBase:
     block_id_counter = -1
 
     def __init__(
@@ -40,7 +38,7 @@ def __init__(
         km: KernelManager,
         id=None,
     ):
-        self._km = km
+        self.km = km
         self.grid_entry: tuple = grid_entry
         self.grid_shape: tuple = grid_shape
         self.oid: np.object = None
@@ -54,33 +52,35 @@ def __init__(
         self.id = Block.block_id_counter
         # Set if a device id was used to compute this block.
         self._device = None
-        self.fill_value = None
 
     @property
-    def nnz(self):
-        return np.prod(self.shape)
+    def is_dense(self):
+        raise NotImplementedError()
 
     @property
-    def is_dense(self):
-        return self.fill_value is None
+    def nbytes(self):
+        raise NotImplementedError()
 
     def __repr__(self):
-        return "Block(" + str(self.oid) + ")"
+        raise NotImplementedError()
 
     def size(self):
         return np.product(self.shape)
 
     def copy(self, shallow=True):
-        assert shallow, "Only shallow copies are currently supported."
-        block = Block(
-            self.grid_entry,
-            self.grid_shape,
-            self.shape,
-            self.dtype,
-            self.transposed,
-            self._km,
+        raise NotImplementedError()
+
+    def get(self):
+        return self.km.get(self.oid)
+
+    def astype(self, dtype):
+        block = self.copy()
+        block.dtype = dtype
+        block.oid = self.km.astype(
+            self.oid,
+            dtype.__name__,
+            syskwargs={"grid_entry": block.grid_entry, "grid_shape": block.grid_shape},
         )
-        block.oid = self.oid
         return block
 
     def true_grid_entry(self):
@@ -93,26 +93,19 @@ def true_grid_shape(self):
             return tuple(reversed(self.grid_shape))
         return self.grid_shape
 
-    def device(self):
-        if self._device is not None:
-            return self._device
-        return self._km.device_grid.get_device(
-            self.true_grid_entry(), self.true_grid_shape()
-        )
-
     def transpose(self, defer=False, redistribute=False):
         # If defer is True, this operation does not modify the remote object.
         # If defer is True and redistribute is False,
         # this operation does not move the remote object.
         grid_entryT = tuple(reversed(self.grid_entry))
         grid_shapeT = tuple(reversed(self.grid_shape))
-        blockT = Block(
+        blockT = type(self)(
             grid_entry=grid_entryT,
             grid_shape=grid_shapeT,
             shape=tuple(reversed(self.shape)),
             dtype=self.dtype,
             transposed=not self.transposed,
-            km=self._km,
+            km=self.km,
         )
         blockT.oid = self.oid
         if not defer:
@@ -124,9 +117,16 @@ def transpose(self, defer=False, redistribute=False):
                 "grid_entry": self.grid_entry,
                 "grid_shape": self.grid_shape,
             }
-            blockT.oid = self._km.transpose(self.oid, syskwargs=syskwargs)
+            blockT.oid = self.km.transpose(self.oid, syskwargs=syskwargs)
         return blockT
 
+    def device(self):
+        if self._device is not None:
+            return self._device
+        return self.km.device_grid.get_device(
+            self.true_grid_entry(), self.true_grid_shape()
+        )
+
     def swapaxes(self, axis1, axis2):
         block = self.copy()
         grid_entry = list(block.grid_entry)
@@ -141,7 +141,7 @@ def swapaxes(self, axis1, axis2):
         block.grid_shape = tuple(grid_shape)
         block.shape = tuple(shape)
 
-        block.oid = self._km.swapaxes(
+        block.oid = self.km.swapaxes(
             block.oid,
             axis1,
             axis2,
@@ -149,48 +149,20 @@ def swapaxes(self, axis1, axis2):
         )
         return block
 
+    def map_uop(self, op_name, args=None, kwargs=None, device=None):
+        raise NotImplementedError()
+
     def ufunc(self, op_name, device=None):
-        return self.uop_map(op_name, device=device)
+        return self.map_uop(op_name, device=device)
 
-    def uop_map(self, op_name, args=None, kwargs=None, device=None):
-        # This retains transpose.
-        block = self.copy()
-        block.dtype = array_utils.get_uop_output_type(op_name, self.dtype)
-        args = () if args is None else args
-        kwargs = {} if kwargs is None else kwargs
-        if device is None:
-            syskwargs = {"grid_entry": block.grid_entry, "grid_shape": block.grid_shape}
-        else:
-            syskwargs = {"device": device}
-            block._device = device
-        block.oid = self._km.map_uop(
-            op_name, self.oid, args, kwargs, syskwargs=syskwargs
-        )
-        return block
+    def block_from_scalar(self, other):
+        raise NotImplementedError()
 
-    def _block_from_other(self, other):
-        # Assume other is numeric.
-        # This only occurs during some numpy operations (e.g. np.mean),
-        # where a literal is used in the operation.
-        assert isinstance(other, (int, float, np.int, np.float))
-        block = Block(
-            self.grid_entry,
-            self.grid_shape,
-            (1,),
-            self.dtype,
-            False,
-            self._km,
-        )
-        # We pass syskwargs here for correct node placement for `other`,
-        # which should be local to self.
-        block.oid = self._km.put(
-            np.array(other, dtype=self.dtype),
-            syskwargs={
-                "grid_entry": self.grid_entry,
-                "grid_shape": self.grid_shape,
-            },
-        )
-        return block
+    def conjugate(self):
+        return self.ufunc("conjugate")
+
+    def sqrt(self):
+        return self.ufunc("sqrt")
 
     @staticmethod
     def block_meta(op, block1, block2, args):
@@ -227,77 +199,105 @@ def block_meta(op, block1, block2, args):
 
     @staticmethod
     def init_block(op_name, block1, block2, args, device=None):
-        result_grid_entry, result_grid_shape, result_shape, dtype = Block.block_meta(
-            op_name, block1, block2, args
-        )
-        block = Block(
-            grid_entry=result_grid_entry,
-            grid_shape=result_grid_shape,
-            shape=result_shape,
-            dtype=dtype,
-            transposed=False,
-            km=block1._km,
-        )
-        block._device = device
-        return block
+        raise NotImplementedError()
+
+    def _check_bop_implemented(self, other):
+        raise NotImplementedError()
+
+    @staticmethod
+    def binary_op(op_name, a, b, args: dict, device=None):
+        raise NotImplementedError()
 
     def bop(self, op_name, other, args: dict, device=None):
-        if not isinstance(other, Block):
-            other = self._block_from_other(other)
-        block: Block = self.init_block(op_name, self, other, args, device)
-        if device is None:
-            syskwargs = {"grid_entry": block.grid_entry, "grid_shape": block.grid_shape}
-        else:
-            syskwargs = {"device": device}
-        block.oid = self._km.bop(
-            op_name,
-            self.oid,
-            other.oid,
-            self.transposed,
-            other.transposed,
-            axes=args.get("axes"),
-            syskwargs=syskwargs,
-        )
-        return block
+        raise NotImplementedError()
 
     def tensordot(self, other, axes):
-        return self.bop("tensordot", other, args={"axes": axes})
+        raise NotImplementedError()
 
     def __add__(self, other):
-        return self.bop("add", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("add", self, other, args={})
+
+    def __radd__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("add", other, self, args={})
 
     def __sub__(self, other):
-        return self.bop("sub", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("sub", self, other, args={})
+
+    def __rsub__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("sub", other, self, args={})
 
     def __mul__(self, other):
-        return self.bop("mul", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("mul", self, other, args={})
 
-    def __matmul__(self, other):
-        return self.tensordot(other, axes=1)
+    def __rmul__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("mul", other, self, args={})
 
     def __truediv__(self, other):
-        return self.bop("truediv", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("truediv", self, other, args={})
+
+    def __rtruediv__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("truediv", other, self, args={})
 
     def __pow__(self, other):
-        return self.bop("pow", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("pow", self, other, args={})
+
+    def __rpow__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("pow", other, self, args={})
+
+    def __matmul__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.tensordot(other, axes=1)
 
     def __ge__(self, other):
-        return self.bop("ge", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("ge", self, other, args={})
 
     def __gt__(self, other):
-        return self.bop("gt", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("gt", self, other, args={})
 
     def __le__(self, other):
-        return self.bop("le", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("le", self, other, args={})
 
     def __lt__(self, other):
-        return self.bop("lt", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("lt", self, other, args={})
 
     def __eq__(self, other):
-        return self.bop("eq", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("eq", self, other, args={})
 
     def __ne__(self, other):
-        return self.bop("ne", other, args={})
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.binary_op("ne", self, other, args={})
 
     __iadd__ = __add__
     __isub__ = __sub__
@@ -306,24 +306,134 @@ def __ne__(self, other):
     __itruediv__ = __truediv__
     __ipow__ = __pow__
 
-    def astype(self, dtype):
+
+class Block(BlockBase):
+    # pylint: disable=redefined-builtin, global-statement
+
+    @property
+    def is_dense(self):
+        return True
+
+    @property
+    def nbytes(self):
+        return np.prod(self.shape) * np.dtype(self.dtype).itemsize
+
+    def __repr__(self):
+        return "Block(" + str(self.oid) + ")"
+
+    def copy(self, shallow=True):
+        assert shallow, "Only shallow copies are currently supported."
+        block = Block(
+            self.grid_entry,
+            self.grid_shape,
+            self.shape,
+            self.dtype,
+            self.transposed,
+            self.km,
+        )
+        block.oid = self.oid
+        return block
+
+    def map_uop(self, op_name, args=None, kwargs=None, device=None):
+        # This retains transpose.
         block = self.copy()
-        block.dtype = dtype
-        block.oid = self._km.astype(
-            self.oid,
-            dtype.__name__,
-            syskwargs={"grid_entry": block.grid_entry, "grid_shape": block.grid_shape},
+        block.dtype = array_utils.get_uop_output_type(op_name, self.dtype)
+        args = () if args is None else args
+        kwargs = {} if kwargs is None else kwargs
+        if device is None:
+            syskwargs = {"grid_entry": block.grid_entry, "grid_shape": block.grid_shape}
+        else:
+            syskwargs = {"device": device}
+            block._device = device
+        block.oid = self.km.map_uop(
+            op_name, self.oid, args, kwargs, syskwargs=syskwargs
         )
         return block
 
-    def conjugate(self):
-        return self.ufunc("conjugate")
+    def block_from_scalar(self, other):
+        # Assume other is numeric.
+        # This only occurs during some numpy operations (e.g. np.mean),
+        # where a literal is used in the operation.
+        assert array_utils.is_scalar(other)
+        block = Block(
+            self.grid_entry,
+            self.grid_shape,
+            (1,),
+            self.dtype,
+            False,
+            self.km,
+        )
+        # We pass syskwargs here for correct node placement for `other`,
+        # which should be local to self.
+        block.oid = self.km.put(
+            np.array(other, dtype=self.dtype),
+            syskwargs={
+                "grid_entry": self.grid_entry,
+                "grid_shape": self.grid_shape,
+            },
+        )
+        return block
 
-    def sqrt(self):
-        return self.ufunc("sqrt")
+    @staticmethod
+    def init_block(op_name, block1, block2, args, device=None):
+        (
+            result_grid_entry,
+            result_grid_shape,
+            result_shape,
+            dtype,
+        ) = BlockBase.block_meta(op_name, block1, block2, args)
+        block = Block(
+            grid_entry=result_grid_entry,
+            grid_shape=result_grid_shape,
+            shape=result_shape,
+            dtype=dtype,
+            transposed=False,
+            km=block1.km,
+        )
+        block._device = device
+        return block
 
-    def get(self):
-        return self._km.get(self.oid)
+    def _check_bop_implemented(self, other):
+        if isinstance(other, Block) or array_utils.is_scalar(other):
+            return True
+        return False
+
+    @staticmethod
+    def binary_op(op_name, a, b, args: dict, device=None):
+        if isinstance(a, Block) and array_utils.is_scalar(b):
+            b = a.block_from_scalar(b)
+        elif isinstance(b, Block) and array_utils.is_scalar(a):
+            a = b.block_from_scalar(a)
+        if not isinstance(a, Block) or not isinstance(b, Block):
+            raise NotImplementedError()
+
+        block: Block = a.init_block(op_name, a, b, args, device)
+        if device is None:
+            syskwargs = {"grid_entry": block.grid_entry, "grid_shape": block.grid_shape}
+        else:
+            syskwargs = {"device": device}
+        block.oid = a.km.bop(
+            op_name,
+            a.oid,
+            b.oid,
+            a.transposed,
+            b.transposed,
+            axes=args.get("axes"),
+            syskwargs=syskwargs,
+        )
+        return block
+
+    def bop(self, op_name, other, args: dict, device=None):
+        try:
+            return self.binary_op(op_name, self, other, args, device)
+        except NotImplementedError:
+            return other.binary_op(op_name, self, other, args, device)
+
+    def tensordot(self, other, axes):
+        try:
+            return self.binary_op("tensordot", self, other, args={"axes": axes})
+        except NotImplementedError:
+            return other.binary_op("tensordot", self, other, args={"axes": axes})
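The dunders introduced in `BlockBase` lean on Python's binary-operator protocol: returning `NotImplemented` makes the interpreter retry with the right operand's reflected method, which is how dense and sparse blocks hand operations to each other. A runnable toy illustration (`Dense`/`Sparse` are hypothetical stand-ins, not NumS classes):

```python
class Dense:
    def __add__(self, other):
        if not isinstance(other, Dense):
            return NotImplemented  # tell Python to try other.__radd__
        return "dense + dense"


class Sparse:
    def __radd__(self, other):
        # Invoked by Python after Dense.__add__ returns NotImplemented.
        return "sparse handles dense + sparse"


assert Dense() + Dense() == "dense + dense"
assert Dense() + Sparse() == "sparse handles dense + sparse"
```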
 
 
 class BlockArrayBase:
@@ -336,40 +446,31 @@ def __init__(self, grid: ArrayGrid, km: KernelManager, blocks: np.ndarray = None):
         self.size = np.product(self.shape)
         self.ndim = len(self.shape)
         self.dtype = self.grid.dtype
-        try:
-            self.nbytes = self.grid.nbytes()
-        except ValueError as _:
-            self.nbytes = None
         self.blocks = blocks
-        if self.blocks is None:
-            # TODO (hme): Subclass np.ndarray for self.blocks instances,
-            # and override key methods to better integrate with NumPy's ufuncs.
-            self.blocks = np.empty(shape=self.grid.grid_shape, dtype=Block)
-            for grid_entry in self.grid.get_entry_iterator():
-                self.blocks[grid_entry] = Block(
-                    grid_entry=grid_entry,
-                    grid_shape=self.grid.grid_shape,
-                    shape=self.grid.get_block_shape(grid_entry),
-                    dtype=self.dtype,
-                    transposed=False,
-                    km=self.km,
-                )
-        self.fill_value = None
-
-    @property
-    def nnz(self):
-        return np.prod(self.shape)
 
     @property
     def is_dense(self):
-        return self.fill_value is None
+        raise NotImplementedError()
 
     def __repr__(self):
         return "BlockArray(" + str(self.blocks) + ")"
 
+    def __getattr__(self, item):
+        if item == "__array_priority__" or item == "__array_struct__":
+            # This is triggered by a numpy array on the LHS.
+            raise TypeError(
+                "Unexpected conversion attempt from BlockArrayBase to ndarray."
+            )
+        elif item == "ndim":
+            return len(self.shape)
+        elif item == "T":
+            return self.transpose()
+        else:
+            raise NotImplementedError(item)
+
     def get(self) -> np.ndarray:
         result: np.ndarray = np.zeros(shape=self.grid.shape, dtype=self.grid.dtype)
-        block_shape: np.ndarray = np.array(self.grid.block_shape, dtype=np.int)
+        block_shape: np.ndarray = np.array(self.grid.block_shape, dtype=np.int64)
         arrays: list = self.km.get(
             [
                 self.blocks[grid_entry].oid
@@ -378,16 +479,83 @@ def get(self) -> np.ndarray:
         )
         for block_index, grid_entry in enumerate(self.grid.get_entry_iterator()):
             start = block_shape * grid_entry
-            entry_shape = np.array(self.grid.get_block_shape(grid_entry), dtype=np.int)
+            entry_shape = np.array(
+                self.grid.get_block_shape(grid_entry), dtype=np.int64
+            )
             end = start + entry_shape
             slices = tuple(map(lambda item: slice(*item), zip(*(start, end))))
-            block: Block = self.blocks[grid_entry]
+            block: BlockBase = self.blocks[grid_entry]
             arr: np.ndarray = arrays[block_index]
             if block.transposed:
                 arr = arr.T
             result[slices] = arr.reshape(block.shape)
         return result
 
+    def touch(self):
+        """
+        "Touch" an array. This is an efficient distributed "wait" operation.
+        """
+        oids = []
+        for grid_entry in self.grid.get_entry_iterator():
+            block: BlockBase = self.blocks[grid_entry]
+            oids.append(
+                self.km.touch(
+                    block.oid,
+                    syskwargs={
+                        "grid_entry": block.grid_entry,
+                        "grid_shape": block.grid_shape,
+                    },
+                )
+            )
+        self.km.get(oids)
+        return self
+
+    def copy(self):
+        raise NotImplementedError()
+
+    def astype(self, dtype):
+        raise NotImplementedError()
+
+    def flattened_oids(self):
+        oids = []
+        for grid_entry in self.grid.get_entry_iterator():
+            oid = self.blocks[grid_entry].oid
+            oids.append(oid)
+        return oids
+
+    @classmethod
+    def empty(cls, shape, block_shape, dtype, km: KernelManager):
+        raise NotImplementedError()
+
+    @staticmethod
+    def to_block_array(obj, km: KernelManager, block_shape=None):
+        raise NotImplementedError()
+
+    def transpose(self, defer=False, redistribute=False):
+        """
+        Transpose this matrix. Only use defer with arithmetic operations.
+        Setting redistribute to True may significantly impact performance.
+        :param defer: When true, the transpose operation will be applied
+        with the next arithmetic operation.
+        :param redistribute: If defer is false, setting this to true will
+        redistribute the data according to the device grid (data placement policy).
+        This parameter has no effect when defer is true.
+        :return: The transposed matrix.
+        """
+        if defer and redistribute:
+            warnings.warn("defer is True, redistribute=True will be ignored.")
+        metaT = self.grid.to_meta()
+        metaT["shape"] = tuple(reversed(metaT["shape"]))
+        metaT["block_shape"] = tuple(reversed(metaT["block_shape"]))
+        gridT = ArrayGrid.from_meta(metaT)
+        rarrT = type(self)(gridT, self.km)
+        rarrT.blocks = np.copy(self.blocks.T)
+        for grid_entry in rarrT.grid.get_entry_iterator():
+            rarrT.blocks[grid_entry] = rarrT.blocks[grid_entry].transpose(
+                defer, redistribute
+            )
+        return rarrT
+
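Since `defer=True` only flips per-block metadata, a deferred transpose is effectively free until the next kernel call. A pure-NumPy sketch of the same idea (illustrative only; NumS tracks the flag per remote block object):

```python
import numpy as np


# Sketch of deferred transpose: flip a flag instead of moving data, and let
# the consumer apply .T at kernel-execution time.
class LazyT:
    def __init__(self, data, transposed=False):
        self.data, self.transposed = data, transposed

    def transpose(self):
        # O(1): shares the buffer, only metadata changes.
        return LazyT(self.data, not self.transposed)

    def matmul(self, other):
        a = self.data.T if self.transposed else self.data
        b = other.data.T if other.transposed else other.data
        return a @ b


x = LazyT(np.arange(6.0).reshape(2, 3))
gram = x.transpose().matmul(x)  # X.T @ X without copying X
assert gram.shape == (3, 3)
```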
+ """ + if defer and redistribute: + warnings.warn("defer is True, redistribute=True will be ignored.") + metaT = self.grid.to_meta() + metaT["shape"] = tuple(reversed(metaT["shape"])) + metaT["block_shape"] = tuple(reversed(metaT["block_shape"])) + gridT = ArrayGrid.from_meta(metaT) + rarrT = type(self)(gridT, self.km) + rarrT.blocks = np.copy(self.blocks.T) + for grid_entry in rarrT.grid.get_entry_iterator(): + rarrT.blocks[grid_entry] = rarrT.blocks[grid_entry].transpose( + defer, redistribute + ) + return rarrT + def broadcast_to(self, shape): b = array_utils.broadcast(self.shape, shape) result_block_shape = array_utils.broadcast_block_shape( @@ -410,3 +578,286 @@ def broadcast_to(self, shape): broadcast = it.itviews[0] result.blocks = broadcast return result + + def reduce_axis(self, op_name, axis, keepdims=False): + raise NotImplementedError() + + def tree_reduce( + self, op_name, blocks_or_oids, result_grid_entry, result_grid_shape, *args + ): + raise NotImplementedError() + + def check_or_convert_other(self, other, compute_block_shape=False): + raise NotImplementedError() + + def _check_bop_implemented(self, other): + raise NotImplementedError() + + # All operators: https://docs.python.org/3/library/operator.html + + ################# + # Unary functions + ################# + + def ufunc(self, op_name): + raise NotImplementedError() + + def __neg__(self): + return self.ufunc("negative") + + def __pos__(self): + return self + + def __abs__(self): + return self.ufunc("abs") + + def __invert__(self): + return self.ufunc("invert") + + ################# + # Arithmetic + ################# + + @staticmethod + def elementwise(op_name, a, b): + raise NotImplementedError() + + def __mod__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("mod", self, other) + + def __rmod__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("mod", other, self) + + __imod__ = __mod__ + + def __add__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("add", self, other) + + def __radd__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("add", other, self) + + __iadd__ = __add__ + + def __sub__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("sub", self, other) + + def __rsub__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("sub", other, self) + + __isub__ = __sub__ + + def __mul__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("mul", self, other) + + def __rmul__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("mul", other, self) + + __imul__ = __mul__ + + def __truediv__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("truediv", self, other) + + def __rtruediv__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("truediv", other, self) + + __itruediv__ = __truediv__ + + def __floordiv__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("floor_divide", self, other) + + def __rfloordiv__(self, other): + if not self._check_bop_implemented(other): + return 
+
+    def __floordiv__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("floor_divide", self, other)
+
+    def __rfloordiv__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("floor_divide", other, self)
+
+    __ifloordiv__ = __floordiv__
+
+    def __pow__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("pow", self, other)
+
+    def __rpow__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("pow", other, self)
+
+    __ipow__ = __pow__
+
+    ##################
+    # Boolean
+    ##################
+
+    # TODO (hme): Type check bool ops.
+    def __bool__(self):
+        # pylint: disable=no-member
+        if np.sum(self.shape) == len(self.shape):
+            # If all ones or scalar, then this is defined.
+            return self.get().__bool__()
+        return True
+
+    def __or__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("bitwise_or", self, other)
+
+    def __ror__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("bitwise_or", other, self)
+
+    __ior__ = __or__
+
+    def __and__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("bitwise_and", self, other)
+
+    def __rand__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("bitwise_and", other, self)
+
+    __iand__ = __and__
+
+    def __xor__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("bitwise_xor", self, other)
+
+    def __rxor__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("bitwise_xor", other, self)
+
+    __ixor__ = __xor__
+
+    def __lshift__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("left_shift", self, other)
+
+    def __rlshift__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("left_shift", other, self)
+
+    __ilshift__ = __lshift__
+
+    def __rshift__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("right_shift", self, other)
+
+    def __rrshift__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.elementwise("right_shift", other, self)
+
+    __irshift__ = __rshift__
+
+    #################
+    # Linear Algebra
+    #################
+
+    def _compute_tensordot_syskwargs(
+        self, self_block: BlockBase, other_block: BlockBase
+    ):
+        # Schedule on larger block.
+        if np.product(self_block.shape) >= np.product(other_block.shape):
+            return self_block.true_grid_entry(), self_block.true_grid_shape()
+        else:
+            return other_block.true_grid_entry(), other_block.true_grid_shape()
+
+    @staticmethod
+    def tensordot(a, b, axes=2):
+        raise NotImplementedError()
+
+    def __matmul__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        if len(self.shape) > 2:
+            # TODO (bcp): NumPy's implementation does a stacked matmul, which is not supported yet.
+            raise NotImplementedError(
+                "Matrix multiply for tensors of rank > 2 not supported yet."
+            )
+        else:
+            return self.tensordot(self, other, 1)
+
+    def __rmatmul__(self, other):
+        if not self._check_bop_implemented(other):
+            return NotImplemented
+        return self.tensordot(other, self, 1)
+
+    __imatmul__ = __matmul__
+
+    #################
+    # Inequalities
+    #################
+
+    def __inequality__(self, op_name, other):
+        raise NotImplementedError()
+
+    def __ge__(self, other):
+        return self.__inequality__("ge", other)
+
+    def __rge__(self, other):
+        other = self.check_or_convert_other(other)
+        return other.__inequality__("ge", self)
+
+    def __gt__(self, other):
+        return self.__inequality__("gt", other)
+
+    def __rgt__(self, other):
+        other = self.check_or_convert_other(other)
+        return other.__inequality__("gt", self)
+
+    def __le__(self, other):
+        return self.__inequality__("le", other)
+
+    def __rle__(self, other):
+        other = self.check_or_convert_other(other)
+        return other.__inequality__("le", self)
+
+    def __lt__(self, other):
+        return self.__inequality__("lt", other)
+
+    def __rlt__(self, other):
+        other = self.check_or_convert_other(other)
+        return other.__inequality__("lt", self)
+
+    def __eq__(self, other):
+        return self.__inequality__("eq", other)
+
+    def __req__(self, other):
+        other = self.check_or_convert_other(other)
+        return other.__inequality__("eq", self)
+
+    def __ne__(self, other):
+        return self.__inequality__("ne", other)
+
+    def __rne__(self, other):
+        other = self.check_or_convert_other(other)
+        return other.__inequality__("ne", self)
diff --git a/nums/core/array/blockarray.py b/nums/core/array/blockarray.py
index 46a7daf3..d6dabd6a 100644
--- a/nums/core/array/blockarray.py
+++ b/nums/core/array/blockarray.py
@@ -17,10 +17,9 @@
 import itertools
 
 import numpy as np
-import nums.core.array.sparse as array_sparse
 
 from nums.core.array import utils as array_utils
-from nums.core.array.base import BlockArrayBase, Block
+from nums.core.array.base import BlockBase, Block, BlockArrayBase
 from nums.core.array.view import ArrayView
 from nums.core.grid.grid import ArrayGrid
 from nums.core.kernel.kernel_manager import KernelManager
@@ -30,6 +29,32 @@
 
 
 class BlockArray(BlockArrayBase):
+    def __init__(self, grid: ArrayGrid, km: KernelManager, blocks: np.ndarray = None):
+        if blocks is not None:
+            assert blocks.dtype == Block, "BlockArray must be initialized with Blocks"
+        super().__init__(grid, km, blocks)
+        try:
+            self.nbytes = self.grid.nbytes()
+        except ValueError as _:
+            self.nbytes = None
+        if self.blocks is None:
+            # TODO (hme): Subclass np.ndarray for self.blocks instances,
+            # and override key methods to better integrate with NumPy's ufuncs.
+            self.blocks = np.empty(shape=self.grid.grid_shape, dtype=Block)
+            for grid_entry in self.grid.get_entry_iterator():
+                self.blocks[grid_entry] = Block(
+                    grid_entry=grid_entry,
+                    grid_shape=self.grid.grid_shape,
+                    shape=self.grid.get_block_shape(grid_entry),
+                    dtype=self.dtype,
+                    transposed=False,
+                    km=self.km,
+                )
+
+    @property
+    def is_dense(self):
+        return True
+
     @classmethod
     def empty(cls, shape, block_shape, dtype, km: KernelManager):
         return BlockArray.create("empty", shape, block_shape, dtype, km)
@@ -115,24 +140,12 @@ def copy(self):
             rarr_copy.blocks[grid_entry] = self.blocks[grid_entry].copy()
         return rarr_copy
 
-    def touch(self):
-        """
-        "Touch" an array. This is an efficient distributed "wait" operation.
- """ - oids = [] - for grid_entry in self.grid.get_entry_iterator(): - block: Block = self.blocks[grid_entry] - oids.append( - self.km.touch( - block.oid, - syskwargs={ - "grid_entry": block.grid_entry, - "grid_shape": block.grid_shape, - }, - ) - ) - self.km.get(oids) - return self + def astype(self, dtype): + grid = ArrayGrid(self.shape, self.block_shape, dtype.__name__) + result = BlockArray(grid, self.km) + for grid_entry in result.grid.get_entry_iterator(): + result.blocks[grid_entry] = self.blocks[grid_entry].astype(dtype) + return result def is_single_block(self): return self.blocks.size == 1 @@ -223,42 +236,6 @@ def swapaxes(self, axis1, axis2): rarr_swap = BlockArray(grid_swap, self.km, rarr_src) return rarr_swap - def transpose(self, defer=False, redistribute=False): - """ - Transpose this matrix. Only use defer with arithmetic operations. - Setting redistribute to True may significantly impact performance. - :param defer: When true, the transpose operation will be applied - with the next arithmetic operation. - :param redistribute: If defer is false, setting this to true will - redistribute the data according to the device grid (data placement policy). - This parameter has no effect when defer is true. - :return: The transposed matrix. - """ - if defer and redistribute: - warnings.warn("defer is True, redistribute=True will be ignored.") - metaT = self.grid.to_meta() - metaT["shape"] = tuple(reversed(metaT["shape"])) - metaT["block_shape"] = tuple(reversed(metaT["block_shape"])) - gridT = ArrayGrid.from_meta(metaT) - rarrT = BlockArray(gridT, self.km) - rarrT.blocks = np.copy(self.blocks.T) - for grid_entry in rarrT.grid.get_entry_iterator(): - rarrT.blocks[grid_entry] = rarrT.blocks[grid_entry].transpose( - defer, redistribute - ) - return rarrT - - def __getattr__(self, item): - if item == "__array_priority__" or item == "__array_struct__": - # This is triggered by a numpy array on the LHS. - raise TypeError("Unexpected conversion attempt from BlockArray to ndarray.") - elif item == "ndim": - return len(self.shape) - elif item == "T": - return self.transpose() - else: - raise NotImplementedError(item) - def _preprocess_subscript(self, item): if not isinstance(item, tuple): ss = (item,) @@ -267,7 +244,7 @@ def _preprocess_subscript(self, item): # We need to fetch any block arrays. tmp = [] for entry in ss: - if isinstance(entry, BlockArray): + if isinstance(entry, BlockArrayBase): val = entry.get() else: val = entry @@ -323,7 +300,7 @@ def __getitem__(self, item): av: ArrayView = ArrayView.from_block_array(self) # TODO (hme): We don't have to create, but do so for now until we need to optimize. - return av[ss].create(BlockArray) + return av[ss].create() def _advanced_single_array_select(self, ss: tuple, axis: int = 0): # Create output array along the axis of the selection operation. 
@@ -355,7 +332,7 @@ def _advanced_single_array_select(self, ss: tuple, axis: int = 0):
                 shape.append(self.shape[i])
                 block_shape.append(self.block_shape[i])
 
-        dst_arr = BlockArray(
+        dst_arr = type(self)(
             ArrayGrid(
                 shape=tuple(shape),
                 block_shape=tuple(block_shape),
@@ -424,10 +401,11 @@ def _advanced_single_array_select(self, ss: tuple, axis: int = 0):
         return dst_arr
 
     def __setitem__(self, key, value):
-        value: BlockArray = BlockArray.to_block_array(value, self.km)
+        value: BlockArrayBase = self.to_block_array(value, self.km)
         ss, is_handled_advanced, axis = self._preprocess_subscript(key)
         if is_handled_advanced:
             return self._advanced_single_array_assign(ss, value, axis)
+
         av: ArrayView = ArrayView.from_block_array(self)
         av[key] = value
 
@@ -445,7 +423,7 @@ def _advanced_single_array_assign(
         # 2. value is scalar or 1-dimensional.
         # We currently don't support the case where value may broadcasted if it has more dims.
         # This should be a straight-forward future task.
-        value: BlockArray = value
+        value: BlockArrayBase = value
         mode = None
         if len(value.shape) == 0:
             # subscripted value per block will broadcast to other dimensions.
@@ -526,7 +504,7 @@ def _advanced_single_array_assign(
             },
         )
         for dst_grid_entry in dst_arr.grid.get_entry_iterator():
-            dst_block: Block = dst_arr.blocks[dst_grid_entry]
+            dst_block: BlockBase = dst_arr.blocks[dst_grid_entry]
             dst_coord: tuple = dst_arr.grid.get_entry_coordinates(dst_grid_entry)
 
             # Make sure index values in subscript are within bounds of dst_arr.
@@ -553,7 +531,7 @@ def _advanced_single_array_assign(
                 continue
 
             if mode == "scalar":
-                src_block: Block = src_arr.blocks.item()
+                src_block: BlockBase = src_arr.blocks.item()
                 src_coord: tuple = src_arr.grid.get_entry_coordinates(
                     src_block.grid_entry
                 )
@@ -571,7 +549,7 @@ def _advanced_single_array_assign(
                 )
             elif mode == "single-dim":
                 for src_grid_entry in src_arr.grid.get_entry_iterator():
-                    src_block: Block = src_arr.blocks[src_grid_entry]
+                    src_block: BlockBase = src_arr.blocks[src_grid_entry]
                     src_coord: tuple = src_arr.grid.get_entry_coordinates(
                         src_grid_entry
                     )
@@ -597,7 +575,7 @@ def _advanced_single_array_assign(
                         + [j]
                         + list(dst_grid_entry[axis + 1 :])
                     )
-                    src_block: Block = src_arr.blocks[src_grid_entry]
+                    src_block: BlockBase = src_arr.blocks[src_grid_entry]
                    src_coord: tuple = src_arr.grid.get_entry_coordinates(
                        src_grid_entry
                    )
@@ -635,60 +613,19 @@ def check_or_convert_other(self, other, compute_block_shape=False):
         block_shape = None if compute_block_shape else self.block_shape
         return BlockArray.to_block_array(other, self.km, block_shape=block_shape)
 
+    def _check_bop_implemented(self, other):
+        if isinstance(other, (BlockArray, np.ndarray, list)) or array_utils.is_scalar(
+            other
+        ):
+            return True
+        return False
+
     def ufunc(self, op_name):
         result = self.copy()
         for grid_entry in self.grid.get_entry_iterator():
             result.blocks[grid_entry] = self.blocks[grid_entry].ufunc(op_name)
         return result
 
-    def tree_reduce(
-        self, op_name, blocks_or_oids, result_grid_entry, result_grid_shape
-    ):
-        """
-        Basic tree reduce imp.
-        Schedules op on same node as left operand.
-        :param op_name: The reduction op.
-        :param blocks_or_oids: A list of type Block or a list of tuples.
-        Tuples must be of the form
-            (oid, grid_entry, grid_shape, transposed)
-        :param result_grid_entry: The grid entry of the result block. This will be used
-        to compute the final reduction step.
-        :param result_grid_shape: The grid entry of the result block. This will be used
-        to compute the final reduction step.
-        :return: The oid of the result.
- """ - oid_list = blocks_or_oids - if isinstance(blocks_or_oids[0], Block): - oid_list = [ - (b.oid, b.grid_entry, b.grid_shape, b.transposed) - for b in blocks_or_oids - ] - if len(oid_list) == 1: - return oid_list[0][0] - q = oid_list - while len(q) > 1: - a_oid, a_ge, a_gs, a_T = q.pop(0) - b_oid, _, _, b_T = q.pop(0) - ge, gs = ( - (result_grid_entry, result_grid_shape) if len(q) == 0 else (a_ge, a_gs) - ) - c_oid = self.km.bop_reduce( - op_name, - a_oid, - b_oid, - a_T, - b_T, - syskwargs={ - "grid_entry": ge, - "grid_shape": gs, - }, - ) - q.append((c_oid, ge, gs, False)) - r_oid, r_ge, r_gs, _ = q.pop(0) - assert r_ge == result_grid_entry - assert r_gs == result_grid_shape - return r_oid - def reduce_axis(self, op_name, axis, keepdims=False): if not (axis is None or isinstance(axis, (int, np.int32, np.int64))): raise NotImplementedError("Only integer axis is currently supported.") @@ -769,18 +706,115 @@ def reduce_axis(self, op_name, axis, keepdims=False): ) return result + def tree_reduce( + self, op_name, blocks_or_oids, result_grid_entry, result_grid_shape, *args + ): + """ + Basic tree reduce imp. + Schedules op on same node as left operand. + :param op_name: The reduction op. + :param blocks_or_oids: A list of type Block or a list of tuples. + Tuples must be of the form + (oid, grid_entry, grid_shape, transposed) + :param result_grid_entry: The grid entry of the result block. This will be used + to compute the final reduction step. + :param result_grid_shape: The grid entry of the result block. This will be used + to compute the final reduction step. + :return: The oid of the result. + """ + oid_list = blocks_or_oids + if isinstance(blocks_or_oids[0], Block): + oid_list = [ + (b.oid, b.grid_entry, b.grid_shape, b.transposed) + for b in blocks_or_oids + ] + if len(oid_list) == 1: + return oid_list[0][0] + q = oid_list + while len(q) > 1: + a_oid, a_ge, a_gs, a_T = q.pop(0) + b_oid, _, _, b_T = q.pop(0) + ge, gs = ( + (result_grid_entry, result_grid_shape) if len(q) == 0 else (a_ge, a_gs) + ) + c_oid = self.km.bop_reduce( + op_name, + a_oid, + b_oid, + a_T, + b_T, + syskwargs={ + "grid_entry": ge, + "grid_shape": gs, + }, + ) + q.append((c_oid, ge, gs, False)) + r_oid, r_ge, r_gs, _ = q.pop(0) + assert r_ge == result_grid_entry + assert r_gs == result_grid_shape + return r_oid + ################# - # Linear Algebra + # Arithmetic ################# - def _compute_tensordot_syskwargs(self, self_block: Block, other_block: Block): - # Schedule on larger block. - if np.product(self_block.shape) >= np.product(other_block.shape): - return self_block.true_grid_entry(), self_block.true_grid_shape() + @staticmethod + def elementwise(op_name, a, b): + if isinstance(a, BlockArray): + b = a.check_or_convert_other(b) + elif isinstance(b, BlockArray): + a = b.check_or_convert_other(a) else: - return other_block.true_grid_entry(), other_block.true_grid_shape() + raise NotImplementedError() + + if a.shape == b.shape and a.block_shape == b.block_shape: + return BlockArray._fast_elementwise(op_name, a, b) + blocks_op = a.blocks.__getattribute__("__%s__" % op_name) + return BlockArray.from_blocks(blocks_op(b.blocks), result_shape=None, km=a.km) + + @staticmethod + def _fast_elementwise(op_name, a, b): + """ + Implements fast scheduling for basic element-wise operations. + """ + dtype = array_utils.get_bop_output_type(op_name, a.dtype, b.dtype) + # Schedule the op first. 
+
     #################
-    # Linear Algebra
+    # Arithmetic
     #################
 
-    def _compute_tensordot_syskwargs(self, self_block: Block, other_block: Block):
-        # Schedule on larger block.
-        if np.product(self_block.shape) >= np.product(other_block.shape):
-            return self_block.true_grid_entry(), self_block.true_grid_shape()
+    @staticmethod
+    def elementwise(op_name, a, b):
+        if isinstance(a, BlockArray):
+            b = a.check_or_convert_other(b)
+        elif isinstance(b, BlockArray):
+            a = b.check_or_convert_other(a)
         else:
-            return other_block.true_grid_entry(), other_block.true_grid_shape()
+            raise NotImplementedError()
+
+        if a.shape == b.shape and a.block_shape == b.block_shape:
+            return BlockArray._fast_elementwise(op_name, a, b)
+        blocks_op = a.blocks.__getattribute__("__%s__" % op_name)
+        return BlockArray.from_blocks(blocks_op(b.blocks), result_shape=None, km=a.km)
+
+    @staticmethod
+    def _fast_elementwise(op_name, a, b):
+        """
+        Implements fast scheduling for basic element-wise operations.
+        """
+        dtype = array_utils.get_bop_output_type(op_name, a.dtype, b.dtype)
+        # Schedule the op first.
+        blocks = np.empty(shape=a.grid.grid_shape, dtype=Block)
+        for grid_entry in a.grid.get_entry_iterator():
+            a_block: Block = a.blocks[grid_entry]
+            b_block: Block = b.blocks[grid_entry]
+            blocks[grid_entry] = block = Block(
+                grid_entry=grid_entry,
+                grid_shape=a_block.grid_shape,
+                shape=a_block.shape,
+                dtype=dtype,
+                transposed=False,
+                km=a.km,
+            )
+            block.oid = a.km.bop(
+                op_name,
+                a_block.oid,
+                b_block.oid,
+                a_block.transposed,
+                b_block.transposed,
+                axes={},
+                syskwargs={
+                    "grid_entry": grid_entry,
+                    "grid_shape": a.grid.grid_shape,
+                },
+            )
+        return BlockArray(
+            ArrayGrid(a.shape, a.block_shape, dtype.__name__),
+            a.km,
+            blocks=blocks,
+        )
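`_fast_elementwise` skips broadcasting entirely when both operands share a shape and a block shape, pairing blocks one-to-one. A NumPy-only sketch of that aligned-grid pairing (plain arrays stand in for remote blocks):

```python
import numpy as np


# Aligned-grid elementwise sketch: when two block grids match exactly,
# the op is just a zip over corresponding blocks (no broadcasting logic).
def fast_elementwise(op, a_blocks, b_blocks):
    out = np.empty(a_blocks.shape, dtype=object)
    for idx in np.ndindex(a_blocks.shape):
        out[idx] = op(a_blocks[idx], b_blocks[idx])
    return out


a = np.empty((2, 2), dtype=object)
b = np.empty((2, 2), dtype=object)
for idx in np.ndindex(a.shape):
    a[idx] = np.ones((3, 3))
    b[idx] = np.full((3, 3), 2.0)
c = fast_elementwise(np.add, a, b)
assert (c[0, 0] == 3.0).all()
```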
 
-    def tensordot(self, other, axes=2):
+    #################
+    # Linear Algebra
+    #################
+
+    @staticmethod
+    def tensordot(a, b, axes=2):
         if isinstance(axes, int):
             pass
         elif array_utils.is_array_like(axes):
@@ -788,59 +822,59 @@ def tensordot(self, other, axes=2):
         else:
             raise TypeError(f"Unexpected axes type '{type(axes).__name__}'")
 
-        other = self.check_or_convert_other(other, compute_block_shape=True)
+        if isinstance(a, BlockArray):
+            b = a.check_or_convert_other(b, compute_block_shape=True)
+        elif isinstance(b, BlockArray):
+            a = b.check_or_convert_other(a, compute_block_shape=True)
+        else:
+            raise NotImplementedError()
 
-        if array_utils.np_tensordot_param_test(
-            self.shape, self.ndim, other.shape, other.ndim, axes
-        ):
+        if array_utils.np_tensordot_param_test(a.shape, a.ndim, b.shape, b.ndim, axes):
             raise ValueError("shape-mismatch for sum")
 
         if axes > 0:
-            this_axes = self.grid.grid_shape[:-axes]
-            this_sum_axes = self.grid.grid_shape[-axes:]
-            other_axes = other.grid.grid_shape[axes:]
-            other_sum_axes = other.grid.grid_shape[:axes]
-            assert this_sum_axes == other_sum_axes
-            result_shape = tuple(self.shape[:-axes] + other.shape[axes:])
-            result_block_shape = tuple(
-                self.block_shape[:-axes] + other.block_shape[axes:]
-            )
+            a_axes = a.grid.grid_shape[:-axes]
+            a_sum_axes = a.grid.grid_shape[-axes:]
+            b_axes = b.grid.grid_shape[axes:]
+            b_sum_axes = b.grid.grid_shape[:axes]
+            assert a_sum_axes == b_sum_axes
+            result_shape = tuple(a.shape[:-axes] + b.shape[axes:])
+            result_block_shape = tuple(a.block_shape[:-axes] + b.block_shape[axes:])
         else:
-            this_axes = self.grid.grid_shape
-            other_axes = other.grid.grid_shape
-            this_sum_axes = ()
-            result_shape = tuple(self.shape + other.shape)
-            result_block_shape = tuple(self.block_shape + other.block_shape)
+            a_axes = a.grid.grid_shape
+            b_axes = b.grid.grid_shape
+            a_sum_axes = ()
+            result_shape = tuple(a.shape + b.shape)
+            result_block_shape = tuple(a.block_shape + b.block_shape)
 
         result_grid = ArrayGrid(
             shape=result_shape,
             block_shape=result_block_shape,
             dtype=array_utils.get_bop_output_type(
-                "tensordot", self.dtype, other.dtype
+                "tensordot", a.dtype, b.dtype
             ).__name__,
         )
-        assert result_grid.grid_shape == tuple(this_axes + other_axes)
-        result = BlockArray(result_grid, self.km)
-        this_dims = list(itertools.product(*map(range, this_axes)))
-        other_dims = list(itertools.product(*map(range, other_axes)))
-        sum_dims = list(itertools.product(*map(range, this_sum_axes)))
-        for i in this_dims:
-            for j in other_dims:
+        assert result_grid.grid_shape == tuple(a_axes + b_axes)
+        result = BlockArray(result_grid, a.km)
+        a_dims = list(itertools.product(*map(range, a_axes)))
+        b_dims = list(itertools.product(*map(range, b_axes)))
+        sum_dims = list(itertools.product(*map(range, a_sum_axes)))
+        for i in a_dims:
+            for j in b_dims:
                 grid_entry = tuple(i + j)
                 result_block: Block = result.blocks[grid_entry]
                 sum_oids = []
                 for k in sum_dims:
-                    self_block: Block = self.blocks[tuple(i + k)]
-                    other_block: Block = other.blocks[tuple(k + j)]
-                    dot_grid_args = self._compute_tensordot_syskwargs(
-                        self_block, other_block
-                    )
-                    dotted_oid = self.km.bop(
+                    a_block: Block = a.blocks[tuple(i + k)]
+                    b_block: Block = b.blocks[tuple(k + j)]
+                    # pylint: disable=protected-access
+                    dot_grid_args = a._compute_tensordot_syskwargs(a_block, b_block)
+                    dotted_oid = a.km.bop(
                         "tensordot",
-                        self_block.oid,
-                        other_block.oid,
-                        self_block.transposed,
-                        other_block.transposed,
+                        a_block.oid,
+                        b_block.oid,
+                        a_block.transposed,
+                        b_block.transposed,
                         axes=axes,
                         syskwargs={
                             "grid_entry": dot_grid_args[0],
@@ -850,177 +884,19 @@ def tensordot(self, other, axes=2):
                             "grid_shape": dot_grid_args[1],
                         },
                    )
                    sum_oids.append(
                        (dotted_oid, dot_grid_args[0], dot_grid_args[1], False)
                    )
-                result_block.oid = self.tree_reduce(
+                result_block.oid = a.tree_reduce(
                     "sum", sum_oids, result_block.grid_entry, result_block.grid_shape
                 )
         return result
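For `axes=1` this reduces to blocked matrix multiply: each output block `(i, j)` tree-reduces the partial products `a[i, k] @ b[k, j]` over `k`. A self-contained NumPy sketch of the same loop structure (local arrays instead of object refs):

```python
import numpy as np


# Blocked matmul sketch mirroring the grid loop above: each output block
# (i, j) is the sum over k of a_blocks[i, k] @ b_blocks[k, j], which the
# real code computes remotely and combines with tree_reduce("sum", ...).
def blocked_matmul(a_blocks, b_blocks):
    grid_i, grid_k = a_blocks.shape
    grid_j = b_blocks.shape[1]
    out = np.empty((grid_i, grid_j), dtype=object)
    for i in range(grid_i):
        for j in range(grid_j):
            out[i, j] = sum(a_blocks[i, k] @ b_blocks[k, j] for k in range(grid_k))
    return out


a = np.empty((2, 2), dtype=object)
b = np.empty((2, 2), dtype=object)
for i in range(2):
    for j in range(2):
        a[i, j] = np.eye(2)
        b[i, j] = np.ones((2, 2))
c = blocked_matmul(a, b)
assert np.allclose(c[0, 0], 2 * np.ones((2, 2)))
```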
 
-    def __matmul__(self, other):
-        if len(self.shape) > 2:
-            # TODO (bcp): NumPy's implementation does a stacked matmul, which is not supported yet.
-            raise NotImplementedError(
-                "Matrix multiply for tensors of rank > 2 not supported yet."
-            )
-        else:
-            return self.tensordot(other, 1)
-
-    def __rmatmul__(self, other):
-        other = self.check_or_convert_other(other)
-        return other @ self
-
-    __imatmul__ = __matmul__
-
-    #################
-    # Arithmetic
-    #################
-
-    def _fast_element_wise(self, op_name, other):
-        """
-        Implements fast scheduling for basic element-wise operations.
-        """
-        dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype)
-        # Schedule the op first.
-        blocks = np.empty(shape=self.grid.grid_shape, dtype=Block)
-        for grid_entry in self.grid.get_entry_iterator():
-            self_block: Block = self.blocks[grid_entry]
-            other_block: Block = other.blocks[grid_entry]
-            blocks[grid_entry] = block = Block(
-                grid_entry=grid_entry,
-                grid_shape=self_block.grid_shape,
-                shape=self_block.shape,
-                dtype=dtype,
-                transposed=False,
-                km=self.km,
-            )
-            block.oid = self.km.bop(
-                op_name,
-                self_block.oid,
-                other_block.oid,
-                self_block.transposed,
-                other_block.transposed,
-                axes={},
-                syskwargs={
-                    "grid_entry": grid_entry,
-                    "grid_shape": self.grid.grid_shape,
-                },
-            )
-        return BlockArray(
-            ArrayGrid(self.shape, self.block_shape, dtype.__name__),
-            self.km,
-            blocks=blocks,
-        )
-
-    def __elementwise__(self, op_name, other):
-        other = self.check_or_convert_other(other)
-        if self.shape == other.shape and self.block_shape == other.block_shape:
-            return self._fast_element_wise(op_name, other)
-        blocks_op = self.blocks.__getattribute__("__%s__" % op_name)
-        return BlockArray.from_blocks(
-            blocks_op(other.blocks), result_shape=None, km=self.km
-        )
-
-    def __neg__(self):
-        return self.ufunc("negative")
-
-    def __pos__(self):
-        return self
-
-    def __abs__(self):
-        return self.ufunc("abs")
-
-    def __mod__(self, other):
-        return self.__elementwise__("mod", other)
-
-    def __rmod__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("mod", self)
-
-    __imod__ = __mod__
-
-    def __add__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        return self.__elementwise__("add", other)
-
-    def __radd__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("add", self)
-
-    __iadd__ = __add__
-
-    def __sub__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        return self.__elementwise__("sub", other)
-
-    def __rsub__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("sub", self)
-
-    __isub__ = __sub__
-
-    def __mul__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        return self.__elementwise__("mul", other)
-
-    def __rmul__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("mul", self)
-
-    __imul__ = __mul__
-
-    def __truediv__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        return self.__elementwise__("truediv", other)
-
-    def __rtruediv__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        other = self.check_or_convert_other(other)
-        return other / self
-
-    __itruediv__ = __truediv__
-
-    def __floordiv__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        return self.__elementwise__("floor_divide", other)
-
-    def __rfloordiv__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("floor_divide", self)
-
-    __ifloordiv__ = __floordiv__
-
-    def __pow__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        return self.__elementwise__("pow", other)
-
-    def __rpow__(self, other):
-        if isinstance(other, array_sparse.SparseBlockArray):
-            return NotImplemented
-        other = self.check_or_convert_other(other)
-        return other**self
-
-    __ipow__ = __pow__
-
     #################
     # Inequalities
     #################
 
-    def __inequality__(self, op, other):
+    def __inequality__(self, op_name, other):
         other = self.check_or_convert_other(other)
+        if other is NotImplemented:
+            return NotImplemented
         assert (
             other.shape == () or other.shape == self.shape
         ), "Currently supports comparison with scalars only."
@@ -1037,129 +913,10 @@ def __inequality__(self, op_name, other):
             else:
                 other_block: Block = other.blocks[grid_entry]
             result.blocks[grid_entry] = self.blocks[grid_entry].bop(
-                op, other_block, args={}
+                op_name, other_block, args={}
             )
-
         return result
 
-    def __ge__(self, other):
-        return self.__inequality__("ge", other)
-
-    def __rge__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("ge", self)
-
-    def __gt__(self, other):
-        return self.__inequality__("gt", other)
-
-    def __rgt__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("gt", self)
-
-    def __le__(self, other):
-        return self.__inequality__("le", other)
-
-    def __rle__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("le", self)
-
-    def __lt__(self, other):
-        return self.__inequality__("lt", other)
-
-    def __rlt__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("lt", self)
-
-    def __eq__(self, other):
-        return self.__inequality__("eq", other)
-
-    def __req__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("eq", self)
-
-    def __ne__(self, other):
-        return self.__inequality__("ne", other)
-
-    def __rne__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("ne", self)
-
-    ##################
-    # Boolean
-    ##################
-
-    # TODO (hme): Type check bool ops.
-    def __bool__(self):
-        # pylint: disable=no-member
-        if np.sum(self.shape) == len(self.shape):
-            # If all ones or scalar, then this is defined.
-            return self.get().__bool__()
-        return True
-
-    def __invert__(self):
-        return self.ufunc("invert")
-
-    def __or__(self, other):
-        return self.__elementwise__("bitwise_or", other)
-
-    def __ror__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("bitwise_or", self)
-
-    __ior__ = __or__
-
-    def __and__(self, other):
-        return self.__elementwise__("bitwise_and", other)
-
-    def __rand__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("bitwise_and", self)
-
-    __iand__ = __and__
-
-    def __xor__(self, other):
-        return self.__elementwise__("bitwise_xor", other)
-
-    def __rxor__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("bitwise_xor", self)
-
-    __ixor__ = __xor__
-
-    def __lshift__(self, other):
-        return self.__elementwise__("left_shift", other)
-
-    def __rlshift__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("left_shift", self)
-
-    __ilshift__ = __lshift__
-
-    def __rshift__(self, other):
-        return self.__elementwise__("right_shift", other)
-
-    def __rrshift__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("right_shift", self)
-
-    __irshift__ = __rshift__
-
-    # All operators: https://docs.python.org/3/library/operator.html
-
-    def astype(self, dtype):
-        grid = ArrayGrid(self.shape, self.block_shape, dtype.__name__)
-        result = BlockArray(grid, self.km)
-        for grid_entry in result.grid.get_entry_iterator():
-            result.blocks[grid_entry] = self.blocks[grid_entry].astype(dtype)
-        return result
-
-    def flattened_oids(self):
-        oids = []
-        for grid_entry in self.grid.get_entry_iterator():
-            oid = self.blocks[grid_entry].oid
-            oids.append(oid)
-        return oids
-
 
 class Reshape:
     @staticmethod
@@ -1239,7 +996,7 @@ def _arbitrary_reshape(self, arr: BlockArray, shape, block_shape) -> BlockArray:
         # Generate index mappings per block, and group source indices to minimize
         # RPCs and generation of new objects.
         km = arr.km
-        dst_arr = BlockArray.empty(
+        dst_arr = type(arr).empty(
             shape=shape, block_shape=block_shape, dtype=arr.dtype, km=km
         )
         for dst_grid_entry in dst_arr.grid.get_entry_iterator():
@@ -1268,7 +1025,7 @@ def _arbitrary_reshape(self, arr: BlockArray, shape, block_shape) -> BlockArray:
         return dst_arr
 
     def _block_shape_reshape(self, arr, block_shape):
-        rarr: BlockArray = BlockArray.empty(arr.shape, block_shape, arr.dtype, arr.km)
+        rarr: BlockArray = type(arr).empty(arr.shape, block_shape, arr.dtype, arr.km)
         for grid_entry in rarr.grid.get_entry_iterator():
             grid_entry_slice = rarr.grid.get_slice(grid_entry)
             # TODO (hme): This could be less costly.
diff --git a/nums/core/array/random.py b/nums/core/array/random.py
index 3452860d..5e417270 100644
--- a/nums/core/array/random.py
+++ b/nums/core/array/random.py
@@ -220,23 +220,21 @@ def sparse_randint(
         self,
         low,
         high=None,
-        dtype=int,
+        dtype=None,
         shape=None,
         block_shape=None,
         p=0.01,
-        fill_value=0,
     ):
         if dtype is None:
-            dtype = np.float64
+            dtype = np.int64
         assert isinstance(dtype, type)
         return self._sparse_sample_basic(
             "randint",
+            {"low": low, "high": high, "dtype": dtype},
             shape,
             block_shape,
             dtype,
-            {"low": low, "high": high, "dtype": dtype},
             p,
-            fill_value,
         )
 
     def sparse_uniform(
@@ -247,16 +245,14 @@ def sparse_uniform(
         block_shape=None,
         dtype=None,
         p=0.01,
-        fill_value=0,
     ):
         return self._sparse_sample_basic(
             "uniform",
+            {"low": low, "high": high},
             shape,
             block_shape,
             dtype,
-            {"low": low, "high": high},
             p,
-            fill_value,
         )
 
     def sparse_normal(
@@ -267,27 +263,24 @@ def sparse_normal(
         block_shape=None,
         dtype=None,
         p=0.01,
-        fill_value=0,
     ):
         return self._sparse_sample_basic(
             "normal",
+            {"loc": loc, "scale": scale},
             shape,
             block_shape,
             dtype,
-            {"loc": loc, "scale": scale},
             p,
-            fill_value,
         )
 
     def _sparse_sample_basic(
         self,
         rfunc_name,
+        rfunc_args: Dict,
         shape,
         block_shape,
         dtype,
-        rfunc_args: Dict,
         p,
-        fill_value,
     ) -> SparseBlockArray:
         if shape is None:
             assert block_shape is None
@@ -300,7 +293,7 @@ def _sparse_sample_basic(
         assert isinstance(dtype, type)
         assert "size" not in rfunc_args
         grid: ArrayGrid = ArrayGrid(shape, block_shape, dtype=dtype.__name__)
-        sba: SparseBlockArray = SparseBlockArray(grid, self._km, fill_value)
+        sba: SparseBlockArray = SparseBlockArray(grid, self._km)
         for grid_entry in sba.grid.get_entry_iterator():
             rng_params = list(self._rng.new_block_rng_params())
             this_block_shape = grid.get_block_shape(grid_entry)
@@ -316,8 +309,8 @@ def _sparse_sample_basic(
                 this_block_shape,
                 dtype,
                 p,
-                fill_value,
                 syskwargs=syskwargs,
             )
+            # pylint: disable=protected-access
             block._nnz = sba.km.sparse_nnz(block.oid, syskwargs=syskwargs)
         return sba
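With `fill_value` removed, the samplers always produce zero-filled COO blocks, and `dtype` now defaults per sampler (`np.int64` for `sparse_randint`). A hedged usage sketch; the handle `rs` below is a hypothetical instance of this random-state class, and exact entry points may differ:

```python
# Hypothetical usage; `rs` stands in for an instance of the random-state
# class above. Parameter names match the reworked signatures in this diff.
sba = rs.sparse_randint(
    low=0,
    high=10,
    shape=(1000, 1000),
    block_shape=(100, 100),
    p=0.01,  # expected density of sampled nonzeros per block
)
print(sba.nnz)  # aggregated lazily from per-block nnz results
```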
diff --git a/nums/core/array/sparse.py b/nums/core/array/sparse.py
index fb71b0bd..118edcdc 100644
--- a/nums/core/array/sparse.py
+++ b/nums/core/array/sparse.py
@@ -1,17 +1,20 @@
 from typing import List
+import itertools
+import warnings
+
+import numpy as np
+import sparse
 
 from nums.core.array import utils as array_utils
-from nums.core.array.base import BlockArrayBase, Block
+from nums.core.array.base import BlockBase, Block, BlockArrayBase
 from nums.core.array.blockarray import BlockArray
 from nums.core.kernel.kernel_manager import KernelManager
 from nums.core.grid.grid import ArrayGrid
-import numpy as np
-import itertools
-import sparse
 
 
-# TODO: merge with Block
-# TODO: RHS binary operations
-class SparseBlock(Block):
+# pylint: disable=protected-access, redefined-builtin
+
+
+class SparseBlock(BlockBase):
     def __init__(
         self,
         grid_entry,
@@ -20,23 +23,23 @@ def __init__(
         dtype,
         transposed,
         km: KernelManager,
-        fill_value=0,
         id=None,
         index_dtype=np.int64,
     ):
         super().__init__(grid_entry, grid_shape, shape, dtype, transposed, km, id)
-        self.fill_value = fill_value
         self.index_dtype = index_dtype
         self.oid = None
         self._nnz: object = None
-        self._nbytes: object = (
-            None  # TODO: implement as lazily fetched exact result, same as nnz
-        )
+        self._nbytes: object = None
+
+    @property
+    def is_dense(self):
+        return False
 
     @property
     def nnz(self):
         if self._nnz is None:
-            self._nnz = self._km.sparse_nnz(
+            self._nnz = self.km.sparse_nnz(
                 self.oid,
                 syskwargs={
                     "grid_entry": self.grid_entry,
@@ -44,30 +47,26 @@ def nnz(self):
                 },
             )
             if not array_utils.is_int(self._nnz):
-                self._nnz = self._km.get(self._nnz)
+                self._nnz = self.km.get(self._nnz)
         return self._nnz
 
-    # TODO: implement as lazily fetched exact result
     @property
     def nbytes(self):
-        return self._estimate_nbytes(format="coo")
-
-    # TODO: deprecate
-    def _estimate_nbytes(self, format=None):
-        if format is None:
-            return self.nnz
-        elif format == "coo":
-            return (
-                self.nnz * np.dtype(self.dtype).itemsize
-                + self.nnz * self.ndim * np.dtype(self.index_dtype).itemsize
+        if self._nbytes is None:
+            self._nbytes = self.km.sparse_nbytes(
+                self.oid,
+                syskwargs={
+                    "grid_entry": self.grid_entry,
+                    "grid_shape": self.grid_shape,
+                },
             )
+            if not array_utils.is_int(self._nbytes):
+                self._nbytes = self.km.get(self._nbytes)
+        return self._nbytes
 
     def __repr__(self):
         return f"SparseBlock({self.oid})"
 
-    def size(self):
-        return np.product(self.shape)
-
     def copy(self, shallow=True):
         assert shallow, "Only shallow copies are currently supported."
         block = SparseBlock(
@@ -76,40 +75,24 @@ def copy(self, shallow=True):
             self.shape,
             self.dtype,
             self.transposed,
-            self._km,
+            self.km,
         )
         block.oid = self.oid
         return block
 
-    def transpose(self, defer=False, redistribute=False):
-        grid_entryT = tuple(reversed(self.grid_entry))
-        grid_shapeT = tuple(reversed(self.grid_shape))
-        blockT = SparseBlock(
-            grid_entry=grid_entryT,
-            grid_shape=grid_shapeT,
-            shape=tuple(reversed(self.shape)),
-            dtype=self.dtype,
-            transposed=not self.transposed,
-            km=self._km,
-        )
-        blockT.oid = self.oid
-        if not defer:
-            blockT.transposed = False
-            if redistribute:
-                syskwargs = {"grid_entry": grid_entryT, "grid_shape": grid_shapeT}
-            else:
-                syskwargs = {
-                    "grid_entry": self.grid_entry,
-                    "grid_shape": self.grid_shape,
-                }
-            blockT.oid = self._km.transpose(self.oid, syskwargs=syskwargs)
-        return blockT
-
-    def ufunc(self, op_name, device=None):
-        return self.uop_map(op_name, device=device)
-
-    def uop_map(self, op_name, args=None, kwargs=None, device=None):
-        block = self.copy()
+    def map_uop(self, op_name, args=None, kwargs=None, device=None):
+        densify = array_utils.get_sparse_uop_densify(op_name)
+        if densify:
+            block = Block(
+                self.grid_entry,
+                self.grid_shape,
+                self.shape,
+                self.dtype,
+                self.transposed,
+                self.km,
+            )
+        else:
+            block = self.copy()
         block.dtype = array_utils.get_uop_output_type(op_name, self.dtype)
         args = () if args is None else args
         kwargs = {} if kwargs is None else kwargs
@@ -118,32 +101,48 @@ def uop_map(self, op_name, args=None, kwargs=None, device=None):
         else:
             syskwargs = {"device": device}
             block._device = device
-        block.oid = self._km.sparse_map_uop(
-            op_name, self.oid, args, kwargs, syskwargs=syskwargs
+        block.oid = self.km.sparse_map_uop(
+            op_name, self.oid, args, kwargs, densify, syskwargs=syskwargs
         )
-        block._nnz = self._km.sparse_nnz(block.oid, syskwargs=syskwargs)
-        block.fill_value = np.__getattribute__(op_name)(self.fill_value)
+        if not densify:
+            block._nnz = self.km.sparse_nnz(block.oid, syskwargs=syskwargs)
+            block._nbytes = self.km.sparse_nbytes(block.oid, syskwargs=syskwargs)
         return block
self._km.sparse_nnz(block.oid, syskwargs=syskwargs) - block.fill_value = np.__getattribute__(op_name)(self.fill_value) + block._nnz = self.km.sparse_nnz(block.oid, syskwargs=syskwargs) + block._nbytes = self.km.sparse_nbytes(block.oid, syskwargs=syskwargs) return block - def _block_from_scalar(self, other): + def block_from_scalar(self, other): assert array_utils.is_scalar(other) - block = SparseBlock( - self.grid_entry, - self.grid_shape, - (1,), - self.dtype, - False, - self._km, - fill_value=other, - ) - # TODO: generalize for different kernels - block.oid = self._km.put( - sparse.COO.from_numpy(np.array(other), fill_value=other), - syskwargs={ - "grid_entry": self.grid_entry, - "grid_shape": self.grid_shape, - }, - ) + # Construct sparse block only if value is 0. + if np.isclose(other, 0): + block = SparseBlock( + self.grid_entry, + self.grid_shape, + (1,), + self.dtype, + False, + self.km, + ) + block.oid = self.km.sparse_block_from_scalar( + other, + syskwargs={ + "grid_entry": self.grid_entry, + "grid_shape": self.grid_shape, + }, + ) + else: + block = Block( + self.grid_entry, + self.grid_shape, + (1,), + self.dtype, + False, + self.km, + ) + block.oid = self.km.block_from_scalar( + other, + syskwargs={ + "grid_entry": self.grid_entry, + "grid_shape": self.grid_shape, + }, + ) return block @staticmethod @@ -153,10 +152,7 @@ def init_block(op_name, block1, block2, args, device=None): result_grid_shape, result_shape, dtype, - ) = Block.block_meta(op_name, block1, block2, args) - fill_value = array_utils.get_bop_fill_value( - op_name, block1.fill_value, block2.fill_value - ) + ) = BlockBase.block_meta(op_name, block1, block2, args) # TODO: what happens when different index_dtype? block = SparseBlock( grid_entry=result_grid_entry, @@ -164,22 +160,30 @@ def init_block(op_name, block1, block2, args, device=None): shape=result_shape, dtype=dtype, transposed=False, - fill_value=fill_value, - km=block1._km, + km=block1.km, ) block._device = device return block - def bop(self, op_name, other, args: dict, device=None): - if not isinstance(other, Block): - other = self._block_from_scalar(other) - densify = array_utils.get_sparse_bop_return_type( - op_name, self.fill_value, other.fill_value - ) + def _check_bop_implemented(self, other): + if isinstance(other, BlockBase) or array_utils.is_scalar(other): + return True + return False + + @staticmethod + def binary_op(op_name, a: BlockBase, b: BlockBase, args: dict, device=None): + if isinstance(a, SparseBlock) and array_utils.is_scalar(b): + b = a.block_from_scalar(b) + elif isinstance(b, SparseBlock) and array_utils.is_scalar(a): + a = b.block_from_scalar(a) + if not isinstance(a, BlockBase) or not isinstance(b, BlockBase): + raise NotImplementedError() + + densify = array_utils.get_sparse_bop_densify(op_name, a.is_dense, b.is_dense) if densify: - block = Block.init_block(op_name, self, other, args, device) + block = Block.init_block(op_name, a, b, args, device) else: - block = SparseBlock.init_block(op_name, self, other, args, device) + block = SparseBlock.init_block(op_name, a, b, args, device) if device is None: syskwargs = { "grid_entry": block.grid_entry, @@ -187,46 +191,40 @@ def bop(self, op_name, other, args: dict, device=None): } else: syskwargs = {"device": device} - block.oid = self._km.sparse_bop( + block.oid = a.km.sparse_bop( op_name, - self.oid, - other.oid, - self.transposed, - other.transposed, + a.oid, + b.oid, + a.transposed, + b.transposed, axes=args.get("axes"), densify=densify, syskwargs=syskwargs, ) if not densify: - 
block._nnz = self._km.sparse_nnz(block.oid, syskwargs=syskwargs) + block._nnz = a.km.sparse_nnz(block.oid, syskwargs=syskwargs) + block._nbytes = a.km.sparse_nbytes(block.oid, syskwargs=syskwargs) return block - # TODO: densify when fill_value != 0 + def bop(self, op_name, other, args: dict, device=None): + return self.binary_op(op_name, self, other, args, device) + def tensordot(self, other, axes): - assert self.fill_value == 0 - if not other.is_dense: - assert other.fill_value == 0 - return self.bop("tensordot", other, args={"axes": axes}) + return self.binary_op("tensordot", self, other, args={"axes": axes}) -# TODO: merge with BlockArray, have decorator for dense-only methods? -class SparseBlockArray(BlockArray): +class SparseBlockArray(BlockArrayBase): def __init__( self, grid: ArrayGrid, km: KernelManager, - fill_value=0, blocks: np.ndarray = None, ): - self.grid = grid - self.km = km - self.shape = self.grid.shape - self.block_shape = self.grid.block_shape - self.grid_shape = self.grid.grid_shape - self.size = np.product(self.shape) - self.ndim = len(self.shape) - self.dtype = self.grid.dtype - self.blocks = blocks + if blocks is not None: + assert ( + blocks.dtype == SparseBlock + ), "SparseBlockArray must be initialized with SparseBlocks" + super().__init__(grid, km, blocks) if self.blocks is None: self.blocks = np.empty(shape=self.grid_shape, dtype=SparseBlock) for grid_entry in self.grid.get_entry_iterator(): @@ -237,19 +235,17 @@ def __init__( self.dtype, False, self.km, - fill_value, ) - self.fill_value = fill_value self._nnz = -1 self._nbytes = -1 @property - def nnz(self): - return self._get_nnz() + def is_dense(self): + return False @property - def nbytes(self): - return self._get_nbytes() + def nnz(self): + return self._get_nnz() def _get_nnz(self): if self._nnz == -1: @@ -258,6 +254,10 @@ def _get_nnz(self): self._nnz += self.blocks[grid_entry].nnz return self._nnz + @property + def nbytes(self): + return self._get_nbytes() + def _get_nbytes(self): if self._nbytes == -1: self._nbytes = 0 @@ -265,11 +265,32 @@ def _get_nbytes(self): self._nbytes += self.blocks[grid_entry].nbytes return self._nbytes + def __repr__(self): + return f"SparseBlockArray({self.blocks})" + + @classmethod + def empty(cls, shape, block_shape, dtype, km: KernelManager): + return SparseBlockArray.create("empty", shape, block_shape, dtype, km) + + @classmethod + def create(cls, create_op_name, shape, block_shape, dtype, km: KernelManager): + grid = ArrayGrid(shape=shape, block_shape=block_shape, dtype=dtype.__name__) + grid_meta = grid.to_meta() + arr = SparseBlockArray(grid, km) + for grid_entry in grid.get_entry_iterator(): + arr.blocks[grid_entry].oid = km.new_sparse_block( + create_op_name, + grid_entry, + grid_meta, + syskwargs={"grid_entry": grid_entry, "grid_shape": grid.grid_shape}, + ) + return arr + @classmethod - def from_np(cls, arr, block_shape, copy, km, fill_value=0): + def from_np(cls, arr, block_shape, copy, km): dtype_str = str(arr.dtype) grid = ArrayGrid(arr.shape, block_shape, dtype_str) - rarr = SparseBlockArray(grid, km, fill_value) + rarr = SparseBlockArray(grid, km) grid_entry_iterator = grid.get_entry_iterator() for grid_entry in grid_entry_iterator: grid_slice = grid.get_slice(grid_entry) @@ -277,7 +298,7 @@ def from_np(cls, arr, block_shape, copy, km, fill_value=0): if copy: block = np.copy(block) # TODO: generalize for different kernels - block = sparse.COO.from_numpy(block, fill_value) + block = sparse.COO.from_numpy(block, fill_value=0) rarr.blocks[grid_entry].oid = 
km.put( block, syskwargs={"grid_entry": grid_entry, "grid_shape": grid.grid_shape}, @@ -286,10 +307,10 @@ def from_np(cls, arr, block_shape, copy, km, fill_value=0): return rarr @classmethod - def from_sparse(cls, arr, block_shape, copy, km, fill_value=0): + def from_sparse(cls, arr, block_shape, copy, km): dtype_str = str(arr.dtype) grid = ArrayGrid(arr.shape, block_shape, dtype_str) - rarr = SparseBlockArray(grid, km, fill_value) + rarr = SparseBlockArray(grid, km) grid_entry_iterator = grid.get_entry_iterator() for grid_entry in grid_entry_iterator: grid_slice = grid.get_slice(grid_entry) @@ -306,14 +327,16 @@ def from_sparse(cls, arr, block_shape, copy, km, fill_value=0): @classmethod def from_scalar(cls, val, km): - if not array_utils.is_scalar(val): - raise ValueError("%s is not a scalar." % val) - return SparseBlockArray.from_np( - np.array(val), (), copy=False, km=km, fill_value=val - ) + # Only create SparseBlockArray with 0s. Other scalars should use dense BlockArray. + if not np.isclose(val, 0): + warnings.warn( + "%s cannot fill SparseBlockArray. Converting to BlockArray." % val + ) + return BlockArray.from_np(np.array(val), (), copy=False, km=km) + return SparseBlockArray.from_np(np.array(val), (), copy=False, km=km) @classmethod - def from_blocks(cls, arr: np.ndarray, result_shape, km, fill_value): + def from_blocks(cls, arr: np.ndarray, result_shape, km): sample_idx = tuple(0 for _ in arr.shape) if isinstance(arr, SparseBlock): sample_block = arr @@ -328,7 +351,7 @@ def from_blocks(cls, arr: np.ndarray, result_shape, km, fill_value): shape=result_shape, block_shape=result_block_shape, dtype=result_dtype_str ) assert arr.shape == result_grid.grid_shape - result = SparseBlockArray(result_grid, km, fill_value) + result = SparseBlockArray(result_grid, km) for grid_entry in result_grid.get_entry_iterator(): if isinstance(arr, SparseBlock): block: SparseBlock = arr @@ -338,12 +361,12 @@ def from_blocks(cls, arr: np.ndarray, result_shape, km, fill_value): return result @classmethod - def from_ba(cls, ba: BlockArrayBase, fill_value=0): + def from_ba(cls, ba: BlockArrayBase): assert ( ba.shape != () ), "from_ba does not support scalar BlockArray. Use from_scalar." 
grid = ArrayGrid(ba.shape, ba.block_shape, ba.dtype.__name__) - sba = SparseBlockArray(grid, ba.km, fill_value) + sba = SparseBlockArray(grid, ba.km) for grid_entry in grid.get_entry_iterator(): block: Block = ba.blocks[grid_entry] sblock: SparseBlock = sba.blocks[grid_entry] @@ -353,11 +376,10 @@ def from_ba(cls, ba: BlockArrayBase, fill_value=0): } sblock.oid = sba.km.dense_to_sparse( block.oid, - fill_value, syskwargs=syskwargs, ) sblock._nnz = sba.km.sparse_nnz(sblock.oid, syskwargs=syskwargs) - sba.fill_value = fill_value + sblock._nbytes = sba.km.sparse_nbytes(sblock.oid, syskwargs=syskwargs) return sba def to_ba(self): @@ -378,11 +400,18 @@ def todense(self) -> BlockArray: def copy(self): grid_copy = self.grid.from_meta(self.grid.to_meta()) - rarr_copy = SparseBlockArray(grid_copy, self.km, self.fill_value) + rarr_copy = SparseBlockArray(grid_copy, self.km) for grid_entry in grid_copy.get_entry_iterator(): rarr_copy.blocks[grid_entry] = self.blocks[grid_entry].copy() return rarr_copy + def astype(self, dtype): + grid = ArrayGrid(self.shape, self.block_shape, dtype.__name__) + result = BlockArray(grid, self.km) + for grid_entry in result.grid.get_entry_iterator(): + result.blocks[grid_entry] = self.blocks[grid_entry].astype(dtype) + return result + @staticmethod def to_block_array(obj, km: KernelManager, block_shape=None): if isinstance(obj, (BlockArray, SparseBlockArray)): @@ -397,176 +426,270 @@ def to_block_array(obj, km: KernelManager, block_shape=None): raise Exception("Unsupported type %s" % type(obj)) if block_shape is None: block_shape = km.get_block_shape(np_array.shape, np_array.dtype) + # Assume object is dense. return BlockArray.from_np(np_array, block_shape, False, km) - def transpose(self, defer=False, redistribute=False): - if defer and redistribute: - warnings.warn("defer is True, redistribute=True will be ignored.") - metaT = self.grid.to_meta() - metaT["shape"] = tuple(reversed(metaT["shape"])) - metaT["block_shape"] = tuple(reversed(metaT["block_shape"])) - gridT = ArrayGrid.from_meta(metaT) - rarrT = SparseBlockArray(gridT, self.km, self.fill_value) - rarrT.blocks = np.copy(self.blocks.T) - for grid_entry in rarrT.grid.get_entry_iterator(): - rarrT.blocks[grid_entry] = rarrT.blocks[grid_entry].transpose( - defer, redistribute - ) - return rarrT - def check_or_convert_other(self, other, compute_block_shape=False): block_shape = None if compute_block_shape else self.block_shape return SparseBlockArray.to_block_array(other, self.km, block_shape=block_shape) + def _check_bop_implemented(self, other): + if isinstance( + other, (BlockArrayBase, np.ndarray, list) + ) or array_utils.is_scalar(other): + return True + return False + def ufunc(self, op_name): - result = self.copy() + densify = array_utils.get_sparse_uop_densify(op_name) + if densify: + result = BlockArray(self.grid, self.km) + else: + result = self.copy() for grid_entry in self.grid.get_entry_iterator(): result.blocks[grid_entry] = self.blocks[grid_entry].ufunc(op_name) - func = np.__getattribute__(op_name) - result.fill_value = func(self.fill_value) result._nnz = -1 + result._nbytes = -1 return result - def sdtp(self, *block_arrays: List[BlockArray]): - """ - Perform a sampled tensor product among an arbitrary number of block arrays. - """ - assert np.allclose(self.fill_value, 0) - for i, ba in enumerate(block_arrays): - assert len(ba.shape) == 1 - assert ba.shape[0] == self.shape[i] - assert ba.block_shape[0] == self.block_shape[i] - # Sparsity of result is same as self. 
- result: SparseBlockArray = SparseBlockArray(self.grid, self.km, self.fill_value) + def reduce_axis(self, op_name, axis, keepdims=False): + if not (axis is None or isinstance(axis, (int, np.int32, np.int64))): + raise NotImplementedError("Only integer axis is currently supported.") + if 0 in self.shape: + return SparseBlockArray.create("zeros", (), (), float, self.km) + block_reduced_oids = np.empty_like(self.blocks, dtype=tuple) for grid_entry in self.grid.get_entry_iterator(): - dense_oids = [ - ba.blocks[grid_entry[i]].oid for i, ba in enumerate(block_arrays) - ] - result.blocks[grid_entry].oid = self.km.sdtp( - self.blocks[grid_entry].oid, - *dense_oids, - syskwargs={"grid_entry": grid_entry, "grid_shape": result.grid_shape}, + block = self.blocks[grid_entry] + block_oid = self.km.sparse_reduce_axis( + op_name=op_name, + arr=block.oid, + axis=axis, + keepdims=keepdims, + transposed=block.transposed, + syskwargs={ + "grid_entry": block.grid_entry, + "grid_shape": block.grid_shape, + }, ) - return result - - def sdtd(self, x: BlockArray, y: BlockArray, axes: int): - """ - Perform a sampled dense-dense tensor dot between two tensors x and y. - In addition to several compatibility-related constraints to ensure - the operation is actually valid, we also require that the fill value of - the sparse array is 0, and that the axes over which the tensor dot - is being performed are not partitioned. - The last constraint can be eliminated if we add a sampled element-wise kernel. - """ - assert np.allclose(self.fill_value, 0) - - if array_utils.np_tensordot_param_test(x.shape, x.ndim, y.shape, y.ndim, axes): - raise ValueError("shape-mismatch for sum") - - x_axes = x.grid.grid_shape[:-axes] - x_sum_axes = x.grid.grid_shape[-axes:] - y_axes = y.grid.grid_shape[axes:] - y_sum_axes = y.grid.grid_shape[:axes] - assert x_sum_axes == y_sum_axes - result_shape = tuple(x.shape[:-axes] + y.shape[axes:]) - result_block_shape = tuple(x.block_shape[:-axes] + y.block_shape[axes:]) + block_reduced_oids[grid_entry] = ( + block_oid, + block.grid_entry, + block.grid_shape, + False, + ) + result_shape = [] + result_block_shape = [] + for curr_axis in range(len(self.shape)): + axis_size, axis_block_size = ( + self.shape[curr_axis], + self.block_shape[curr_axis], + ) + if curr_axis == axis or axis is None: + if keepdims: + axis_size, axis_block_size = 1, 1 + else: + continue + result_shape.append(axis_size) + result_block_shape.append(axis_block_size) + result_shape = tuple(result_shape) + result_block_shape = tuple(result_block_shape) + result_dtype = array_utils.get_reduce_output_type(op_name, self.dtype) result_grid = ArrayGrid( shape=result_shape, block_shape=result_block_shape, - dtype=array_utils.get_bop_output_type( - "tensordot", x.dtype, y.dtype - ).__name__, + dtype=result_dtype.__name__, ) - # Ensure dims over which we sum are not partitioned. - # This is currently required for scalable sampled dense. 
-            assert np.sum(x_sum_axes) == np.sum(y_sum_axes) == axes
+        result = BlockArray(result_grid, self.km)
-        assert result_grid.grid_shape == tuple(x_axes + y_axes)
-        assert result_grid.grid_shape == self.grid_shape
-        assert result_grid.block_shape == self.block_shape
-        result: SparseBlockArray = SparseBlockArray(self.grid, self.km, self.fill_value)
-        this_dims = list(itertools.product(*map(range, x_axes)))
-        other_dims = list(itertools.product(*map(range, y_axes)))
-        sum_dims = tuple([0] * axes)
-        for i in this_dims:
-            for j in other_dims:
-                grid_entry = tuple(i + j)
-                x_block: Block = x.blocks[tuple(i + sum_dims)]
-                y_block: Block = y.blocks[tuple(sum_dims + j)]
-                result.blocks[grid_entry].oid = self.km.sdtd(
-                    self.blocks[grid_entry].oid,
-                    x_block.oid,
-                    y_block.oid,
-                    axes=axes,
+        if axis is None:
+            if result.shape == ():
+                result_block: Block = result.blocks[()]
+            else:
+                result_block: Block = result.blocks[:].item()
+            result_block.oid = self.tree_reduce(
+                op_name,
+                block_reduced_oids.flatten().tolist(),
+                result_block.grid_entry,
+                result_block.grid_shape,
+                False,
+            )
+        else:
+            for result_grid_entry in result_grid.get_entry_iterator():
+                block_reduced_oids_axis = []
+                for sum_dim in range(self.grid.grid_shape[axis]):
+                    grid_entry = list(result_grid_entry)
+                    if keepdims:
+                        grid_entry[axis] = sum_dim
+                    else:
+                        grid_entry = grid_entry[:axis] + [sum_dim] + grid_entry[axis:]
+                    grid_entry = tuple(grid_entry)
+                    block_reduced_oids_axis.append(block_reduced_oids[grid_entry])
+                result_block: Block = result.blocks[result_grid_entry]
+                result_block.oid = self.tree_reduce(
+                    op_name,
+                    block_reduced_oids_axis,
+                    result_block.grid_entry,
+                    result_block.grid_shape,
+                    False,
+                )
+        return result
+
+    # pylint: disable=arguments-differ
+    def tree_reduce(
+        self,
+        op_name,
+        blocks_or_oids,
+        result_grid_entry,
+        result_grid_shape,
+        densify,
+        *args,
+    ):
+        """
+        Basic tree reduce implementation.
+        Schedules op on same node as left operand.
+        :param op_name: The reduction op.
+        :param blocks_or_oids: A list of type Block or a list of tuples.
+        Tuples must be of the form
+        (oid, grid_entry, grid_shape, transposed)
+        :param result_grid_entry: The grid entry of the result block. This will be used
+        to compute the final reduction step.
+        :param result_grid_shape: The grid shape of the result block. This will be used
+        to compute the final reduction step.
+        :param densify: Whether to reduce with the dense kernel (bop_reduce)
+        or the sparse kernel (sparse_bop_reduce).
+        :return: The oid of the result.
+ """ + oid_list = blocks_or_oids + if isinstance(blocks_or_oids[0], Block): + oid_list = [ + (b.oid, b.grid_entry, b.grid_shape, b.transposed) + for b in blocks_or_oids + ] + if len(oid_list) == 1: + return oid_list[0][0] + q = oid_list + if densify: + while len(q) > 1: + a_oid, a_ge, a_gs, a_T = q.pop(0) + b_oid, _, _, b_T = q.pop(0) + ge, gs = ( + (result_grid_entry, result_grid_shape) + if len(q) == 0 + else (a_ge, a_gs) + ) + c_oid = self.km.bop_reduce( + op_name, + a_oid, + b_oid, + a_T, + b_T, syskwargs={ - "grid_entry": grid_entry, - "grid_shape": result.grid_shape, + "grid_entry": ge, + "grid_shape": gs, }, ) - return result + q.append((c_oid, ge, gs, False)) + else: + while len(q) > 1: + a_oid, a_ge, a_gs, a_T = q.pop(0) + b_oid, _, _, b_T = q.pop(0) + ge, gs = ( + (result_grid_entry, result_grid_shape) + if len(q) == 0 + else (a_ge, a_gs) + ) + c_oid = self.km.sparse_bop_reduce( + op_name, + a_oid, + b_oid, + a_T, + b_T, + syskwargs={ + "grid_entry": ge, + "grid_shape": gs, + }, + ) + q.append((c_oid, ge, gs, False)) + r_oid, r_ge, r_gs, _ = q.pop(0) + assert r_ge == result_grid_entry + assert r_gs == result_grid_shape + return r_oid + + ################# + # Arithmetic + ################# - def _fast_elementwise(self, op_name, other, densify): - dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype) + @staticmethod + def elementwise(op_name, a, b): + if isinstance(a, SparseBlockArray): + b = a.check_or_convert_other(b) + elif isinstance(b, SparseBlockArray): + a = b.check_or_convert_other(a) + else: + raise NotImplementedError() + + densify = array_utils.get_sparse_bop_densify(op_name, a.is_dense, b.is_dense) + if a.shape == b.shape and a.block_shape == b.block_shape: + return SparseBlockArray._fast_elementwise(op_name, a, b, densify) + else: + blocks_op = a.blocks.__getattribute__(f"__{op_name}__") + if densify: + result = BlockArray.from_blocks( + blocks_op(b.blocks), + result_shape=None, + km=a.km, + ) + else: + result = SparseBlockArray.from_blocks( + blocks_op(b.blocks), + result_shape=None, + km=a.km, + ) + return result + + @staticmethod + def _fast_elementwise(op_name, a, b, densify): + # a, b have the same grid_shape and block_shape + dtype = array_utils.get_bop_output_type(op_name, a.dtype, b.dtype) if densify: - blocks = np.empty(shape=self.grid_shape, dtype=Block) + block_type = Block else: - blocks = np.empty(shape=self.grid_shape, dtype=SparseBlock) - for grid_entry in self.grid.get_entry_iterator(): - self_block: SparseBlock = self.blocks[grid_entry] - other_block: Block = other.blocks[grid_entry] - blocks[grid_entry] = block = self_block.bop(op_name, other_block, args={}) - block.oid = self.km.sparse_bop( + block_type = SparseBlock + blocks = np.empty(shape=a.grid_shape, dtype=block_type) + for grid_entry in a.grid.get_entry_iterator(): + a_block: BlockBase = a.blocks[grid_entry] + b_block: BlockBase = b.blocks[grid_entry] + blocks[grid_entry] = block_type( + grid_entry, + a_block.grid_shape, + a_block.shape, + dtype, + transposed=False, + km=a.km, + ) + blocks[grid_entry].oid = a.km.sparse_bop( op_name, - self_block.oid, - other_block.oid, - self_block.transposed, - other_block.transposed, + a_block.oid, + b_block.oid, + a_block.transposed, + b_block.transposed, axes={}, densify=densify, syskwargs={ "grid_entry": grid_entry, - "grid_shape": self.grid.grid_shape, + "grid_shape": a.grid.grid_shape, }, ) - grid = ArrayGrid(self.shape, self.block_shape, dtype.__name__) + grid = ArrayGrid(a.shape, a.block_shape, dtype.__name__) if densify: - 
return BlockArray(grid, self.km, blocks=blocks) + return BlockArray(grid, a.km, blocks=blocks) else: - fill_value = array_utils.get_bop_fill_value( - op_name, self.fill_value, other.fill_value - ) - result = SparseBlockArray(grid, self.km, fill_value, blocks=blocks) - return result + return SparseBlockArray(grid, a.km, blocks=blocks) - def __elementwise__(self, op_name, other): - other = self.check_or_convert_other(other) - densify = array_utils.get_sparse_bop_return_type( - op_name, - self.fill_value, - other.fill_value, - ) - if self.shape == other.shape and self.block_shape == other.block_shape: - return self._fast_elementwise(op_name, other, densify) - blocks_op = self.blocks.__getattribute__("__%s__" % op_name) - if densify: - result = BlockArray.from_blocks( - blocks_op(other.blocks), - result_shape=None, - km=self.km, - ) - else: - fill_value = array_utils.get_bop_fill_value( - op_name, self.fill_value, other.fill_value - ) - result = SparseBlockArray.from_blocks( - blocks_op(other.blocks), - result_shape=None, - fill_value=fill_value, - km=self.km, - ) - return result + ################# + # Linear Algebra + ################# - def tensordot(self, other, axes=2): + @staticmethod + def tensordot(a, b, axes=2): if isinstance(axes, int): pass elif array_utils.is_array_like(axes): @@ -574,73 +697,65 @@ def tensordot(self, other, axes=2): else: raise TypeError(f"Unexpected axes type '{type(axes).__name__}'") - other = self.check_or_convert_other(other, compute_block_shape=True) + if isinstance(a, SparseBlockArray): + b = a.check_or_convert_other(b, compute_block_shape=True) + elif isinstance(b, SparseBlockArray): + a = b.check_or_convert_other(a, compute_block_shape=True) + else: + raise NotImplementedError() - if array_utils.np_tensordot_param_test( - self.shape, self.ndim, other.shape, other.ndim, axes - ): + if array_utils.np_tensordot_param_test(a.shape, a.ndim, b.shape, b.ndim, axes): raise ValueError("shape-mismatch for sum") - # Pydata/Sparse only works with fill_value == 0 - # TODO: densify when fill_value != 0 - assert self.fill_value == 0 - if isinstance(other, SparseBlockArray): - assert other.fill_value == 0, "Sparse-dense tensordot may not be tractable." 
- densify = array_utils.get_sparse_bop_return_type( - "tensordot", - self.fill_value, - other.fill_value, + densify = array_utils.get_sparse_bop_densify( + "tensordot", a.is_dense, b.is_dense ) if axes > 0: - this_axes = self.grid.grid_shape[:-axes] - this_sum_axes = self.grid.grid_shape[-axes:] - other_axes = other.grid.grid_shape[axes:] - other_sum_axes = other.grid.grid_shape[:axes] - assert this_sum_axes == other_sum_axes - result_shape = tuple(self.shape[:-axes] + other.shape[axes:]) - result_block_shape = tuple( - self.block_shape[:-axes] + other.block_shape[axes:] - ) + a_axes = a.grid.grid_shape[:-axes] + a_sum_axes = a.grid.grid_shape[-axes:] + b_axes = b.grid.grid_shape[axes:] + b_sum_axes = b.grid.grid_shape[:axes] + assert a_sum_axes == b_sum_axes + result_shape = tuple(a.shape[:-axes] + b.shape[axes:]) + result_block_shape = tuple(a.block_shape[:-axes] + b.block_shape[axes:]) else: - this_axes = self.grid.grid_shape - other_axes = other.grid.grid_shape - this_sum_axes = () - result_shape = tuple(self.shape + other.shape) - result_block_shape = tuple(self.block_shape + other.block_shape) + a_axes = a.grid.grid_shape + b_axes = b.grid.grid_shape + a_sum_axes = () + result_shape = tuple(a.shape + b.shape) + result_block_shape = tuple(a.block_shape + b.block_shape) result_grid = ArrayGrid( shape=result_shape, block_shape=result_block_shape, dtype=array_utils.get_bop_output_type( - "tensordot", self.dtype, other.dtype + "tensordot", a.dtype, b.dtype ).__name__, ) - assert result_grid.grid_shape == tuple(this_axes + other_axes) + assert result_grid.grid_shape == tuple(a_axes + b_axes) if densify: - result = BlockArray(result_grid, self.km) + result = BlockArray(result_grid, a.km) else: - result = SparseBlockArray(result_grid, self.km, self.fill_value) - this_dims = list(itertools.product(*map(range, this_axes))) - other_dims = list(itertools.product(*map(range, other_axes))) - sum_dims = list(itertools.product(*map(range, this_sum_axes))) - for i in this_dims: - for j in other_dims: + result = SparseBlockArray(result_grid, a.km) + a_dims = list(itertools.product(*map(range, a_axes))) + b_dims = list(itertools.product(*map(range, b_axes))) + sum_dims = list(itertools.product(*map(range, a_sum_axes))) + for i in a_dims: + for j in b_dims: grid_entry = tuple(i + j) result_block: Block = result.blocks[grid_entry] sum_oids = [] for k in sum_dims: - self_block: Block = self.blocks[tuple(i + k)] - other_block: Block = other.blocks[tuple(k + j)] - dot_grid_args = self._compute_tensordot_syskwargs( - self_block, other_block - ) - dotted_oid = self.km.sparse_bop( + a_block: Block = a.blocks[tuple(i + k)] + b_block: Block = b.blocks[tuple(k + j)] + dot_grid_args = a._compute_tensordot_syskwargs(a_block, b_block) + dotted_oid = a.km.sparse_bop( "tensordot", - self_block.oid, - other_block.oid, - self_block.transposed, - other_block.transposed, + a_block.oid, + b_block.oid, + a_block.transposed, + b_block.transposed, axes=axes, densify=densify, syskwargs={ @@ -651,81 +766,117 @@ def tensordot(self, other, axes=2): sum_oids.append( (dotted_oid, dot_grid_args[0], dot_grid_args[1], False) ) - result_block.oid = self.tree_reduce( - "sum", sum_oids, result_block.grid_entry, result_block.grid_shape + result_block.oid = a.tree_reduce( + "sum", + sum_oids, + result_block.grid_entry, + result_block.grid_shape, + densify, ) if not densify: - result_block._nnz = self.km.sparse_nnz( - result_block.oid, - syskwargs={ - "grid_entry": result_block.grid_entry, - "grid_shape": result_block.grid_shape, - }, + 
syskwargs = { + "grid_entry": result_block.grid_entry, + "grid_shape": result_block.grid_shape, + } + result_block._nnz = a.km.sparse_nnz( + result_block.oid, syskwargs=syskwargs + ) + result_block._nbytes = a.km.sparse_nbytes( + result_block.oid, syskwargs=syskwargs ) return result - def __add__(self, other): - return self.__elementwise__("add", other) - - def __radd__(self, other): - return self.__elementwise__("add", other) - - __iadd__ = __add__ - - def __sub__(self, other): - return self.__elementwise__("sub", other) - - def __rsub__(self, other): - # FIXME: not commutative - return self.__elementwise__("sub", other) - - __isub__ = __sub__ - - def __mul__(self, other): - return self.__elementwise__("mul", other) - - def __rmul__(self, other): - return self.__elementwise__("mul", other) - - __imul__ = __mul__ - - def __truediv__(self, other): - return self.__elementwise__("truediv", other) - - def __rtruediv__(self, other): - return self.__elementwise__("truediv", other) - - __itruediv__ = __truediv__ - - def __floordiv__(self, other): - return self.__elementwise__("floordiv", other) - - def __rfloordiv__(self, other): - return self.__elementwise__("floordiv", other) + def sdtp(self, *block_arrays: List[BlockArray]): + """ + Perform a sampled tensor product among an arbitrary number of block arrays. + """ + for i, ba in enumerate(block_arrays): + assert len(ba.shape) == 1 + assert ba.shape[0] == self.shape[i] + assert ba.block_shape[0] == self.block_shape[i] + # Sparsity of result is same as self. + result: SparseBlockArray = SparseBlockArray(self.grid, self.km) + for grid_entry in self.grid.get_entry_iterator(): + dense_oids = [ + ba.blocks[grid_entry[i]].oid for i, ba in enumerate(block_arrays) + ] + result.blocks[grid_entry].oid = self.km.sdtp( + self.blocks[grid_entry].oid, + *dense_oids, + syskwargs={"grid_entry": grid_entry, "grid_shape": result.grid_shape}, + ) + return result - __ifloordiv__ = __floordiv__ + def sdtd(self, x: BlockArray, y: BlockArray, axes: int): + """ + Perform a sampled dense-dense tensor dot between two tensors x and y. + In addition to several compatibility-related constraints to ensure + the operation is actually valid, we also require that the fill value of + the sparse array is 0, and that the axes over which the tensor dot + is being performed are not partitioned. + The last constraint can be eliminated if we add a sampled element-wise kernel. + """ + if array_utils.np_tensordot_param_test(x.shape, x.ndim, y.shape, y.ndim, axes): + raise ValueError("shape-mismatch for sum") - def __pow__(self, other): - return self.__elementwise__("pow", other) + x_axes = x.grid.grid_shape[:-axes] + x_sum_axes = x.grid.grid_shape[-axes:] + y_axes = y.grid.grid_shape[axes:] + y_sum_axes = y.grid.grid_shape[:axes] + assert x_sum_axes == y_sum_axes + result_shape = tuple(x.shape[:-axes] + y.shape[axes:]) + result_block_shape = tuple(x.block_shape[:-axes] + y.block_shape[axes:]) + result_grid = ArrayGrid( + shape=result_shape, + block_shape=result_block_shape, + dtype=array_utils.get_bop_output_type( + "tensordot", x.dtype, y.dtype + ).__name__, + ) + # Ensure dims over which we sum are not partitioned. + # This is currently required for scalable sampled dense. 
+        assert np.sum(x_sum_axes) == np.sum(y_sum_axes) == axes
-    def __rpow__(self, other):
-        return self.__elementwise__("pow", other)
+        assert result_grid.grid_shape == tuple(x_axes + y_axes)
+        assert result_grid.grid_shape == self.grid_shape
+        assert result_grid.block_shape == self.block_shape
+        result: SparseBlockArray = SparseBlockArray(self.grid, self.km)
+        x_dims = list(itertools.product(*map(range, x_axes)))
+        y_dims = list(itertools.product(*map(range, y_axes)))
+        sum_dims = tuple([0] * axes)
+        for i in x_dims:
+            for j in y_dims:
+                grid_entry = tuple(i + j)
+                x_block: Block = x.blocks[tuple(i + sum_dims)]
+                y_block: Block = y.blocks[tuple(sum_dims + j)]
+                result.blocks[grid_entry].oid = self.km.sdtd(
+                    self.blocks[grid_entry].oid,
+                    x_block.oid,
+                    y_block.oid,
+                    axes=axes,
+                    syskwargs={
+                        "grid_entry": grid_entry,
+                        "grid_shape": result.grid_shape,
+                    },
+                )
+        return result

-    __ipow__ = __pow__
+    #################
+    # Inequalities
+    #################

     def __inequality__(self, op_name, other):
         other = self.check_or_convert_other(other)
-        assert other.shape == (), "Currently supports comparison with scalars only."
+        assert (
+            other.shape == () or other.shape == self.shape
+        ), "Only scalar or same-shape operands are currently supported."
         shape = array_utils.broadcast(self.shape, other.shape).shape
         block_shape = array_utils.broadcast_block_shape(
             self.shape, other.shape, self.block_shape
         )
         dtype = bool.__name__
         grid = ArrayGrid(shape, block_shape, dtype)
-        fill_value = array_utils.get_bop_fill_value(
-            op_name, self.fill_value, other.fill_value
-        )
-        result = SparseBlockArray(grid, self.km, fill_value)
+        result = SparseBlockArray(grid, self.km)
         for grid_entry in result.grid.get_entry_iterator():
-            other_block: Block = other.blocks.item()
+            # A scalar operand has a single block; a same-shape operand is
+            # indexed block by block.
+            if other.shape == ():
+                other_block: Block = other.blocks.item()
+            else:
+                other_block: Block = other.blocks[grid_entry]
             result.blocks[grid_entry] = self.blocks[grid_entry].bop(
diff --git a/nums/core/array/utils.py b/nums/core/array/utils.py
index 8cefd766..356b78f7 100644
--- a/nums/core/array/utils.py
+++ b/nums/core/array/utils.py
@@ -18,12 +18,9 @@
 
 import numpy as np
 import scipy.special
-import sparse
 
 from nums.core.settings import np_ufunc_map
 from nums.core.array.errors import AxisError
-from nums.core.array.base import Block
-from nums.core.array.sparse import SparseBlock
 
 # pylint: disable = no-member, trailing-whitespace
 
@@ -455,62 +452,25 @@ def normalize_axis_index(axis, ndim):
     return axis % ndim
 
 
-# def get_sparse_bop_return_type(op_name, a: Block, b: Block):
-#     def sample_array(block):
-#         s = np.eye(2)
-#         if isinstance(block, SparseBlock):
-#             return sparse.COO.from_numpy(s, fill_value=block.fill_value)
-#         return s
-
-#     sa = sample_array(a)
-#     sb = sample_array(b)
-#     if op_name == "tensordot":
-#         result = sparse.tensordot(sa, sb)
-#     else:
-#         op_name = np_ufunc_map.get(op_name, op_name)
-#         try:
-#             ufunc = np.__getattribute__(op_name)
-#         except Exception as _:
-#             ufunc = scipy.special.__getattribute__(op_name)
-#         result = sparse.elemwise(ufunc, sa, sb)
-#     if isinstance(result, sparse.SparseArray):
-#         return False
-#     return True
-
-
-def get_sparse_bop_return_type(op_name, a_fv, b_fv):
-    def sample_array(fv):
-        s = np.eye(2)
-        if fv is not None:
-            return sparse.COO.from_numpy(s, fill_value=fv)
-        return s
-
-    sa = sample_array(a_fv)
-    sb = sample_array(b_fv)
-    if op_name == "tensordot":
-        result = sparse.tensordot(sa, sb)
-    else:
-        op_name = np_ufunc_map.get(op_name, op_name)
-        try:
-            ufunc = np.__getattribute__(op_name)
-        except Exception as _:
-            ufunc = scipy.special.__getattribute__(op_name)
-        result = sparse.elemwise(ufunc, sa, sb)
-    if isinstance(result, sparse.SparseArray):
+def get_sparse_uop_densify(op_name): + ufunc = np.__getattribute__(op_name) + if ufunc(0) == 0: return False return True -def get_bop_fill_value(op_name, a_fv, b_fv): - if a_fv is None and b_fv is None: - return None - if a_fv is None: - return b_fv - if b_fv is None: - return a_fv - op_name = np_ufunc_map.get(op_name, op_name) - try: - func = np.__getattribute__(op_name) - except Exception as _: - func = scipy.special.__getattribute__(op_name) - return func(a_fv, b_fv) +sd_bop_sparse_ops = [ + "mul", +] + + +def get_sparse_bop_densify(op_name, a_dense: bool, b_dense: bool): + if a_dense and b_dense: + return True + + if a_dense or b_dense: + if op_name in sd_bop_sparse_ops: + return False + return True + + return False diff --git a/nums/core/array/view.py b/nums/core/array/view.py index a9186c9c..9fa40b7e 100644 --- a/nums/core/array/view.py +++ b/nums/core/array/view.py @@ -19,7 +19,7 @@ from nums.core.array import selection from nums.core.array import utils as array_utils -from nums.core.array.base import Block, BlockArrayBase +from nums.core.array.base import BlockBase, BlockArrayBase from nums.core.array.selection import BasicSelection from nums.core.kernel.kernel_manager import KernelManager from nums.core.grid.grid import ArrayGrid @@ -39,6 +39,7 @@ def from_subscript(cls, bab, subscript): def __init__(self, source, sel: BasicSelection = None, block_shape: tuple = None): self._source: BlockArrayBase = source self._km: KernelManager = self._source.km + self._concrete_cls = type(source) if sel is None: sel = BasicSelection.from_shape(self._source.shape) @@ -84,7 +85,7 @@ def basic_select(self, subscript: tuple): result: ArrayView = ArrayView(self._source, sel) return result - def create(self, concrete_cls=None) -> BlockArrayBase: + def create(self) -> BlockArrayBase: if self.sel.basic_steps(): if self.sel.is_aligned(self._source.block_shape): # Assertion below should form a conjunction with the above condition. @@ -95,20 +96,21 @@ def create(self, concrete_cls=None) -> BlockArrayBase: self.sel.get_broadcastable_block_shape(self.block_shape), self._source.block_shape, ) - return self.create_references(concrete_cls) + return self.create_references(self._concrete_cls) else: - return self.create_basic_single_step(concrete_cls) + return self.create_basic_single_step(self._concrete_cls) else: - return self.create_basic_multi_step(concrete_cls) + return self.create_basic_multi_step(self._concrete_cls) def create_references(self, concrete_cls) -> BlockArrayBase: # TODO (hme): Double check this. 
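To make the new densification rules concrete: `get_sparse_uop_densify` densifies exactly when an op maps the implicit zeros to a nonzero value, and `get_sparse_bop_densify` keeps mixed sparse/dense operands sparse only for the zero-annihilating ops listed in `sd_bop_sparse_ops` (currently just `mul`). A minimal sketch of the resulting behavior, assuming the helpers are importable from `nums.core.array.utils` as in this diff:

```python
from nums.core.array.utils import get_sparse_uop_densify, get_sparse_bop_densify

# Unary: densify iff ufunc(0) != 0.
assert get_sparse_uop_densify("exp")            # exp(0) == 1 -> densify
assert not get_sparse_uop_densify("negative")   # -0 == 0 -> stay sparse

# Binary: dense op dense -> dense; sparse op sparse -> sparse;
# mixed operands stay sparse only for ops in sd_bop_sparse_ops.
assert not get_sparse_bop_densify("mul", a_dense=True, b_dense=False)
assert get_sparse_bop_densify("add", a_dense=True, b_dense=False)
assert not get_sparse_bop_densify("add", a_dense=False, b_dense=False)
```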
- array_cls = BlockArrayBase if concrete_cls is None else concrete_cls - dst_ba: BlockArrayBase = array_cls(self.grid, self._km) + # array_cls = BlockArrayBase if concrete_cls is None else concrete_cls + # dst_ba: BlockArrayBase = array_cls(self.grid, self._km) + dst_ba: BlockArrayBase = concrete_cls(self.grid, self._km) if 0 in self.shape: return dst_ba grid_offset = self.sel.position().value // np.array( - self._source.block_shape, dtype=np.int + self._source.block_shape, dtype=np.intp ) dst_inflated_shape = self.sel.get_broadcastable_shape() dst_inflated_block_shape = self.sel.get_broadcastable_block_shape( @@ -123,7 +125,9 @@ def create_references(self, concrete_cls) -> BlockArrayBase: ): dst_grid_entry = dst_grid_entry_iterator[dst_index] src_grid_entry = tuple( - (np.array(dst_inflated_grid_entry, dtype=np.int) + grid_offset).tolist() + ( + np.array(dst_inflated_grid_entry, dtype=np.intp) + grid_offset + ).tolist() ) dst_ba.blocks[dst_grid_entry].oid = self._source.blocks[src_grid_entry].oid dst_ba.blocks[dst_grid_entry].transposed = self._source.blocks[ @@ -171,7 +175,7 @@ def create_basic_single_step(self, concrete_cls) -> BlockArrayBase: ] if src_dst_intersection_block.is_empty(): continue - src_block: Block = self._source.blocks[src_grid_entry] + src_block: BlockBase = self._source.blocks[src_grid_entry] src_oids.append(src_block.oid) src_sel_block: BasicSelection = src_sel_arr[src_grid_entry] src_dep_sel_loc = src_dst_intersection_block - src_sel_block.position() @@ -180,7 +184,7 @@ def create_basic_single_step(self, concrete_cls) -> BlockArrayBase: src_dst_intersection_block - dst_sel_offset_block.position() ) dst_params.append((dst_block_sel_loc.selector(), False)) - dst_block: Block = dst_ba.blocks.reshape(dst_grid_bc.grid_shape)[ + dst_block: BlockBase = dst_ba.blocks.reshape(dst_grid_bc.grid_shape)[ dst_grid_entry_bc ] dst_block.oid = km.create_block( @@ -261,7 +265,7 @@ def assign_references(self, dst_sel: BasicSelection, value): # but the destination selection may not have the same shape as value. # May need to broadcast value to destination selection output shape. dst_offset = dst_sel.position().value // np.array( - self._source.block_shape, dtype=np.int + self._source.block_shape, dtype=np.intp ) # Do we need to broadcast? if isinstance(value, ArrayView) and ( @@ -273,7 +277,7 @@ def assign_references(self, dst_sel: BasicSelection, value): # We don't need to create value to perform the reference copy. # No broadcasting required, so this should be okay. src_offset = value.sel.position().value // np.array( - value._source.block_shape, dtype=np.int + value._source.block_shape, dtype=np.intp ) src_inflated_shape = dst_sel.get_broadcastable_shape() src_inflated_block_shape = dst_sel.get_broadcastable_block_shape( @@ -286,18 +290,18 @@ def assign_references(self, dst_sel: BasicSelection, value): # Num axes in value grid may be too small. dst_grid_entry = tuple( ( - np.array(src_grid_entry_inflated, dtype=np.int) + dst_offset + np.array(src_grid_entry_inflated, dtype=np.intp) + dst_offset ).tolist() ) src_grid_entry = tuple( ( - np.array(src_grid_entry_inflated, dtype=np.int) + src_offset + np.array(src_grid_entry_inflated, dtype=np.intp) + src_offset ).tolist() ) # This is a reference assignment, and the grid properties between the # two blocks may differ, so retain those properties in the copy. 
- dst_block: Block = self._source.blocks[dst_grid_entry] - src_block_copy: Block = value._source.blocks[src_grid_entry].copy() + dst_block: BlockBase = self._source.blocks[dst_grid_entry] + src_block_copy: BlockBase = value._source.blocks[src_grid_entry].copy() src_block_copy.grid_entry = dst_block.grid_entry src_block_copy.grid_shape = dst_block.grid_shape self._source.blocks[dst_grid_entry] = src_block_copy @@ -322,13 +326,13 @@ def assign_references(self, dst_sel: BasicSelection, value): src_grid_entry = src_grid_entry_iterator[src_index] dst_grid_entry = tuple( ( - np.array(src_grid_entry_inflated, dtype=np.int) + dst_offset + np.array(src_grid_entry_inflated, dtype=np.intp) + dst_offset ).tolist() ) # This is a reference assignment, and the grid properties between the # two blocks may differ, so retain those properties in the copy. - dst_block: Block = self._source.blocks[dst_grid_entry] - src_block_copy: Block = src_ba.blocks[src_grid_entry].copy() + dst_block: BlockBase = self._source.blocks[dst_grid_entry] + src_block_copy: BlockBase = src_ba.blocks[src_grid_entry].copy() src_block_copy.grid_entry = dst_block.grid_entry src_block_copy.grid_shape = dst_block.grid_shape self._source.blocks[dst_grid_entry] = src_block_copy @@ -389,7 +393,7 @@ def basic_assign_single_step(self, dst_sel: BasicSelection, value): src_oids = [] src_params = [] dst_params = [] - dst_block: Block = dst_ba.blocks[dst_grid_entry] + dst_block: BlockBase = dst_ba.blocks[dst_grid_entry] for src_index, src_grid_entry_bc in enumerate( src_inflated_grid.get_entry_iterator() ): @@ -400,7 +404,7 @@ def basic_assign_single_step(self, dst_sel: BasicSelection, value): continue src_grid_entry = src_grid_entry_iterator[src_index] - src_block: Block = src_ba_bc.blocks[src_grid_entry] + src_block: BlockBase = src_ba_bc.blocks[src_grid_entry] src_oids.append(src_block.oid) src_sel_block_offset: BasicSelection = src_sel_offset[src_grid_entry_bc] diff --git a/nums/core/kernel/kernel_interface.py b/nums/core/kernel/kernel_interface.py index 04bd194d..0314011c 100644 --- a/nums/core/kernel/kernel_interface.py +++ b/nums/core/kernel/kernel_interface.py @@ -167,7 +167,7 @@ def any(self, arr, syskwargs: Dict): # Sparse - def dense_to_sparse(self, arr, fill_value, syskwargs: Dict): + def dense_to_sparse(self, arr, syskwargs: Dict): raise NotImplementedError() def sparse_to_dense(self, arr, syskwargs: Dict): @@ -179,6 +179,9 @@ def sparse_nnz(self, arr, syskwargs: Dict): def sparse_nbytes(self, arr, syskwargs: Dict): raise NotImplementedError() + def new_sparse_block(self, op_name, grid_entry, grid_meta, syskwargs: Dict): + raise NotImplementedError() + def sparse_random_block( self, rng_params, @@ -187,17 +190,27 @@ def sparse_random_block( shape, dtype, p, - fill_value, syskwargs: Dict, ): raise NotImplementedError() - def sparse_map_uop(self, op_name, arr, args, kwargs, syskwargs: Dict): + def sparse_map_uop(self, op_name, arr, args, kwargs, densify, syskwargs: Dict): raise NotImplementedError() def sparse_bop(self, op, a1, a2, a1_T, a2_T, axes, densify, syskwargs: Dict): raise NotImplementedError() + def sparse_reduce_axis( + self, op_name, arr, axis, keepdims, transposed, syskwargs: Dict + ): + raise NotImplementedError() + + def sparse_bop_reduce(self, op, a1, a2, a1_T, a2_T, syskwargs: Dict): + raise NotImplementedError() + + def sparse_block_from_scalar(self, x, syskwargs: Dict): + raise NotImplementedError() + def sdtp(self, s, *dense_arrays, syskwargs: Dict): raise NotImplementedError() diff --git 
a/nums/core/kernel/numpy_kernel.py b/nums/core/kernel/numpy_kernel.py
index ead191c7..1789622f 100644
--- a/nums/core/kernel/numpy_kernel.py
+++ b/nums/core/kernel/numpy_kernel.py
@@ -526,9 +526,8 @@ def any(self, arr):
 
     # Sparse
 
-    def dense_to_sparse(self, arr, fill_value):
-        result = COO.from_numpy(arr, fill_value=fill_value)
-        return result
+    def dense_to_sparse(self, arr):
+        return COO.from_numpy(arr, fill_value=0)
 
     def sparse_to_dense(self, arr):
         return arr.todense()
@@ -539,6 +538,16 @@ def sparse_nnz(self, arr):
     def sparse_nbytes(self, arr):
         return arr.nbytes
 
+    def new_sparse_block(self, op_name, grid_entry, grid_meta):
+        op_func = sparse.__getattribute__(op_name)
+        grid = ArrayGrid.from_meta(grid_meta)
+        block_shape = grid.get_block_shape(grid_entry)
+        if op_name == "eye":
+            assert np.all(np.diff(grid_entry) == 0)
+            return op_func(*block_shape, dtype=grid.dtype)
+        else:
+            return op_func(block_shape, dtype=grid.dtype)
+
     def sparse_random_block(
         self,
         rng_params,
@@ -547,7 +556,6 @@
         shape,
         dtype,
         p,
-        fill_value,
     ):
         rng = block_rng_legacy(*rng_params)
         rfunc = rng.__getattribute__(rfunc_name)
@@ -559,14 +567,11 @@
             random_state=rng,
             data_rvs=lambda s: rfunc(**rfunc_args, size=s),
             format="coo",
-            fill_value=fill_value,
+            fill_value=0,
         )
-        if rfunc_name != "randint":
-            # Only random and integer supports sampling of a specific type.
-            result = result.astype(dtype)
-        return result
+        return result.astype(dtype)
 
-    def sparse_map_uop(self, op_name, arr, args, kwargs):
+    def sparse_map_uop(self, op_name, arr, args, kwargs, densify):
         """
         Args:
             func: types.Callable
@@ -577,6 +582,10 @@
         args = list(args)
         args.insert(0, arr)
         result = sparse.elemwise(ufunc, *args, **kwargs)
+        if densify and isinstance(result, sparse.SparseArray):
+            result = result.todense()
+        elif not densify:
+            assert isinstance(result, sparse.SparseArray)
         return result
 
     def sparse_bop(self, op, a1, a2, a1_T, a2_T, axes, densify):
@@ -605,12 +614,45 @@
         assert isinstance(result, sparse.SparseArray)
         return result
 
+    def sparse_reduce_axis(self, op_name, arr, axis, keepdims, transposed):
+        assert isinstance(arr, sparse.COO)
+        op_func = np.__getattribute__(op_name)
+        if transposed:
+            arr = arr.T
+        return arr.reduce(op_func, axis=axis, keepdims=keepdims)
+
+    def sparse_bop_reduce(self, op, a1, a2, a1_T, a2_T):
+        assert isinstance(a1, sparse.COO) and isinstance(a2, sparse.COO)
+        if a1_T:
+            a1 = a1.T
+        if a2_T:
+            a2 = a2.T
+
+        # Addition and multiplication are faster than the generic stacked reduce.
+        if op == "sum":
+            r = a1 + a2
+        elif op == "prod":
+            r = a1 * a2
+        else:
+            a = sparse.stack([a1, a2], axis=0)
+            r = a.reduce(np.__getattribute__(op), axis=0, keepdims=False)
+
+        # Sanity check: a scalar result implies scalar operands;
+        # otherwise all shapes must match.
+        if np.isscalar(r):
+            assert np.isscalar(a1) and np.isscalar(a2)
+        else:
+            assert a1.shape == a2.shape == r.shape
+        return r
+
+    def sparse_block_from_scalar(self, x):
+        assert np.isscalar(x)
+        return sparse.COO.from_numpy(np.array(x), fill_value=0)
+
     def sdtp(self, s: sparse.COO, *dense_arrays):
         data = np.copy(s.data)
         for position in range(s.nnz):
             for axis in range(len(s.shape)):
                 data[position] *= dense_arrays[axis][s.coords[axis][position]]
-        return sparse.COO(s.coords, data, shape=s.shape, fill_value=s.fill_value)
+        return sparse.COO(s.coords, data, shape=s.shape, fill_value=0)
 
     def sdtd(self, s: sparse.COO, x: np.ndarray, y: np.ndarray, axes: int):
         # Check some things.
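Two conventions in the kernel above are worth illustrating. First, every COO handled here now has its fill value pinned to 0 (`dense_to_sparse`, `sparse_block_from_scalar`, `sdtp`, `sdtd`). Second, `sparse_bop_reduce` handles ops other than `sum`/`prod` by stacking the two operands on a new leading axis and reducing along it. A standalone sketch with pydata/sparse; the array values are illustrative, and the op is assumed to resolve to a ufunc such as `np.maximum`:

```python
import numpy as np
import sparse

# Fill value is always 0, as in dense_to_sparse.
a1 = sparse.COO.from_numpy(np.array([[0, 5], [3, 0]]), fill_value=0)
a2 = sparse.COO.from_numpy(np.array([[2, 0], [0, 7]]), fill_value=0)

# Pairwise elementwise reduce, as in sparse_bop_reduce's generic branch:
# stack on a new axis, then reduce along it.
stacked = sparse.stack([a1, a2], axis=0)  # shape (2, 2, 2)
r = stacked.reduce(np.maximum, axis=0, keepdims=False)

assert (r.todense() == np.array([[2, 5], [3, 7]])).all()
```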
@@ -635,4 +677,4 @@ def sdtd(self, s: sparse.COO, x: np.ndarray, y: np.ndarray, axes: int): sx = x[tuple(x_coords)] sy = y[tuple(y_coords)] data[position] *= np.tensordot(sx, sy, axes=axes) - return sparse.COO(s.coords, data, shape=s.shape, fill_value=s.fill_value) + return sparse.COO(s.coords, data, shape=s.shape, fill_value=0) diff --git a/nums/experimental/optimizer/fusion.py b/nums/experimental/optimizer/fusion.py index e20b04e7..40373351 100644 --- a/nums/experimental/optimizer/fusion.py +++ b/nums/experimental/optimizer/fusion.py @@ -5,7 +5,6 @@ import numpy as np from nums.core.array import utils as array_utils -from nums.core.array.base import Block from nums.core.grid.grid import Device from nums.experimental.optimizer.clusterstate import ClusterState @@ -49,7 +48,7 @@ def traverse(self, node: TreeNode, fuseable_nodes): ) if child_leafs is None: # This branch has been pruned. - return None, None, None + return None, None, 0 if len(child_block_id_set) <= self.max_args: # If it's a frontier node, then there's no point in fusing it. # If it's a leaf, we still want to count toward node_leafs, @@ -103,11 +102,15 @@ def update_graph(self, root, fuseable_nodes): def fuse_node(self, node: TreeNode): result = FunctionNode(node.cluster_state) - result.op_func, result.children = node.fuse(result, self.km) + result.op_func, result.children, result.tree_node_meta = node.fuse( + result, self.km + ) result.set_shape(node.shape()) result.set_grid_entry(node.grid_entry()) result.set_grid_shape(node.grid_shape()) result.set_dtype(node.dtype()) + assert result.tree_node_meta.shape == result.shape() + assert result.tree_node_meta.dtype == result.dtype() expression_str = node.expression() result.set_expression(expression_str) diff --git a/nums/experimental/optimizer/fusion_utils.py b/nums/experimental/optimizer/fusion_utils.py index cc4e8134..c47aedd1 100644 --- a/nums/experimental/optimizer/fusion_utils.py +++ b/nums/experimental/optimizer/fusion_utils.py @@ -40,3 +40,21 @@ def set_using_marker(node, inputs): node.update_child([child], [inputs[child.marker]]) inputs[child.marker].parent = node return node + + +def print_graph(root: TreeNode): + """ + Recursively print string representations of tree nodes using BFS. 
+ """ + print("print_graph") + nodes = [root] + while len(nodes) > 0: + node = nodes.pop(0) + if hasattr(node, "child"): + nodes.append(node.child) + if hasattr(node, "left"): + nodes.append(node.left) + nodes.append(node.right) + if hasattr(node, "children"): + nodes.extend(node.children) + print(node) diff --git a/nums/experimental/optimizer/graph.py b/nums/experimental/optimizer/graph.py index 3ac3f3df..2ffc7a51 100644 --- a/nums/experimental/optimizer/graph.py +++ b/nums/experimental/optimizer/graph.py @@ -21,11 +21,12 @@ from nums.core.settings import sync_nnz from nums.core.array import utils as array_utils -from nums.core.array.base import Block +from nums.core.array.base import BlockBase, Block +from nums.core.array.sparse import SparseBlock from nums.core.grid.grid import Device from nums.core.kernel.kernel_manager import KernelManager from nums.experimental.optimizer.clusterstate import ClusterState -from nums.experimental.optimizer.size import TreeNodeSize +from nums.experimental.optimizer.node_meta import TreeNodeMeta, LeafMeta def subsample(total_items, max_items, rs: np.random.RandomState): @@ -52,6 +53,13 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None): self._dtype = None self._expression = None + # Type LeafMeta for leaves: shape, nnz, dtype, and whether output is_dense. + # Type TreeNodeMeta for other nodes: dtype and whether output is_dense. + self.tree_node_meta: TreeNodeMeta = None + # Whether this operation invokes dense or sparse kernel. + # Need to explicitly set is_dense property after initialization. + self.dense_kernel = True + def get_root(self): if self.parent is None: return self @@ -123,6 +131,12 @@ def make_bop(self, op_name, other, args=None): assert self.parent is None and other.parent is None bop.left, bop.right = self, other bop.left.parent, bop.right.parent = bop, bop + bop.tree_node_meta = bop.left.tree_node_meta.bop_partial( + op_name, bop.right.tree_node_meta + ) + bop.dense_kernel = ( + bop.left.tree_node_meta.is_dense and bop.right.tree_node_meta.is_dense + ) return bop def tensordot(self, other, axes): @@ -154,7 +168,6 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None): super().__init__(cluster_state, tree_node_id) self.block = None self.marker = -1 - self.tree_node_size = None def get_children(self): return [] @@ -186,8 +199,8 @@ def copy(self, cluster_state, parent=None, new_ids=False): # This property is only used for fusion. leaf.marker = self.marker - # This property is for experimental output size estimation. 
- leaf.tree_node_size = self.tree_node_size + leaf.tree_node_meta = self.tree_node_meta + leaf.dense_kernel = self.dense_kernel return leaf def get_leafs(self): @@ -229,7 +242,7 @@ def expression(self): def fuse(self, func_node, km: KernelManager): f = km.get_fuseable("identity") - return f, [self.copy(self.cluster_state, func_node)] + return f, [self.copy(self.cluster_state, func_node)], self.tree_node_meta def is_scalar(self): return self.block.size() == 1 @@ -242,10 +255,11 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None): self.op_name = None def __repr__(self): - return "UnaryOp(name=%s, id=%s, child=%s)" % ( + return "UnaryOp(name=%s, id=%s, child=%s, dense_kernel=%s)" % ( self.op_name, str(self.tree_node_id), str(self.child.tree_node_id), + self.dense_kernel, ) def get_children(self): @@ -264,6 +278,9 @@ def copy(self, cluster_state, parent=None, new_ids=False): uop.child = self.child.copy(cluster_state, parent=uop, new_ids=new_ids) uop.op_name = self.op_name uop.copy_on_op = self.copy_on_op + + uop.tree_node_meta = self.tree_node_meta + uop.dense_kernel = self.dense_kernel return uop def update_child(self, old_children, new_children): @@ -313,11 +330,11 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: assert isinstance(self.child, Leaf) result = self._collapse(device) new_leaf: Leaf = result[0] - new_block: Block = result[1] + new_block: BlockBase = result[1] self.cluster_state.commit_uop(self._mem_cost(), self.child.block.id, device) # self.cluster_state.add_block(new_block.id, new_block.size(), [device]) self.cluster_state.add_block( - new_block.id, new_leaf.tree_node_size.nbytes, [device] + new_block.id, new_leaf.tree_node_meta.nbytes, [device] ) if not self.cluster_state.created_on_only: assert self.cluster_state.blocks_local( @@ -330,26 +347,28 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: def _collapse(self, device: Device): assert isinstance(self.child, Leaf) - block: Block = self.child.block + block: BlockBase = self.child.block op_name, args = self.op_name, {} if op_name == "transpose": - block: Block = block.transpose(defer=True) + block: BlockBase = block.transpose(defer=True) else: - block: Block = block.ufunc(op_name, device=device) + block: BlockBase = block.ufunc(op_name, device=device) leaf: Leaf = Leaf(self.cluster_state) leaf.block = block - leaf.tree_node_size = self.child.tree_node_size.uop(op_name) + assert isinstance(self.child.tree_node_meta, LeafMeta) + leaf.tree_node_meta = self.child.tree_node_meta.uop(op_name) leaf.copy_on_op = self.copy_on_op return leaf, block def _mem_cost(self): assert isinstance(self.child, Leaf) - block: Block = self.child.block + block: BlockBase = self.child.block if block.is_dense: return np.product(block.shape) if sync_nnz > 1: - self.child.tree_node_size.nnz = block.nnz # Blocking fetch - return self.child.tree_node_size.uop(self.op_name).nbytes + self.child.tree_node_meta.nnz = block.nnz # Blocking fetch + assert isinstance(self.child.tree_node_meta, LeafMeta) + return self.child.tree_node_meta.uop(self.op_name).nbytes def shape(self): if self._shape is None: @@ -387,24 +406,35 @@ def dtype(self): def expression(self): if self._expression is None: - self._expression = "UnaryOp(op=%s, x=%s)" % ( + self._expression = "UnaryOp(op=%s, dense_kernel=%s, x=%s)" % ( self.op_name, + self.dense_kernel, self.child.expression(), ) return self._expression def fuse(self, func_node, km: KernelManager): - child_op, child_args = self.child.fuse(func_node, km) + child_op, child_args, 
child_meta = self.child.fuse(func_node, km) if self.op_name == "transpose": self_op = km.get_fuseable("transpose") else: - self_op = km.get_fuseable("map_uop") - self_op = partial(self_op, op_name=self.op_name, args=(), kwargs={}) + if self.dense_kernel: + self_op = km.get_fuseable("map_uop") + self_op = partial(self_op, op_name=self.op_name, args=(), kwargs={}) + else: + self_op = km.get_fuseable("sparse_map_uop") + self_op = partial( + self_op, + op_name=self.op_name, + args=(), + kwargs={}, + densify=self.tree_node_meta.is_dense, + ) def fused(*args): return self_op(arr=child_op(*args)) - return fused, child_args + return fused, child_args, child_meta.uop(self.op_name) class ReduceAxis(UnaryOp): @@ -430,53 +460,66 @@ def copy(self, cluster_state, parent=None, new_ids=False): ra.axis = self.axis ra.keepdims = self.keepdims ra.copy_on_op = self.copy_on_op + + ra.tree_node_meta = self.tree_node_meta + ra.dense_kernel = self.dense_kernel return ra def _collapse(self, device: Device): assert isinstance(self.child, Leaf) - child_block: Block = self.child.block + child_block: BlockBase = self.child.block op_name, args = self.op_name, {} - block = Block( + block: BlockBase = Block( grid_entry=self.grid_entry(), grid_shape=self.grid_shape(), shape=self.shape(), dtype=self.dtype(), transposed=False, - km=child_block._km, - ) - block.oid = child_block._km.reduce_axis( - op_name=op_name, - arr=child_block.oid, - axis=self.axis, - keepdims=self.keepdims, - transposed=child_block.transposed, - syskwargs={"device": device}, + km=child_block.km, ) + if self.dense_kernel: + block.oid = child_block.km.reduce_axis( + op_name=op_name, + arr=child_block.oid, + axis=self.axis, + keepdims=self.keepdims, + transposed=child_block.transposed, + syskwargs={"device": device}, + ) + else: + block.oid = child_block.km.sparse_reduce_axis( + op_name=op_name, + arr=child_block.oid, + axis=self.axis, + keepdims=self.keepdims, + transposed=child_block.transposed, + syskwargs={"device": device}, + ) block._device = device leaf: Leaf = Leaf(self.cluster_state) leaf.block = block - leaf.tree_node_size = self.child.tree_node_size.reduce_axis( + assert isinstance(self.child.tree_node_meta, LeafMeta) + leaf.tree_node_meta = self.child.tree_node_meta.reduce_axis( self.op_name, self.axis, self.keepdims, - self.child.block.transposed, ) leaf.copy_on_op = self.copy_on_op return leaf, block def _mem_cost(self): assert isinstance(self.child, Leaf) - block: Block = self.child.block + block: BlockBase = self.child.block if block.is_dense: return np.product(self.shape()) if sync_nnz > 1: - self.child.tree_node_size.nnz = block.nnz # Blocking fetch - return self.child.tree_node_size.reduce_axis( + self.child.tree_node_meta.nnz = block.nnz # Blocking fetch + assert isinstance(self.child.tree_node_meta, LeafMeta) + return self.child.tree_node_meta.reduce_axis( self.op_name, self.axis, self.keepdims, - block.transposed, ).nbytes def update_tuple_property(self, val, keep_dim_val: Union[int, tuple] = 1): @@ -519,18 +562,25 @@ def dtype(self): def expression(self): if self._expression is None: - self._expression = "ReduceAxis(op=%s, x=%s, axis=%s, keepdims=%s)" % ( - self.op_name, - self.child.expression(), - str(self.axis), - str(self.keepdims), + self._expression = ( + "ReduceAxis(op=%s, axis=%s, keepdims=%s, dense_kernel=%s, x=%s)" + % ( + self.op_name, + str(self.axis), + str(self.keepdims), + self.dense_kernel, + self.child.expression(), + ) ) return self._expression def fuse(self, func_node, km: KernelManager): - child_op, child_args = 
self.child.fuse(func_node, km) + child_op, child_args, child_meta = self.child.fuse(func_node, km) - self_op = km.get_fuseable("reduce_axis") + if self.dense_kernel: + self_op = km.get_fuseable("reduce_axis") + else: + self_op = km.get_fuseable("sparse_reduce_axis") kwargs = { "op_name": self.op_name, "axis": self.axis, @@ -543,7 +593,11 @@ def fuse(self, func_node, km: KernelManager): def fused(*args): return self_op(arr=child_op(*args)) - return fused, child_args + return ( + fused, + child_args, + child_meta.reduce_axis(self.op_name, self.axis, self.keepdims), + ) class BinaryOp(TreeNode): @@ -563,11 +617,12 @@ def __repr__(self): "matmul": "@", "tensordot": "@", }[self.op_name] - return "BOp(id=%s, op=%s%s%s)" % ( + return "BOp(id=%s, op=%s%s%s, dense_kernel=%s)" % ( self.tree_node_id, str(self.left.tree_node_id), bop_symbol, str(self.right.tree_node_id), + self.dense_kernel, ) def get_children(self): @@ -591,6 +646,9 @@ def copy(self, cluster_state, parent=None, new_ids=False): bop.left = self.left.copy(cluster_state, bop, new_ids=new_ids) bop.right = self.right.copy(cluster_state, bop, new_ids=new_ids) bop.copy_on_op = self.copy_on_op + + bop.tree_node_meta = self.tree_node_meta + bop.dense_kernel = self.dense_kernel return bop def update_child(self, old_children, new_children): @@ -661,7 +719,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: assert isinstance(self.left, Leaf) and isinstance(self.right, Leaf) result = self._collapse(device) new_leaf: Leaf = result[0] - new_block: Block = result[1] + new_block: BlockBase = result[1] # This updates load on nodes and channels. # This also updates block states to indicate that they now reside on the provided nodes. # Update the cluster state after computing the leaf, so that transfer costs are properly @@ -672,7 +730,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: # Update cluster state with new block. # self.cluster_state.add_block(new_block.id, new_block.size(), [device]) self.cluster_state.add_block( - new_block.id, new_leaf.tree_node_size.nbytes, [device] + new_block.id, new_leaf.tree_node_meta.nbytes, [device] ) if not self.cluster_state.created_on_only: assert self.cluster_state.blocks_local( @@ -690,8 +748,8 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: def _collapse(self, device: Device): assert isinstance(self.left, Leaf) and isinstance(self.right, Leaf) - lblock: Block = self.left.block - rblock: Block = self.right.block + lblock: BlockBase = self.left.block + rblock: BlockBase = self.right.block if self.op_name == "matmul": op_name, args = "tensordot", {"axes": 1} elif self.op_name == "tensordot": @@ -699,12 +757,14 @@ def _collapse(self, device: Device): else: op_name, args = self.op_name, {} assert array_utils.can_broadcast_shapes(lblock.shape, rblock.shape) - block: Block = lblock.bop(op_name, rblock, args=args, device=device) + block: BlockBase = lblock.bop(op_name, rblock, args=args, device=device) leaf: Leaf = Leaf(self.cluster_state) leaf.block = block - leaf.tree_node_size = self.left.tree_node_size.bop( + assert isinstance(self.left.tree_node_meta, LeafMeta) + assert isinstance(self.right.tree_node_meta, LeafMeta) + leaf.tree_node_meta = self.left.tree_node_meta.bop( op_name, - self.right.tree_node_size, + self.right.tree_node_meta, **args, ) leaf.copy_on_op = self.copy_on_op @@ -714,8 +774,8 @@ def _mem_cost(self): # Computes the memory required to perform this operation. # We approximate by just computing the memory required to store the result. 
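+        # For sparse operands, the result size comes from LeafMeta.bop's
+        # binomial nnz estimate; with settings.sync_nnz > 1 the estimate is
+        # replaced by the exact nnz via a blocking fetch.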
assert isinstance(self.left, Leaf) and isinstance(self.right, Leaf)
-        lblock: Block = self.left.block
-        rblock: Block = self.right.block
+        lblock: BlockBase = self.left.block
+        rblock: BlockBase = self.right.block
         if self.op_name == "matmul":
             op_name, args = "tensordot", {"axes": 1}
         elif self.op_name == "tensordot":
@@ -726,11 +786,13 @@ def _mem_cost(self):
         if lblock.is_dense and rblock.is_dense:
             return np.product(self.shape())
         if sync_nnz > 1:
-            self.left.tree_node_size.nnz = lblock.nnz  # Blocking fetch
-            self.right.tree_node_size.nnz = rblock.nnz  # Blocking fetch
-        return self.left.tree_node_size.bop(
+            self.left.tree_node_meta.nnz = lblock.nnz  # Blocking fetch
+            self.right.tree_node_meta.nnz = rblock.nnz  # Blocking fetch
+        assert isinstance(self.left.tree_node_meta, LeafMeta)
+        assert isinstance(self.right.tree_node_meta, LeafMeta)
+        return self.left.tree_node_meta.bop(
             op_name,
-            self.right.tree_node_size,
+            self.right.tree_node_meta,
             **args,
         ).nbytes

@@ -801,27 +863,44 @@ def expression(self):
         if self._expression is None:
             if self.op_name == "matmul" or self.op_name == "tensordot":
                 axes = self.args.get("axes", 1)
-                self._expression = "BinaryOp(op=%s, x=%s, y=%s, axes=%s)" % (
-                    self.op_name,
-                    self.left.expression(),
-                    self.right.expression(),
-                    axes,
-                )
-            self._expression = "BinaryOp(op=%s, x=%s, y=%s)" % (
-                self.op_name,
-                self.left.expression(),
-                self.right.expression(),
-            )
+                self._expression = (
+                    "BinaryOp(op=%s, axes=%s, dense_kernel=%s, x=%s, y=%s)"
+                    % (
+                        self.op_name,
+                        axes,
+                        self.dense_kernel,
+                        self.left.expression(),
+                        self.right.expression(),
+                    )
+                )
+            else:
+                self._expression = "BinaryOp(op=%s, dense_kernel=%s, x=%s, y=%s)" % (
+                    self.op_name,
+                    self.dense_kernel,
+                    self.left.expression(),
+                    self.right.expression(),
+                )
         return self._expression

     def fuse(self, func_node, km: KernelManager):
-        left_op, left_args = self.left.fuse(func_node, km)
-        right_op, right_args = self.right.fuse(func_node, km)
-
-        self_op = km.get_fuseable("bop")
+        left_op, left_args, left_meta = self.left.fuse(func_node, km)
+        right_op, right_args, right_meta = self.right.fuse(func_node, km)
         axes = 1 if self.args is None else self.args.get("axes", 1)
-        self_op = partial(self_op, op=self.op_name, a1_T=False, a2_T=False, axes=axes)
+        if self.dense_kernel:
+            self_op = km.get_fuseable("bop")
+            self_op = partial(
+                self_op, op=self.op_name, a1_T=False, a2_T=False, axes=axes
+            )
+        else:
+            self_op = km.get_fuseable("sparse_bop")
+            self_op = partial(
+                self_op,
+                op=self.op_name,
+                a1_T=False,
+                a2_T=False,
+                axes=axes,
+                densify=self.tree_node_meta.is_dense,
+            )
         num_left = len(left_args)
         # Combine the left and right args.
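+        # The fused closure takes the concatenated leaf args; num_left marks
+        # where to split them between the left and right subgraphs.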
args = left_args + right_args @@ -836,7 +915,21 @@ def fused(*args): args2 = args[num_left:] return self_op(a1=left_op(*args1), a2=right_op(*args2)) - return fused, args + if self.op_name == "matmul": + op_name, extra_args = "tensordot", {"axes": 1} + elif self.op_name == "tensordot": + op_name, extra_args = "tensordot", self.args + else: + op_name, extra_args = self.op_name, {} + return ( + fused, + args, + left_meta.bop( + op_name, + right_meta, + **extra_args, + ), + ) class FunctionNode(TreeNode): @@ -867,7 +960,7 @@ def finalize(self, km: KernelManager): km.register(self.op_hash, self.op_func, {}) def __repr__(self): - return "Function(id=%s, op=%s, args=%s" % ( + return "Function(id=%s, op=%s, args=%s)" % ( self.tree_node_id, self.op_hash, len(self.children), @@ -895,11 +988,13 @@ def copy(self, cluster_state, parent=None, new_ids=False): fnode.parent = parent fnode.op_hash = self.op_hash fnode.op_func = self.op_func - fnode.op_expression = self.op_expression fnode.children = [ child.copy(cluster_state, fnode, new_ids=new_ids) for child in self.children ] fnode.copy_on_op = self.copy_on_op + + fnode.tree_node_meta = self.tree_node_meta + fnode.dense_kernel = self.dense_kernel return fnode def update_child(self, old_children, new_children): @@ -981,7 +1076,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: block_ids = [child.block.id for child in self.children] result = self._collapse(device) new_leaf: Leaf = result[0] - new_block: Block = result[1] + new_block: BlockBase = result[1] # This updates load on nodes and channels. # This also updates block states to indicate that they now reside on the provided nodes. # Update the cluster state after computing the leaf, so that transfer costs are properly @@ -1006,16 +1101,21 @@ def _collapse(self, device: Device): assert isinstance(child, Leaf) block_oids.append(child.block.oid) if km is None: - km = child.block._km - block: Block = Block( + km = child.block.km + if self.tree_node_meta.is_dense: + block_type = Block + else: + block_type = SparseBlock + block: BlockBase = block_type( self._grid_entry, self._grid_shape, self._shape, self._dtype, False, km ) block._device = device - block.oid = block._km.call( + block.oid = block.km.call( self.op_hash, *block_oids, syskwargs={"device": device} ) leaf: Leaf = Leaf(self.cluster_state) leaf.block = block + leaf.tree_node_meta = self.tree_node_meta leaf.copy_on_op = self.copy_on_op return leaf, block @@ -1063,7 +1163,7 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None): self.children: List[TreeNode] = None def __repr__(self): - return "Einsum(id=%s, subscript=%s, operands=%s" % ( + return "Einsum(id=%s, subscript=%s, operands=%s)" % ( self.tree_node_id, self.subscript, len(self.children), @@ -1175,7 +1275,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: block_ids = [child.block.id for child in self.children] result = self._collapse(device) new_leaf: Leaf = result[0] - new_block: Block = result[1] + new_block: BlockBase = result[1] # This updates load on nodes and channels. # This also updates block states to indicate that they now reside on the provided nodes. # Update the cluster state after computing the leaf, so that transfer costs are properly @@ -1184,7 +1284,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: # Update cluster state with new block. 
# self.cluster_state.add_block(new_block.id, new_block.size(), [device]) self.cluster_state.add_block( - new_block.id, new_leaf.tree_node_size.nbytes, [device] + new_block.id, new_leaf.tree_node_meta.nbytes, [device] ) if not self.cluster_state.created_on_only: for block_id in block_ids: @@ -1203,19 +1303,22 @@ def _collapse(self, device: Device): assert isinstance(child, Leaf) block_oids.append(child.block.oid) if km is None: - km = child.block._km - block: Block = Block( + km = child.block.km + block: BlockBase = Block( self.grid_entry(), self.grid_shape(), self.shape(), self.dtype(), False, km ) block._device = device - block.oid = block._km.einsum( + block.oid = block.km.einsum( self.subscript, *block_oids, syskwargs={"device": device} ) leaf: Leaf = Leaf(self.cluster_state) leaf.block = block # Assume dense for simplicity. - leaf.tree_node_size = TreeNodeSize( - self.shape(), np.prod(self.shape()), block.dtype, block.fill_value + leaf.tree_node_meta = LeafMeta( + self.shape(), + np.prod(self.shape()), + block.dtype, + block.is_dense, ) leaf.copy_on_op = self.copy_on_op return leaf, block diff --git a/nums/experimental/optimizer/grapharray.py b/nums/experimental/optimizer/grapharray.py index 5f604118..07de2b6f 100644 --- a/nums/experimental/optimizer/grapharray.py +++ b/nums/experimental/optimizer/grapharray.py @@ -23,7 +23,7 @@ from nums.core.settings import sync_nnz from nums.core.array import utils as array_utils -from nums.core.array.base import BlockArrayBase, Block +from nums.core.array.base import BlockBase, BlockArrayBase from nums.core.array.blockarray import BlockArray from nums.core.array.sparse import SparseBlockArray from nums.core.kernel.kernel_manager import KernelManager @@ -35,12 +35,13 @@ Leaf, UnaryOp, ReduceAxis, + FunctionNode, Einsum, ) from nums.experimental.optimizer.reduction_ops import TreeReductionOp from nums.experimental.optimizer.fusion import FuseGraph from nums.experimental.optimizer.fusion_utils import set_using_marker, traverse_marker -from nums.experimental.optimizer.size import TreeNodeSize +from nums.experimental.optimizer.node_meta import LeafMeta class GraphArray(object): @@ -50,7 +51,7 @@ def graphs_from_ba( ) -> np.ndarray: graphs = np.empty(shape=ba.grid.grid_shape, dtype=np.object) for grid_entry in ba.grid.get_entry_iterator(): - block: Block = ba.blocks[grid_entry] + block: BlockBase = ba.blocks[grid_entry] # Allocate the block to the node on which it's created. km: KernelManager = ba.km device: Device = km.device_grid.get_device( @@ -66,17 +67,18 @@ def graphs_from_ba( nnz = block.nnz # Blocking fetch else: nnz = np.prod(block.shape) - leaf.tree_node_size = TreeNodeSize( + leaf.tree_node_meta = LeafMeta( block.shape, nnz, block.dtype, - block.fill_value, + block.is_dense, ) + leaf.dense_kernel = block.is_dense leaf.copy_on_op = copy_on_op graphs[grid_entry] = leaf cluster_state.add_block( - block.id, leaf.tree_node_size.nbytes, devices=[device] + block.id, leaf.tree_node_meta.nbytes, devices=[device] ) cluster_state.init_mem_load(device, block.id) return graphs @@ -110,9 +112,7 @@ def to_ba(self): ), "Cannot convert unsolved GraphArray to BlockArray." 
if sample_node.block.is_dense: return BlockArray(self.grid.copy(), self.km, self.to_blocks()) - return SparseBlockArray( - self.grid.copy(), self.km, sample_node.block.fill_value, self.to_blocks() - ) + return SparseBlockArray(self.grid.copy(), self.km, self.to_blocks()) def __init__( self, @@ -166,7 +166,7 @@ def iterator(self): yield node def to_blocks(self) -> np.ndarray: - blocks: np.ndarray = np.empty(self.grid.grid_shape, dtype=Block) + blocks: np.ndarray = np.empty(self.grid.grid_shape, dtype=BlockBase) for grid_entry in self.grid.get_entry_iterator(): leaf: TreeNode = self.graphs[grid_entry] assert isinstance(leaf, Leaf), "%s,%s" % (str(leaf), type(leaf)) @@ -176,28 +176,29 @@ def to_blocks(self) -> np.ndarray: def other_to_ga(self, other): return GraphArray.to_ga(other, self.cluster_state, self.km, self.copy_on_op) - def tensordot(self, other, axes=2): - other = self.other_to_ga(other) + @staticmethod + def tensordot(a, b, axes=2): + b = a.other_to_ga(b) # TODO: Reuse BlockArrayBase tensordot operator. - this_axes = self.grid.grid_shape[:-axes] - this_sum_axes = self.grid.grid_shape[-axes:] - other_axes = other.grid.grid_shape[axes:] - other_sum_axes = other.grid.grid_shape[:axes] - assert this_sum_axes == other_sum_axes - result_shape = tuple(self.shape[:-axes] + other.shape[axes:]) - result_block_shape = tuple(self.block_shape[:-axes] + other.block_shape[axes:]) + a_axes = a.grid.grid_shape[:-axes] + a_sum_axes = a.grid.grid_shape[-axes:] + b_axes = b.grid.grid_shape[axes:] + b_sum_axes = b.grid.grid_shape[:axes] + assert a_sum_axes == b_sum_axes + result_shape = tuple(a.shape[:-axes] + b.shape[axes:]) + result_block_shape = tuple(a.block_shape[:-axes] + b.block_shape[axes:]) result_grid = ArrayGrid( shape=result_shape, block_shape=result_block_shape, - dtype=self.dtype.__name__, + dtype=a.dtype.__name__, ) - assert result_grid.grid_shape == tuple(this_axes + other_axes) + assert result_grid.grid_shape == tuple(a_axes + b_axes) result_graphs = np.empty(shape=result_grid.grid_shape, dtype=np.object) - this_dims = list(itertools.product(*map(range, this_axes))) - other_dims = list(itertools.product(*map(range, other_axes))) - sum_dims = list(itertools.product(*map(range, this_sum_axes))) - for i in this_dims: - for j in other_dims: + a_dims = list(itertools.product(*map(range, a_axes))) + b_dims = list(itertools.product(*map(range, b_axes))) + sum_dims = list(itertools.product(*map(range, a_sum_axes))) + for i in a_dims: + for j in b_dims: # A \in \R^{I \times K} # B \in \R^{K \times J} # C \in \R^{I \times J} @@ -205,38 +206,41 @@ def tensordot(self, other, axes=2): grid_entry = tuple(i + j) if len(sum_dims) == 1: k = sum_dims[0] - self_node: TreeNode = self.graphs[tuple(i + k)] - other_node: TreeNode = other.graphs[tuple(k + j)] - dot_node: TreeNode = self_node.tensordot(other_node, axes=axes) + a_node: TreeNode = a.graphs[tuple(i + k)] + b_node: TreeNode = b.graphs[tuple(k + j)] + dot_node: TreeNode = a_node.tensordot(b_node, axes=axes) result_graphs[grid_entry] = dot_node else: - rop = TreeReductionOp(self.cluster_state) + rop = TreeReductionOp(a.cluster_state) rop.set_grid_entry(grid_entry) rop.set_grid_shape(result_grid.grid_shape) rop.op_name = "sum" - rop.copy_on_op = self.copy_on_op + rop.copy_on_op = a.copy_on_op for k in sum_dims: - self_node: TreeNode = self.graphs[tuple(i + k)] - other_node: TreeNode = other.graphs[tuple(k + j)] - dot_node: TreeNode = self_node.tensordot(other_node, axes=axes) + a_node: TreeNode = a.graphs[tuple(i + k)] + b_node: TreeNode = 
b.graphs[tuple(k + j)] + dot_node: TreeNode = a_node.tensordot(b_node, axes=axes) # Explicitly add parent here, since sum depends on prod. # Not needed for other ops; make_bop takes care of it. # We don't need to copy the node here since the local # tree structure here is never exposed. dot_node.parent = rop rop.add_child(dot_node) + rop.tree_node_meta = dot_node.tree_node_meta + # Assume children are all dense or all sparse. + rop.dense_kernel = dot_node.tree_node_meta.is_dense result_graphs[grid_entry] = rop return GraphArray( result_grid, - self.cluster_state, + a.cluster_state, result_graphs, - self.km, - copy_on_op=self.copy_on_op, + a.km, + copy_on_op=a.copy_on_op, ) def __matmul__(self, other): - return self.tensordot(other, axes=1) + return self.tensordot(self, other, axes=1) def ga_from_arr(self, arr: Union[TreeNode, np.ndarray], result_shape: tuple): if isinstance(arr, TreeNode): @@ -370,6 +374,8 @@ def _add_uop(self, op_name, grid_entry, old_arr, new_arr): assert old_root.parent is None uop.child = old_root old_root.parent = uop + uop.tree_node_meta = old_root.tree_node_meta.uop_partial(op_name) + uop.dense_kernel = old_root.tree_node_meta.is_dense new_arr[grid_entry] = uop def reduce_axis(self, op_name, axis, keepdims): @@ -389,6 +395,12 @@ def reduce_axis(self, op_name, axis, keepdims): reduced_tnode.op_name = op_name reduced_tnode.axis = axis reduced_tnode.keepdims = keepdims + reduced_tnode.tree_node_meta = tnode.tree_node_meta.reduce_axis_partial( + op_name, + axis, + keepdims, + ) + reduced_tnode.dense_kernel = tnode.tree_node_meta.is_dense reduced_graphs[grid_entry] = reduced_tnode # Compute output GraphArray properties. @@ -443,6 +455,8 @@ def reduce_axis(self, op_name, axis, keepdims): rop.add_child(child) assert child.parent is None child.parent = rop + rop.tree_node_meta = child.tree_node_meta + rop.dense_kernel = child.tree_node_meta.is_dense if result_grid.shape == (): # keepdims = False. result_graphs[()] = rop @@ -470,6 +484,8 @@ def reduce_axis(self, op_name, axis, keepdims): rop.add_child(child) assert child.parent is None child.parent = rop + rop.tree_node_meta = child.tree_node_meta + rop.dense_kernel = child.tree_node_meta.is_dense result_graphs[result_grid_entry] = rop return GraphArray( @@ -486,16 +502,22 @@ def sum(self, axis=None, keepdims=False): def compile(self, max_args: int): result_graphs = np.empty_like(self.graphs, dtype=self.graphs.dtype) counter = 0 + first_grid_entry = (0,) * len(self.grid.shape) for grid_entry in self.grid.get_entry_iterator(): graph = self.graphs[grid_entry] _, leaf_inputs = traverse_marker(graph, 0) - if grid_entry == (0,) or grid_entry == (0, 0): # generic - result_graphs[grid_entry] = FuseGraph( - graph, self.km, max_args=max_args - )() - fused_graph = result_graphs[grid_entry] - fused_graph.op_expression = fused_graph._expression + if grid_entry == first_grid_entry: + fused_graph = FuseGraph(graph, self.km, max_args=max_args)() + if not isinstance(fused_graph, FunctionNode): + # Stopgap as this function currently assumes root is FunctionNode. + return self + result_graphs[grid_entry] = fused_graph + # result_graphs[grid_entry] = FuseGraph( + # graph, self.km, max_args=max_args + # )() + # fused_graph = result_graphs[grid_entry] else: + # TODO: support subtree fusion in this section. FuseGraph already does. 
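+                # Reuse the graph fused at the first grid entry: copy it with
+                # new ids, rebind this entry's leaf inputs via the traversal
+                # markers, and retarget the copy's grid entry.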
fused_graph_copy = fused_graph.copy(self.cluster_state, new_ids=True) fused_graph_copy = set_using_marker(fused_graph_copy, leaf_inputs) fused_graph_copy.set_grid_entry(grid_entry) diff --git a/nums/experimental/optimizer/size.py b/nums/experimental/optimizer/node_meta.py similarity index 63% rename from nums/experimental/optimizer/size.py rename to nums/experimental/optimizer/node_meta.py index e813259b..236fe580 100644 --- a/nums/experimental/optimizer/size.py +++ b/nums/experimental/optimizer/node_meta.py @@ -3,55 +3,72 @@ from nums.core.array import utils as array_utils -# TODO: integrate this class with FuseGraph. -class TreeNodeSize: +class TreeNodeMeta: """ - Encapsulated by each TreeNode to keep track of estimated or observed sizes of blocks. + Encapsulated by each TreeNode to keep track of node metadata like shape and output density. + """ + + def __init__(self, dtype, is_dense): + self.dtype = dtype + self.is_dense = is_dense + + def copy(self): + return TreeNodeMeta( + self.dtype, + self.is_dense, + ) + + def uop_partial(self, op_name): + """ + Does not update nnz, as it can't be estimated for non-leaf nodes. + """ + is_dense = self.is_dense or array_utils.get_sparse_uop_densify(op_name) + return TreeNodeMeta( + dtype=array_utils.get_uop_output_type(op_name, self.dtype), + is_dense=is_dense, + ) + + def reduce_axis_partial(self, op_name, axis, keepdims): + return TreeNodeMeta( + dtype=array_utils.get_uop_output_type(op_name, self.dtype), + is_dense=True, + ) + + def bop_partial(self, op_name, other, **kwargs): + dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype) + is_dense = array_utils.get_sparse_bop_densify( + op_name, + self.is_dense, + other.is_dense, + ) + return TreeNodeMeta(dtype, is_dense) + + +class LeafMeta(TreeNodeMeta): + """ + Encapsulated by each Leaf to keep track of sizes of blocks in addition to TreeNodeMeta. Sparse binary operations use B(n, p) to estimate nnz as needed. 
""" - def __init__(self, shape, nnz, dtype, fill_value=None, index_dtype=np.int64): + def __init__(self, shape, nnz, dtype, is_dense, index_dtype=np.int64): + super().__init__(dtype, is_dense) self.shape = shape self.nnz = nnz - self.dtype = dtype - self.fill_value = fill_value self.index_dtype = index_dtype if self.is_dense: assert nnz == np.prod(shape) else: assert index_dtype is not None - self.bop_estimation_map = { - "add": self.add, - "sum": self.add, - "sub": self.add, - "mul": self.mul, - "prod": self.mul, - "truediv": self.truediv, - "pow": self.pow, - "matmul": lambda other: self.tensordot(other, axes=1), - "tensordot": self.tensordot, - "lt": lambda other: self.inequality("lt", other), - "le": lambda other: self.inequality("le", other), - "gt": lambda other: self.inequality("gt", other), - "ge": lambda other: self.inequality("ge", other), - "eq": lambda other: self.inequality("eq", other), - "ne": lambda other: self.inequality("ne", other), - } - def copy(self): - return TreeNodeSize( + return LeafMeta( self.shape, self.nnz, self.dtype, - self.fill_value, + self.is_dense, self.index_dtype, ) - @property - def is_dense(self): - return self.fill_value is None - @property def nbytes(self): if self.is_dense: @@ -65,40 +82,49 @@ def nbytes(self): def uop(self, op_name): if op_name == "transpose": - return TreeNodeSize( + return LeafMeta( shape=tuple(reversed(self.shape)), nnz=self.nnz, dtype=self.dtype, - fill_value=self.fill_value, + is_dense=self.is_dense, ) - if self.is_dense: - fill_value = None - else: - fill_value = np.__getattribute__(op_name)(self.fill_value) - return TreeNodeSize( + nnz = self.nnz + is_dense = self.is_dense + if not is_dense and array_utils.get_sparse_uop_densify(op_name): + nnz = np.prod(self.shape) + is_dense = True + return LeafMeta( shape=self.shape, - nnz=self.nnz, + nnz=nnz, dtype=array_utils.get_uop_output_type(op_name, self.dtype), - fill_value=fill_value, + is_dense=is_dense, ) - def reduce_axis(self, op_name, axis, keepdims, transposed): - # Assume dense for now + def reduce_axis(self, op_name, axis, keepdims): shape = list(self.shape) - if transposed: - shape.reverse() if axis is None: shape = [] elif keepdims: shape[axis] = 1 else: shape.pop(axis) - return TreeNodeSize( - shape=tuple(shape), - nnz=np.prod(shape), - dtype=array_utils.get_uop_output_type(op_name, self.dtype), - fill_value=None, - ) + if self.is_dense: + return LeafMeta( + shape=tuple(shape), + nnz=np.prod(shape), + dtype=array_utils.get_uop_output_type(op_name, self.dtype), + is_dense=True, + ) + else: + # If any element in reduced axis is nonzero, result is nonzero. + p1 = self.nnz / np.prod(self.shape) + nnz = int((1 - (1 - p1) ** self.shape[axis]) * np.prod(shape)) + return LeafMeta( + shape=tuple(shape), + nnz=nnz, + dtype=array_utils.get_uop_output_type(op_name, self.dtype), + is_dense=False, + ) def _nnz_disjunction(self, other, shape): # If either element is nonzero, result is nonzero. @@ -118,7 +144,6 @@ def _nnz_conjunction(self, other, shape): def _nnz_selection(self, other, shape): # If self element is nonzero, result is nonzero. 
-        assert self.fill_value == 0
         n1 = np.prod(self.shape)
         p1 = self.nnz / n1
         return int(p1 * np.prod(shape))

@@ -126,29 +151,27 @@ def add(self, other):
         shape = array_utils.broadcast_shape(self.shape, other.shape)
         dtype = array_utils.get_bop_output_type("add", self.dtype, other.dtype)
-        is_dense = array_utils.get_sparse_bop_return_type(
+        is_dense = array_utils.get_sparse_bop_densify(
             "add",
-            self.fill_value,
-            other.fill_value,
+            self.is_dense,
+            other.is_dense,
         )
         if is_dense:
-            return TreeNodeSize(
+            return LeafMeta(
                 shape=shape,
                 nnz=np.prod(shape),
                 dtype=dtype,
-                fill_value=None,
+                is_dense=True,
             )
         if self.is_dense or other.is_dense:
             raise ValueError(
-                "TreeNodeSize.__add__ is inconsistent with sparse bop rules."
+                "LeafMeta.add is inconsistent with sparse bop rules."
             )
-        return TreeNodeSize(
+        return LeafMeta(
             shape=shape,
             nnz=self._nnz_disjunction(other, shape),
             dtype=dtype,
-            fill_value=array_utils.get_bop_fill_value(
-                "add", self.fill_value, other.fill_value
-            ),
+            is_dense=False,
         )

     __add__ = add

@@ -156,42 +179,33 @@ def add(self, other):
     def mul(self, other):
         shape = array_utils.broadcast_shape(self.shape, other.shape)
         dtype = array_utils.get_bop_output_type("mul", self.dtype, other.dtype)
-        is_dense = array_utils.get_sparse_bop_return_type(
+        is_dense = array_utils.get_sparse_bop_densify(
             "mul",
-            self.fill_value,
-            other.fill_value,
+            self.is_dense,
+            other.is_dense,
         )
         if is_dense:
-            return TreeNodeSize(
+            return LeafMeta(
                 shape=shape,
                 nnz=np.prod(shape),
                 dtype=dtype,
-                fill_value=None,
+                is_dense=True,
             )
         if not self.is_dense and not other.is_dense:
-            if self.fill_value == 0 and other.fill_value == 0:
-                nnz = self._nnz_conjunction(other, shape)
-            elif self.fill_value == 0:
-                nnz = self._nnz_selection(other, shape)
-            elif other.fill_value == 0:
-                nnz = other._nnz_selection(self, shape)
-            else:
-                nnz = self._nnz_disjunction(other, shape)
-        elif self.fill_value == 0 and other.is_dense:
+            nnz = self._nnz_conjunction(other, shape)
+        elif not self.is_dense and other.is_dense:
             nnz = self._nnz_selection(other, shape)
-        elif self.is_dense and other.fill_value == 0:
+        elif self.is_dense and not other.is_dense:
             nnz = other._nnz_selection(self, shape)
         else:
             raise ValueError(
-                "TreeNodeSize.__mul__ is inconsistent with sparse bop rules."
+                "LeafMeta.mul is inconsistent with sparse bop rules."
             )
-        return TreeNodeSize(
+        return LeafMeta(
             shape=shape,
             nnz=nnz,
             dtype=dtype,
-            fill_value=array_utils.get_bop_fill_value(
-                "mul", self.fill_value, other.fill_value
-            ),
+            is_dense=False,
         )

     __mul__ = mul

@@ -199,35 +213,33 @@ def mul(self, other):
     def truediv(self, other):
         shape = array_utils.broadcast_shape(self.shape, other.shape)
         dtype = array_utils.get_bop_output_type("truediv", self.dtype, other.dtype)
-        is_dense = array_utils.get_sparse_bop_return_type(
+        is_dense = array_utils.get_sparse_bop_densify(
             "truediv",
-            self.fill_value,
-            other.fill_value,
+            self.is_dense,
+            other.is_dense,
         )
         if is_dense:
-            return TreeNodeSize(
+            return LeafMeta(
                 shape=shape,
                 nnz=np.prod(shape),
                 dtype=dtype,
-                fill_value=None,
+                is_dense=True,
             )
         if self.is_dense or other.is_dense:
             raise ValueError(
-                "TreeNodeSize.__add__ is inconsistent with sparse bop rules."
+                "LeafMeta.truediv is inconsistent with sparse bop rules."
            )
-        if other.fill_value == 0:  # nan
-            nnz = other._nnz_selection(self, shape)
-        elif self.fill_value == 0:
-            nnz = self._nnz_selection(other, shape)
-        else:
-            nnz = self._nnz_disjunction(other, shape)
-        return TreeNodeSize(
+        # Both operands are sparse here: division by an implicit zero yields
+        # nan, so estimate nnz from the denominator's nonzeros.
+        nnz = other._nnz_selection(self, shape)
+        return LeafMeta(
             shape=shape,
             nnz=nnz,
             dtype=dtype,
-            fill_value=array_utils.get_bop_fill_value(
-                "truediv", self.fill_value, other.fill_value
-            ),
+            is_dense=False,
         )

     __truediv__ = truediv

@@ -248,22 +260,17 @@ def tensordot(self, other, axes):
         shape = tuple(self.shape + other.shape)
         sum_shape = (1,)
         dtype = array_utils.get_bop_output_type("tensordot", self.dtype, other.dtype)
-        is_dense = array_utils.get_sparse_bop_return_type(
-            "tensordot", self.fill_value, other.fill_value
+        is_dense = array_utils.get_sparse_bop_densify(
+            "tensordot", self.is_dense, other.is_dense
         )
         if is_dense:
-            return TreeNodeSize(
+            return LeafMeta(
                 shape=shape,
                 nnz=np.prod(shape),
                 dtype=dtype,
-                fill_value=None,
+                is_dense=True,
             )
-        if (
-            self.is_dense
-            or other.is_dense
-            or self.fill_value != 0
-            or other.fill_value != 0
-        ):
+        if self.is_dense or other.is_dense:
             raise ValueError(
-                "TreeNodeSize.tensordot is inconsistent with sparse bop rules."
+                "LeafMeta.tensordot is inconsistent with sparse bop rules."
             )
@@ -273,36 +280,50 @@
         p2 = other.nnz / n2
         m = np.prod(shape)
         k = np.prod(sum_shape)
-        return TreeNodeSize(
+        return LeafMeta(
             shape=shape,
             nnz=int((1 - (1 - p1 * p2) ** k) * m),
             dtype=dtype,
-            fill_value=0,
+            is_dense=False,
         )

     def inequality(self, op_name, other):
         assert other.shape == ()
         dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype)
-        fill_value = array_utils.get_bop_fill_value(
-            op_name, self.fill_value, other.fill_value
-        )
-        return TreeNode(
+        return LeafMeta(
             shape=self.shape,
             nnz=self.nnz,
             dtype=dtype,
-            fill_value=fill_value,
+            is_dense=self.is_dense,
         )

-    def bop_dense(self, other):
+    def bop_dense(self, other, **kwargs):
         # NOTE: as catch-all fallback, this may be wrong for unknown non-elementwise binary ops.
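+        # Fall back to a dense result over the broadcast shape, i.e. nnz equals
+        # the full element count; a conservative upper bound on memory.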
shape = array_utils.broadcast_shape(self.shape, other.shape)
         dtype = array_utils.get_bop_output_type("add", self.dtype, other.dtype)
-        return TreeNodeSize(
+        return LeafMeta(
             shape=shape,
             nnz=np.prod(shape),
             dtype=dtype,
-            fill_value=None,
+            is_dense=True,
         )

     def bop(self, op_name, other, **kwargs):
-        return self.bop_estimation_map.get(op_name, self.bop_dense)(other, **kwargs)
+        bop_estimation_map = {
+            "add": self.add,
+            "sum": self.add,
+            "sub": self.add,
+            "mul": self.mul,
+            "prod": self.mul,
+            "truediv": self.truediv,
+            "pow": self.pow,
+            "matmul": lambda other: self.tensordot(other, axes=1),
+            "tensordot": self.tensordot,
+            "lt": lambda other: self.inequality("lt", other),
+            "le": lambda other: self.inequality("le", other),
+            "gt": lambda other: self.inequality("gt", other),
+            "ge": lambda other: self.inequality("ge", other),
+            "eq": lambda other: self.inequality("eq", other),
+            "ne": lambda other: self.inequality("ne", other),
+        }
+        return bop_estimation_map.get(op_name, self.bop_dense)(other, **kwargs)
diff --git a/nums/experimental/optimizer/reduction_ops.py b/nums/experimental/optimizer/reduction_ops.py
index d5879d15..50228ad3 100644
--- a/nums/experimental/optimizer/reduction_ops.py
+++ b/nums/experimental/optimizer/reduction_ops.py
@@ -20,7 +20,7 @@
 import numpy as np

 from nums.core.settings import sync_nnz
-from nums.core.array.base import Block
+from nums.core.array.base import BlockBase
 from nums.core.grid.grid import Device
 from nums.core.kernel.kernel_manager import KernelManager
 from nums.experimental.optimizer.clusterstate import ClusterState
@@ -41,10 +41,11 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None, seed=1337):
         self.action_leaf_q = []

     def __repr__(self):
-        return "Reduc(id=%s, op=%s, in=%d)" % (
+        return "Reduc(id=%s, op=%s, in=%d, dense_kernel=%s)" % (
             str(self.tree_node_id),
             self.op_name,
             len(self.children_dict),
+            self.dense_kernel,
         )

     def get_children(self):
@@ -83,6 +84,9 @@ def copy(self, cluster_state, parent=None, new_ids=False):
             if child.tree_node_id in self.leafs_dict:
                 rop.leafs_dict[child_copy.tree_node_id] = child_copy
         # TODO (hme): How do we properly copy random state?
+
+        rop.tree_node_meta = self.tree_node_meta
+        rop.dense_kernel = self.dense_kernel
         return rop

     def add_child(self, child: TreeNode):
@@ -215,7 +219,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> TreeNode:
         assert isinstance(left, Leaf) and isinstance(right, Leaf)
         result = self._collapse(device, left, right)
         new_leaf: Leaf = result[0]
-        new_block: Block = result[1]
+        new_block: BlockBase = result[1]

         # Update action leaf queue.
         assert set(leaf_ids) == {self.action_leaf_q.pop(0), self.action_leaf_q.pop(0)}
@@ -231,7 +235,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> TreeNode:
         # Update cluster state with new block.
# self.cluster_state.add_block(new_block.id, new_block.size(), [device]) self.cluster_state.add_block( - new_block.id, new_leaf.tree_node_size.nbytes, [device] + new_block.id, new_leaf.tree_node_meta.nbytes, [device] ) if not self.cluster_state.created_on_only: assert self.cluster_state.blocks_local(left.block.id, right.block.id) @@ -257,30 +261,42 @@ def execute_on(self, device: Device, leaf_ids=None) -> TreeNode: return self def _collapse(self, device: Device, left: Leaf, right: Leaf): - lblock: Block = left.block - rblock: Block = right.block + lblock: BlockBase = left.block + rblock: BlockBase = right.block if self.op_name == "matmul": raise ValueError("matmul is not a supported reduction operator.") op_name, args = self.op_name, {} assert lblock.shape == rblock.shape - block: Block = lblock.copy() + block: BlockBase = lblock.copy() block.transposed = False block.dtype = array_utils.get_reduce_output_type(self.op_name, lblock.dtype) - block.oid = lblock._km.bop_reduce( - op_name, - lblock.oid, - rblock.oid, - lblock.transposed, - rblock.transposed, - syskwargs={"device": device}, - ) + if self.dense_kernel: + assert lblock.is_dense and rblock.is_dense + block.oid = lblock.km.bop_reduce( + op_name, + lblock.oid, + rblock.oid, + lblock.transposed, + rblock.transposed, + syskwargs={"device": device}, + ) + else: + assert not lblock.is_dense or not rblock.is_dense + block.oid = lblock.km.sparse_bop_reduce( + op_name, + lblock.oid, + rblock.oid, + lblock.transposed, + rblock.transposed, + syskwargs={"device": device}, + ) block._device = device leaf: Leaf = Leaf(self.cluster_state) leaf.block = block - leaf.tree_node_size = left.tree_node_size.bop( + leaf.tree_node_meta = left.tree_node_meta.bop( op_name, - right.tree_node_size, + right.tree_node_meta, **args, ) leaf.copy_on_op = self.copy_on_op @@ -294,22 +310,22 @@ def _mem_cost(self, leafs): shape = None for leaf in leafs: assert leaf.tree_node_id in self.leafs_dict - leaf_block: Block = leaf.block + leaf_block: BlockBase = leaf.block if shape is None: shape = leaf_block.shape else: assert leaf_block.shape == shape - leaf_block: Block = leafs[0].block + leaf_block: BlockBase = leafs[0].block if leaf_block.is_dense: return leaf_block.size() if sync_nnz > 1: - leafs[0].tree_node_size.nnz = leafs[0].block.nnz # Blocking fetch - leafs[1].tree_node_size.nnz = leafs[1].block.nnz # Blocking fetch + leafs[0].tree_node_meta.nnz = leafs[0].block.nnz # Blocking fetch + leafs[1].tree_node_meta.nnz = leafs[1].block.nnz # Blocking fetch return ( leafs[0] - .tree_node_size.bop( + .tree_node_meta.bop( self.op_name, - leafs[1].tree_node_size, + leafs[1].tree_node_meta, ) .nbytes ) @@ -350,10 +366,14 @@ def expression(self): # This will force a different hash for large fused reductions, # if this is, for whatever reason, needed. 
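+        # Including dense_kernel below keeps sparse and dense reductions from
+        # sharing a fused-kernel hash.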
size = len(self.children_dict) - self._expression = "TreeReductionOp(op=%s, size=%s, id=%s)" % ( - self.op_name, - str(size), - self.tree_node_id, + self._expression = ( + "TreeReductionOp(op=%s, size=%s, id=%s, dense_kernel=%s)" + % ( + self.op_name, + str(size), + self.tree_node_id, + self.dense_kernel, + ) ) return self._expression diff --git a/nums/numpy/api/creation.py b/nums/numpy/api/creation.py index a76e815c..5257e38b 100644 --- a/nums/numpy/api/creation.py +++ b/nums/numpy/api/creation.py @@ -227,7 +227,7 @@ def array(object, dtype=None, copy=True, order="K", ndmin=0, subok=False) -> Blo def from_coo(a: sparse.COO): - assert(isinstance(a, sparse.COO)) + assert isinstance(a, sparse.COO) dtype = np.__getattribute__(str(a.dtype)) shape = a.shape app = _instance() diff --git a/nums/numpy/random.py b/nums/numpy/random.py index 4f031df5..edd7d6a4 100644 --- a/nums/numpy/random.py +++ b/nums/numpy/random.py @@ -79,25 +79,20 @@ def permutation(self, x): arr_perm = self.rs().permutation(shape[0], block_shape[0]).get() return x[arr_perm] - def sparse_randn(self, density, fill_value, *shape): + def sparse_randn(self, density, *shape): shape, block_shape = self._get_shapes(shape, _np.float64) return self.rs().sparse_normal( shape=shape, block_shape=block_shape, p=density, - fill_value=fill_value, ) - def sparse_randint( - self, low, high=None, size=None, dtype=int, density=0.01, fill_value=0 - ): + def sparse_randint(self, low, high=None, size=None, dtype=int, density=0.01): if high is None: high = low low = 0 shape, block_shape = self._get_shapes(size, dtype) - return self.rs().sparse_randint( - low, high, dtype, shape, block_shape, density, fill_value - ) + return self.rs().sparse_randint(low, high, dtype, shape, block_shape, density) # Default imp. 
diff --git a/tests/core/array/test_bop.py b/tests/core/array/test_bop.py index d14da1b0..166c4a8e 100644 --- a/tests/core/array/test_bop.py +++ b/tests/core/array/test_bop.py @@ -69,7 +69,7 @@ def test_tensordot_basic(app_inst: ArrayApplication): shape = 2, 4, 10, 15 npX = np.arange(np.product(shape)).reshape(*shape) rX = app_inst.array(npX, block_shape=(1, 2, 10, 3)) - rResult = rX.T.tensordot(rX, axes=1) + rResult = rX.tensordot(rX.T, rX, axes=1) assert np.allclose(rResult.get(), (np.tensordot(npX.T, npX, axes=1))) common.check_block_integrity(rResult) @@ -93,7 +93,7 @@ def test_tensordot_large_shape(app_inst: ArrayApplication): block_a = app_inst.array(a, block_shape=(30, 5, 3, 2)) block_b = app_inst.array(b, block_shape=(2, 3, 5, 25)) - block_c = block_a.tensordot(block_b, axes=1) + block_c = block_a.tensordot(block_a, block_b, axes=1) assert np.allclose(block_c.get(), c) common.check_block_integrity(block_c) @@ -129,7 +129,7 @@ def test_tensordot_all_shapes(app_inst: ArrayApplication): ) block_a = app_inst.array(a, block_shape=a_block_shape) block_b = app_inst.array(b, block_shape=b_block_shape) - block_c = block_a.tensordot(block_b, axes=axes) + block_c = block_a.tensordot(block_a, block_b, axes=axes) assert np.allclose(block_c.get(), c) common.check_block_integrity(block_c) diff --git a/tests/core/array/test_sparse.py b/tests/core/array/test_sparse.py index 4ef9bc06..b03c8472 100644 --- a/tests/core/array/test_sparse.py +++ b/tests/core/array/test_sparse.py @@ -1,9 +1,7 @@ import numpy as np import sparse -import pytest from nums.core.array.application import ArrayApplication -from nums.core.array.blockarray import BlockArray from nums.core.array.sparse import SparseBlockArray from nums.core.array.random import NumsRandomState @@ -11,9 +9,8 @@ def test_sparse_init(app_inst: ArrayApplication): x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) x_ba = app_inst.array(x1, block_shape=(2, 2)) - x_sba = SparseBlockArray.from_ba(x_ba, fill_value=0) + x_sba = SparseBlockArray.from_ba(x_ba) assert x_sba.nnz == 8 - assert x_sba.nbytes == 8 * 4 + 2 * 8 * 8 y_ba = x_sba.to_ba() assert np.array_equal(x1, y_ba.get()) @@ -21,12 +18,11 @@ def test_sparse_init(app_inst: ArrayApplication): def test_from_coo(app_inst: ArrayApplication): row_coords = [0, 1, 0, 1, 2, 2, 3, 3] col_coords = [3, 2, 2, 3, 0, 1, 0, 1] - values = [1, 1, 1, 1, 2, 2, 2, 2] + values = [1, 1, 1, 1, 2, 2, 2, 2] x_sp = sparse.COO([row_coords, col_coords], values) x_de = x_sp.todense() x_sba = app_inst.array(x_sp, block_shape=(2, 2)) assert x_sba.nnz == 8 - assert x_sba.nbytes == 8 * 4 + 2 * 8 * 8 y_ba = x_sba.to_ba() assert np.array_equal(x_de, y_ba.get()) @@ -34,24 +30,26 @@ def test_from_coo(app_inst: ArrayApplication): def test_sparse_random(app_inst: ArrayApplication): rs: NumsRandomState = app_inst.random_state(1337) x_sba = rs.sparse_randint( - 1, high=5, dtype=int, shape=(15, 10), block_shape=(5, 5), p=0.1, fill_value=0 + 1, + high=5, + dtype=np.int64, + shape=(100, 50), + block_shape=(50, 50), + p=0.1, ) x_ba = x_sba.to_ba() x_np = x_ba.get() x_sp = sparse.COO.from_numpy(x_np, fill_value=0) assert x_sba.nnz == x_sp.nnz - assert x_sba.nbytes == x_sp.nbytes - print(x_np) def test_sparse_uop(app_inst: ArrayApplication): x = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) - x_sp = sparse.COO.from_numpy(x, fill_value=1) + x_sp = sparse.COO.from_numpy(x, fill_value=0) x_sp = sparse.elemwise(np.negative, x_sp) x_ba = app_inst.array(x, block_shape=(2, 2)) - x_sba = 
SparseBlockArray.from_ba(x_ba, fill_value=1) + x_sba = SparseBlockArray.from_ba(x_ba) y_sba = -x_sba - assert y_sba.fill_value == x_sp.fill_value # -1 assert y_sba.nnz == x_sp.nnz # 12 y_ba = y_sba.to_ba() assert np.array_equal(np.negative(x), y_ba.get()) @@ -60,44 +58,46 @@ def test_sparse_uop(app_inst: ArrayApplication): def test_sparse_add(app_inst: ArrayApplication): x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) x2 = x1 * 2 - x1_sp = sparse.COO.from_numpy(x1, fill_value=2) - x2_sp = sparse.COO.from_numpy(x2, fill_value=2) + x1_sp = sparse.COO.from_numpy(x1, fill_value=0) + x2_sp = sparse.COO.from_numpy(x2, fill_value=0) x1_ba = app_inst.array(x1, block_shape=(2, 2)) x2_ba = app_inst.array(x2, block_shape=(2, 2)) - x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=2) - x2_sba = SparseBlockArray.from_ba(x2_ba, fill_value=2) + x1_sba = SparseBlockArray.from_ba(x1_ba) + x2_sba = SparseBlockArray.from_ba(x2_ba) y_sp = x1_sp + x2_sp y_sba = x1_sba + x2_sba - assert y_sba.fill_value == y_sp.fill_value # 4 assert y_sba.nnz == y_sp.nnz # 16 y_ba = y_sba.to_ba() assert np.array_equal(x1 + x2, y_ba.get()) - # Test sparse-dense. + # Test dense-sparse. y_ba = x2_ba + x1_sba # __radd__ assert np.array_equal(x2 + x1_sp, y_ba.get()) + y_ba = x1_sba - x2_ba # __sub__ + assert np.array_equal(x1_sp - x2, y_ba.get()) + y_ba = x2_ba - x1_sba # __rsub__ + assert np.array_equal(x2 - x1_sp, y_ba.get()) # Test sparse-scalar. - y_sp = x1_sp - 1 - y_sba = x1_sba - 1 - assert y_sba.fill_value == y_sp.fill_value # 4 - assert y_sba.nnz == y_sp.nnz # 16 - y_ba = y_sba.to_ba() - assert np.array_equal(x1 - 1, y_ba.get()) + y = (x1_sp - 1).todense() # __sub__ + y_ba = x1_sba - 1 + assert np.array_equal(y, y_ba.get()) + y = (1 - x1_sp).todense() # __rsub__ + y_ba = 1 - x1_sba + assert np.array_equal(y, y_ba.get()) def test_sparse_mul(app_inst: ArrayApplication): x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) x2 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]) - x1_sp = sparse.COO.from_numpy(x1, fill_value=2) - x2_sp = sparse.COO.from_numpy(x2, fill_value=2) + x1_sp = sparse.COO.from_numpy(x1, fill_value=0) + x2_sp = sparse.COO.from_numpy(x2, fill_value=0) x1_ba = app_inst.array(x1, block_shape=(2, 2)) x2_ba = app_inst.array(x2, block_shape=(2, 2)) - x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=2) - x2_sba = SparseBlockArray.from_ba(x2_ba, fill_value=2) + x1_sba = SparseBlockArray.from_ba(x1_ba) + x2_sba = SparseBlockArray.from_ba(x2_ba) y_sp = x1_sp * x2_sp y_sba = x1_sba * x2_sba - assert y_sba.fill_value == y_sp.fill_value # 4 assert y_sba.nnz == y_sp.nnz # 16 y_ba = y_sba.to_ba() assert np.array_equal(x1 * x2, y_ba.get()) @@ -105,10 +105,10 @@ def test_sparse_mul(app_inst: ArrayApplication): # Test sparse-dense. 
rs: NumsRandomState = app_inst.random_state(1337) x1_sba = rs.sparse_randint( - 1, high=5, dtype=int, shape=(100, 50), block_shape=(5, 5), p=0.1, fill_value=0 + 1, high=5, dtype=int, shape=(100, 50), block_shape=(5, 5), p=0.1 ) x2_sba = rs.sparse_randint( - 1, high=5, dtype=int, shape=(1, 50), block_shape=(5, 5), p=0.1, fill_value=0 + 1, high=5, dtype=int, shape=(1, 50), block_shape=(5, 5), p=0.1 ) x1_ba = x1_sba.to_ba() x2_ba = x2_sba.to_ba() @@ -118,7 +118,6 @@ def test_sparse_mul(app_inst: ArrayApplication): x2_sp = sparse.COO.from_numpy(x2, fill_value=0) y_sp = x1_sp * x2 y_sba = x2_ba * x1_sba # __rmul__ - assert y_sba.fill_value == y_sp.fill_value assert y_sba.nnz == y_sp.nnz y_ba = y_sba.to_ba() assert np.array_equal(x1 * x2, y_ba.get()) @@ -126,33 +125,34 @@ def test_sparse_mul(app_inst: ArrayApplication): def test_neq(app_inst: ArrayApplication): x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) - x1_sp = sparse.COO.from_numpy(x1, fill_value=2) + x1_sp = sparse.COO.from_numpy(x1, fill_value=0) x1_ba = app_inst.array(x1, block_shape=(2, 2)) - x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=2) + x1_sba = SparseBlockArray.from_ba(x1_ba) y_sp = x1_sp > 0 y_sba = x1_sba > 0 - assert y_sba.fill_value == y_sp.fill_value # True assert y_sba.nnz == y_sp.nnz # 8 (nnz changes!) y_ba = y_sba.to_ba() assert np.array_equal(x1 > 0, y_ba.get()) def test_tensordot(app_inst: ArrayApplication): - x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) + x1 = np.array([[0, 0, 1, 2], [0, 0, 3, 4], [5, 6, 0, 0], [7, 8, 0, 0]]) x2 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]) x1_sp = sparse.COO.from_numpy(x1, fill_value=0) x2_sp = sparse.COO.from_numpy(x2, fill_value=0) x1_ba = app_inst.array(x1, block_shape=(2, 2)) x2_ba = app_inst.array(x2, block_shape=(2, 2)) - x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=0) - x2_sba = SparseBlockArray.from_ba(x2_ba, fill_value=0) + x1_sba = SparseBlockArray.from_ba(x1_ba) + x2_sba = SparseBlockArray.from_ba(x2_ba) y_sp = sparse.tensordot(x1_sp, x2_sp, axes=1) - y_sba = x1_sba.tensordot(x2_sba, axes=1) - assert y_sba.fill_value == y_sp.fill_value # 0 + y_sba = x1_sba @ x2_sba assert y_sba.nnz == y_sp.nnz # y_ba = y_sba.to_ba() assert np.array_equal(np.tensordot(x1, x2, axes=1), y_ba.get()) + y_ba = x1_ba @ x2_sba # __rmatmul__ + assert np.array_equal(np.tensordot(x1, x2, axes=1), y_ba.get()) + def test_sdtp(app_inst: ArrayApplication): shape = 50, 50, 50 diff --git a/tests/experimental/optimizer/test_copy.py b/tests/experimental/optimizer/test_copy.py index 5de7b42a..ea31aac6 100644 --- a/tests/experimental/optimizer/test_copy.py +++ b/tests/experimental/optimizer/test_copy.py @@ -88,7 +88,7 @@ def tensordot( cluster_state = ClusterState(lhs.km.devices()) lhs_ga: GraphArray = GraphArray.from_ba(lhs, cluster_state, copy_on_op=copy_on_op) rhs_ga: GraphArray = GraphArray.from_ba(rhs, cluster_state, copy_on_op=copy_on_op) - return lhs_ga.tensordot(rhs_ga, axes=axes) + return GraphArray.tensordot(lhs_ga, rhs_ga, axes=axes) def optimized_tensordot( diff --git a/tests/experimental/optimizer/test_fusion.py b/tests/experimental/optimizer/test_fusion.py index 7661fdae..4ced2e78 100644 --- a/tests/experimental/optimizer/test_fusion.py +++ b/tests/experimental/optimizer/test_fusion.py @@ -26,12 +26,14 @@ from nums.core.array.application import BlockArray from nums.core.array.base import BlockArrayBase +from nums.core.array.sparse import SparseBlockArray from nums.experimental.optimizer.clusterstate import 
ClusterState from nums.experimental.optimizer.grapharray import ( GraphArray, ) from nums.experimental.optimizer.tree_search import RandomTS from nums.experimental.optimizer.fusion import FuseGraph +from nums.experimental.optimizer.fusion_utils import print_graph from nums.experimental.optimizer.graph import TreeNode, Leaf, FunctionNode import conftest @@ -45,12 +47,8 @@ def fusion2(app, x, y): return x @ y -def fusion3(app, s, q, p): - return s * (q @ p.T) - - def ga_op( - app, func, x: BlockArray, y: BlockArray, copy_on_op=True, max_args=2 + app, func, x: BlockArrayBase, y: BlockArrayBase, copy_on_op=True, max_args=2 ) -> BlockArray: cluster_state: ClusterState = ClusterState(x.km.devices()) x_ga: GraphArray = GraphArray.from_ba(x, cluster_state, copy_on_op=copy_on_op) @@ -69,27 +67,6 @@ def ga_op( return BlockArray(result_ga.grid, x.km, result_ga.to_blocks()) -def ga_op_sampled_dense_dense( - app, func, s: BlockArray, p: BlockArray, q: BlockArray, copy_on_op=True, max_args=3 -) -> BlockArray: - cluster_state: ClusterState = ClusterState(s.km.devices()) - s_ga: GraphArray = GraphArray.from_ba(s, cluster_state, copy_on_op=copy_on_op) - p_ga: GraphArray = GraphArray.from_ba(p, cluster_state, copy_on_op=copy_on_op) - q_ga: GraphArray = GraphArray.from_ba(q, cluster_state, copy_on_op=copy_on_op) - op_ga: GraphArray = func(app, s_ga, p_ga, q_ga) - start_time = time.time() - fused_ga: GraphArray = op_ga.compile(max_args) - end_time = time.time() - result_ga: GraphArray = RandomTS( - seed=conftest.rs, - max_samples_per_step=1, - max_reduction_pairs=1, - force_final_action=True, - ).solve(fused_ga) - - return BlockArray(result_ga.grid, s.km, result_ga.to_blocks()) - - def test_fusion(app_inst_mock_none): app = app_inst_mock_none x_shape, x_block_shape = (10,), (5,) @@ -140,24 +117,132 @@ def test_tensordot(app_inst_mock_none): assert block.dtype == opt_z.dtype -def test_sparse_array(app_inst_mock_none): +def ga_op_sparse_2( + app, func, x: BlockArrayBase, y: BlockArrayBase, copy_on_op=True, max_args=2 +) -> SparseBlockArray: + cluster_state: ClusterState = ClusterState(x.km.devices()) + x_ga: GraphArray = GraphArray.from_ba(x, cluster_state, copy_on_op=copy_on_op) + y_ga: GraphArray = GraphArray.from_ba(y, cluster_state, copy_on_op=copy_on_op) + op_ga: GraphArray = func(app, x_ga, y_ga) + start_time = time.time() + fused_ga: GraphArray = op_ga.compile(max_args) + end_time = time.time() + result_ga: GraphArray = RandomTS( + seed=conftest.rs, + max_samples_per_step=1, + max_reduction_pairs=1, + force_final_action=True, + ).solve(fused_ga) + + return SparseBlockArray(result_ga.grid, x.km, result_ga.to_blocks()) + + +def spmm(app, p, q): + return p @ q + + +def test_spmm(app_inst_mock_none): app = app_inst_mock_none - q_shape, q_block_shape = (10, 2), (2, 2) - p_shape, p_block_shape = (20, 2), (2, 2) - s_shape, s_block_shape = (10, 20), (2, 2) - real_q = np.random.random(np.product(q_shape)).reshape(q_shape) + p_shape, p_block_shape = (10, 4), (2, 2) + q_shape, q_block_shape = (4, 10), (2, 2) + p: SparseBlockArray = app.random.sparse_normal( + shape=p_shape, block_shape=p_block_shape, p=0.1 + ) + q: SparseBlockArray = app.random.sparse_normal( + shape=q_shape, block_shape=q_block_shape, p=0.1 + ) + real_p = p.to_ba().get() + real_q = q.to_ba().get() + z: SparseBlockArray = spmm(app, p, q) + start_time = time.time() + opt_z: SparseBlockArray = ga_op_sparse_2(app, spmm, p, q) + end_time = time.time() + print(end_time - start_time) + assert z.nnz == opt_z.nnz + assert np.allclose(z.to_ba().get(), 
spmm(np, real_p, real_q)) + assert app.allclose(z.to_ba(), opt_z.to_ba()).get() + + +def ga_op_sparse_3( + app, + func, + s: BlockArrayBase, + p: BlockArrayBase, + q: BlockArrayBase, + copy_on_op=True, + max_args=3, +) -> SparseBlockArray: + cluster_state: ClusterState = ClusterState(s.km.devices()) + s_ga: GraphArray = GraphArray.from_ba(s, cluster_state, copy_on_op=copy_on_op) + p_ga: GraphArray = GraphArray.from_ba(p, cluster_state, copy_on_op=copy_on_op) + q_ga: GraphArray = GraphArray.from_ba(q, cluster_state, copy_on_op=copy_on_op) + op_ga: GraphArray = func(app, s_ga, p_ga, q_ga) + start_time = time.time() + fused_ga: GraphArray = op_ga.compile(max_args) + end_time = time.time() + print(fused_ga.graphs[0, 0]) + result_ga: GraphArray = RandomTS( + seed=conftest.rs, + max_samples_per_step=1, + max_reduction_pairs=1, + force_final_action=True, + ).solve(fused_ga) + + return SparseBlockArray(result_ga.grid, s.km, result_ga.to_blocks()) + + +def sparse_fusion(app, s, p, q): + return (s + s) * p * q + + +def test_sparse_fusion(app_inst_mock_none): + app = app_inst_mock_none + s_shape, s_block_shape = (10, 4), (2, 2) + p_shape, p_block_shape = (10, 4), (2, 2) + q_shape, q_block_shape = (10, 4), (2, 2) real_p = np.random.random(np.product(p_shape)).reshape(p_shape) - real_s = np.random.random(np.product(s_shape)).reshape(s_shape) + real_q = np.random.random(np.product(q_shape)).reshape(q_shape) + s: SparseBlockArray = app.random.sparse_normal( + shape=s_shape, block_shape=s_block_shape, p=0.1 + ) + p: BlockArray = app.array(real_p, p_block_shape) q: BlockArray = app.array(real_q, q_block_shape) + real_s = s.to_ba().get() + z: SparseBlockArray = sparse_fusion(app, s, p, q) + start_time = time.time() + opt_z: SparseBlockArray = ga_op_sparse_3(app, sparse_fusion, s, p, q) + end_time = time.time() + print(end_time - start_time) + assert z.nnz == opt_z.nnz + assert np.allclose(z.to_ba().get(), sparse_fusion(np, real_s, real_p, real_q)) + assert app.allclose(z.to_ba(), opt_z.to_ba()).get() + + +def sddmm(app, s, p, q): + return s * (p @ q.T) + + +def test_sddmm(app_inst_mock_none): + app = app_inst_mock_none + s_shape, s_block_shape = (20, 10), (2, 2) + p_shape, p_block_shape = (20, 4), (2, 2) + q_shape, q_block_shape = (10, 4), (2, 2) + real_p = np.random.random(np.product(p_shape)).reshape(p_shape) + real_q = np.random.random(np.product(q_shape)).reshape(q_shape) + s: SparseBlockArray = app.random.sparse_normal( + shape=s_shape, block_shape=s_block_shape, p=0.1 + ) p: BlockArray = app.array(real_p, p_block_shape) - s: BlockArray = app.array(real_s, s_block_shape) - z: BlockArray = fusion3(app, s, q, p) + q: BlockArray = app.array(real_q, q_block_shape) + real_s = s.to_ba().get() + z: SparseBlockArray = sddmm(app, s, p, q) start_time = time.time() - opt_z: BlockArray = ga_op_sampled_dense_dense(app, fusion3, s, q, p) + opt_z: SparseBlockArray = ga_op_sparse_3(app, sddmm, s, p, q) end_time = time.time() print(end_time - start_time) - assert np.allclose(z.get(), fusion3(np, real_s, real_q, real_p)) - assert app.allclose(z, opt_z).get() + assert z.nnz == opt_z.nnz + assert np.allclose(z.to_ba().get(), sddmm(np, real_s, real_p, real_q)) + assert app.allclose(z.to_ba(), opt_z.to_ba()).get() if __name__ == "__main__": @@ -165,8 +250,8 @@ def test_sparse_array(app_inst_mock_none): app = conftest.mock_ray_cluster((1, 1)) # test_sparse_array(app) - # test_fusion(app) - test_tensordot_variant2(app) + test_fusion(app) + # test_tensordot_variant2(app) conftest.destroy_mock_cluster(app) # app = 
conftest.mock_cluster((10, 1)) diff --git a/tests/experimental/optimizer/test_size.py b/tests/experimental/optimizer/test_meta.py similarity index 71% rename from tests/experimental/optimizer/test_size.py rename to tests/experimental/optimizer/test_meta.py index a5606c6b..2b06ba78 100644 --- a/tests/experimental/optimizer/test_size.py +++ b/tests/experimental/optimizer/test_meta.py @@ -1,98 +1,99 @@ import numpy as np import sparse -from nums.experimental.optimizer.size import TreeNodeSize +from nums.experimental.optimizer.node_meta import LeafMeta def test_nbytes(): x1 = np.eye(10) x1_sp = sparse.COO.from_numpy(x1, fill_value=0) - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, - 0, + False, ) assert x1_sp.nbytes == x1_ts.nbytes def test_uop(): - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, - 1, + False, ) y_ts = x1_ts.uop("negative") assert np.allclose(y_ts.nnz, x1_ts.nnz) - assert y_ts.fill_value == -1 + assert not y_ts.is_dense def test_add(): - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, - 1, + False, ) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 10), 20, np.int64, - 2, + False, ) y_ts = x1_ts + x2_ts assert np.allclose(y_ts.nnz, int((1 - 0.9 * 0.8) * 100)) - assert y_ts.fill_value == 3 + assert not y_ts.is_dense def test_mul(): - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, - 0, + False, ) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 10), 20, np.int64, - 0, + False, ) y_ts = x1_ts * x2_ts assert np.allclose(y_ts.nnz, int(0.1 * 0.2 * 100)) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 1), 10, np.int64, - 1, + False, ) y_ts = x1_ts * x2_ts assert np.allclose(y_ts.nnz, int(0.1 * 100)) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 1), 10, np.int64, - None, + True, ) y_ts = x1_ts * x2_ts assert np.allclose(y_ts.nnz, int(0.1 * 100)) + assert not y_ts.is_dense def test_tensordot(): - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, - 0, + False, ) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 10), 20, np.int64, - 0, + False, ) y_ts = x1_ts.tensordot(x2_ts, axes=1) assert np.allclose(y_ts.nnz, int((1 - (1 - 0.1 * 0.2) ** 10) * 100)) diff --git a/tests/experimental/optimizer/test_ops.py b/tests/experimental/optimizer/test_ops.py index 2098a692..156c9651 100644 --- a/tests/experimental/optimizer/test_ops.py +++ b/tests/experimental/optimizer/test_ops.py @@ -200,21 +200,24 @@ def compute_graph_array(app: ArrayApplication, ga) -> BlockArray: app = app_inst_mock_small cluster_state = ClusterState(app.km.devices()) - X = app.random.sparse_normal(shape=(10, 3), block_shape=(5, 3)) + X = app.random.sparse_normal(shape=(10, 6), block_shape=(5, 3), p=0.1) Xc = X - theta: SparseBlockArray = SparseBlockArray.from_ba( - app.zeros((Xc.shape[1],), (Xc.block_shape[1],), dtype=Xc.dtype) - ) + print(Xc.nnz) + theta: BlockArray = app.ones((Xc.shape[1],), (Xc.block_shape[1],), dtype=Xc.dtype) X_ga: GraphArray = GraphArray.from_ba(Xc, cluster_state) theta_ga: GraphArray = GraphArray.from_ba(theta, cluster_state) Z_ga: GraphArray = X_ga @ theta_ga Z_ga: GraphArray = collapse_graph_array(app, Z_ga) + # one_ga: GraphArray = GraphArray.from_ba( + # BlockArray.from_scalar(1, app.km), cluster_state + # ) one_ga: GraphArray = GraphArray.from_ba( - SparseBlockArray.from_scalar(1, app.km), cluster_state + app.ones((Xc.shape[0],), (Xc.block_shape[0],), dtype=Xc.dtype), cluster_state ) mu_ga: GraphArray = collapse_graph_array(app, one_ga / (one_ga + app.exp(-Z_ga))) - mu_ba: SparseBlockArray = 
compute_graph_array(app, mu_ga) - print(mu_ba.todense().get()) + mu_ba: BlockArray = compute_graph_array(app, mu_ga) + print(mu_ba.get()) + assert np.allclose(1 / (1 + np.exp(-(X.to_ba().get() @ theta.get()))), mu_ba.get()) def test_sparse_bop_2(app_inst_mock_small): @@ -256,7 +259,9 @@ def compute_graph_array(app: ArrayApplication, ga) -> BlockArray: P = P_sba.to_ba().get() Q = Q_sba.to_ba().get() S = S_sba.to_ba().get() - assert np.allclose(S - P @ Q.T, X_sba.to_ba().get()) + X = X_sba.to_ba().get() + print(X) + assert np.allclose(S - P @ Q.T, X) if __name__ == "__main__": diff --git a/tests/experimental/optimizer/test_tensordot.py b/tests/experimental/optimizer/test_tensordot.py index 7657b898..c06ae1ae 100644 --- a/tests/experimental/optimizer/test_tensordot.py +++ b/tests/experimental/optimizer/test_tensordot.py @@ -41,7 +41,7 @@ def optimized_tensordot( cluster_state: ClusterState = ClusterState(lhs.km.devices()) lhs_ga: GraphArray = GraphArray.from_ba(lhs, cluster_state, copy_on_op=copy_on_op) rhs_ga: GraphArray = GraphArray.from_ba(rhs, cluster_state, copy_on_op=copy_on_op) - tensordot_ga = lhs_ga.tensordot(rhs_ga, axes=axes) + tensordot_ga = GraphArray.tensordot(lhs_ga, rhs_ga, axes=axes) global random_state print("*" * 50) print("op grid shape", tensordot_ga.grid.grid_shape) @@ -119,7 +119,7 @@ def test_load_sqr(app_inst_mock_big): cluster_state: ClusterState = ClusterState(app.km.devices()) lhs_ga: GraphArray = GraphArray.from_ba(lhs, cluster_state) rhs_ga: GraphArray = GraphArray.from_ba(rhs, cluster_state) - tensordot_ga = lhs_ga.tensordot(rhs_ga, axes=axes) + tensordot_ga = GraphArray.tensordot(lhs_ga, rhs_ga, axes=axes) mem_diff = max(cluster_state.resources[0]) - min(cluster_state.resources[0]) net_in_diff = max(cluster_state.resources[1]) - min(cluster_state.resources[1]) @@ -163,7 +163,7 @@ def test_load_single_block_rhs(app_inst_mock_big): cluster_state: ClusterState = ClusterState(app.km.devices()) lhs_ga: GraphArray = GraphArray.from_ba(lhs, cluster_state) rhs_ga: GraphArray = GraphArray.from_ba(rhs, cluster_state) - tensordot_ga = lhs_ga.tensordot(rhs_ga, axes=axes) + tensordot_ga = GraphArray.tensordot(lhs_ga, rhs_ga, axes=axes) print("memory", cluster_state.resources[0]) print("net_in", cluster_state.resources[1])