Source code for mygrad.tensor_base

"""
This module defines the base tensor class along with all of its essential
attributes and special methods. Public math methods, e.g. ``sum``, ``mean``,
etc., are bound to the Tensor class in ``mygrad.__init__.py``.
"""
from collections import deque
from numbers import Integral, Number
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Deque,
    Dict,
    Iterator,
    List,
    Optional,
    Sequence,
    Set,
    Tuple,
    Type,
    TypeVar,
    Union,
)
from weakref import ReferenceType, finalize

import numpy as np

import mygrad._utils.duplicating_graph as _dup
import mygrad._utils.graph_tracking as _track
import mygrad._utils.lock_management as _mem
from mygrad._tensor_core_ops.indexing import GetItem, SetItem
from mygrad._utils import WeakRef, WeakRefIterable, collect_all_tensors_and_clear_grads
from mygrad.errors import DisconnectedView
from mygrad.math.arithmetic.ops import (
    Add,
    Divide,
    Multiply,
    Negative,
    Positive,
    Power,
    Square,
    Subtract,
)
from mygrad.math.misc.ops import MatMul
from mygrad.math.sequential.ops import (
    CumProd,
    CumSum,
    Max,
    Mean,
    Min,
    Prod,
    StdDev,
    Sum,
    Variance,
)
from mygrad.operation_base import Operation, _NoValue
from mygrad.tensor_manip.array_shape.ops import Flatten, Ravel, Reshape, Squeeze
from mygrad.tensor_manip.transpose_like.ops import (
    MoveAxis,
    SwapAxes,
    Tensor_Transpose_Property,
    Transpose,
)
from mygrad.typing import ArrayLike, DTypeLike, DTypeLikeReals, Index, Shape

__all__ = ["Tensor", "asarray", "astensor", "implements_numpy_override"]

if TYPE_CHECKING:  # pragma: no cover
    from mygrad.ufuncs._ufunc_creators import ufunc as mygrad_ufunc


T = TypeVar("T")

CONSTANT_ONLY_DTYPES = (np.integer, np.bool_)


def _resolve_constant(*others: Any, constant: Optional[bool]) -> Optional[bool]:
    """Determines if `constant` should be resolved to True based on `others`.
    Otherwise defers to a tensor-creator to handle further resolutions based on dtype.
    """
    if constant is not None:
        return constant
    for other in others:
        if isinstance(other, Tensor) and not other.constant:
            # let subsequent tensor casting infer constant from dtype
            return None
    # all inputs are constants
    return True
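
# A minimal sketch (the helper name is hypothetical and not part of MyGrad's API)
# summarizing the resolution rules implemented above: an explicit ``constant`` always
# wins; otherwise the presence of any variable (non-constant) tensor defers the
# decision to dtype-based inference performed by the tensor-creator.
def _sketch_resolve_constant():
    assert _resolve_constant(1, 2.0, constant=None) is True  # only constants involved
    assert _resolve_constant(Tensor(1.0), constant=None) is None  # defer to dtype inference
    assert _resolve_constant(Tensor(1.0), constant=False) is False  # explicit value wins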


def asarray(a: ArrayLike, dtype: DTypeLike = None, order: str = None) -> np.ndarray:
    """Convert the input to an array.

    This docstring is adapted from that of ``numpy.asarray``

    Parameters
    ----------
    a : array_like
        Input data, in any form - including a mygrad tensor - that can be converted
        to an array. This includes lists, lists of tuples, tuples, tuples of tuples,
        tuples of lists and ndarrays.

    dtype : data-type, optional
        By default, the data-type is inferred from the input data.

    order : {'C', 'F'}, optional
        Whether to use row-major (C-style) or column-major (Fortran-style) memory
        representation. Defaults to 'C'.

    Returns
    -------
    out : ndarray
        Array interpretation of `a`. No copy is performed if the input is already
        an ndarray with matching dtype and order. If `a` is a subclass of ndarray,
        a base class ndarray is returned.

    Examples
    --------
    Convert a list into an array:

    >>> import mygrad as mg
    >>> a = [1, 2]
    >>> mg.asarray(a)
    array([1, 2])

    Convert a tensor into an array. No copy of the underlying numpy array is
    created:

    >>> t = mg.Tensor([1, 2.])
    >>> mg.asarray(t)
    array([1., 2.])
    >>> t.data is np.asarray(t)
    True

    Existing arrays are not copied:

    >>> a = np.array([1, 2])
    >>> mg.asarray(a) is a
    True

    If `dtype` is set, array is copied only if dtype does not match:

    >>> a = np.array([1, 2], dtype=np.float32)
    >>> mg.asarray(a, dtype=np.float32) is a
    True
    >>> mg.asarray(a, dtype=np.float64) is a
    False

    Contrary to `asanyarray`, ndarray subclasses are not passed through:

    >>> issubclass(np.recarray, np.ndarray)
    True
    >>> a = np.array([(1.0, 2), (3.0, 4)], dtype='f4,i4').view(np.recarray)
    >>> mg.asarray(a) is a
    False
    >>> np.asanyarray(a) is a
    True
    """
    if isinstance(a, Tensor):
        a = a.data  # faster than passing the tensor directly
    return np.asarray(a, dtype=dtype, order=order)

def tensor(
    arr_like: ArrayLike,
    dtype: DTypeLikeReals = None,
    *,
    constant: Optional[bool] = None,
    copy: bool = True,
    ndmin: int = 0,
) -> "Tensor":
    """
    Create a tensor

    This documentation was adapted from that of ``numpy.array``

    Parameters
    ----------
    arr_like : array_like
        A tensor, any object exposing the array interface, an object whose
        __array__ method returns a tensor, a real number, any (nested) sequence.

    dtype : data-type, optional
        The desired data-type for the tensor. Restricted to integer and float type.
        If not specified, then the type will be determined as the minimum type
        required to hold the objects in the sequence.

    constant : Optional[bool]
        If ``True``, this tensor is treated as a constant, and thus does not
        facilitate back propagation (i.e. ``constant_tensor.grad`` will always
        return ``None``).

        If a new tensor is returned:
         - Defaults to ``False`` for float-type data.
         - Defaults to ``True`` for integer-type data.

    copy : bool, optional
        If true (default), or if a copy is needed to satisfy any of the other
        requirements (``dtype``, ``constant``, etc.), then a new tensor is created
        from copied data. Otherwise the tensor will be returned unchanged.

    ndmin : int, optional
        Specifies the minimum number of dimensions that the resulting
        tensor should have. Ones will be prepended to the shape as
        needed to meet this requirement.

    Returns
    -------
    out : Tensor
        A tensor satisfying the specified requirements.

    See Also
    --------
    empty_like : Return an empty tensor with shape and type of input.
    ones_like : Return a tensor of ones with shape and type of input.
    zeros_like : Return a tensor of zeros with shape and type of input.
    full_like : Return a new tensor with shape of input filled with value.
    empty : Return a new uninitialized tensor.
    ones : Return a new tensor setting values to one.
    zeros : Return a new tensor setting values to zero.
    full : Return a new tensor of given shape filled with value.

    Examples
    --------
    >>> import mygrad as mg
    >>> mg.tensor([1, 2, 3])
    Tensor([1, 2, 3])

    Upcasting:

    >>> mg.tensor([1, 2, 3.0])
    Tensor([ 1., 2., 3.])

    More than one dimension:

    >>> mg.tensor([[1, 2], [3, 4]])
    Tensor([[1, 2],
            [3, 4]])

    Minimum dimensions 2:

    >>> mg.tensor([1, 2, 3], ndmin=2)
    Tensor([[1, 2, 3]])

    Type provided:

    >>> mg.tensor([1, 2, 3], dtype="float32")
    Tensor([1., 2., 3.], dtype=float32)
    """
    if isinstance(arr_like, Tensor) and copy is False:
        if (constant is None or arr_like.constant is constant) and (
            dtype is None or (arr_like.dtype == np.dtype(dtype))
        ):
            if not isinstance(ndmin, Integral):
                raise TypeError(
                    f"TypeError: `ndmin` requires a non-negative integer (got type {type(ndmin)})"
                )
            if ndmin < 0:
                ndmin = 0  # numpy does this

            if ndmin > arr_like.ndim:
                arr_like = arr_like[(*(None for _ in range(ndmin - arr_like.ndim)),)]
            # return tensor as-is
            return arr_like

    return Tensor(arr_like, dtype=dtype, constant=constant, copy=copy, ndmin=ndmin)

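
# A brief, hedged sketch of the ``copy=False`` fast path above (the helper name is
# hypothetical): when the dtype and constant requirements are already satisfied, the
# input tensor is passed through untouched - graph state and all - and ``ndmin`` is
# honored by prepending axes via a view rather than a copy.
def _sketch_tensor_no_copy_fast_path():
    t1 = 2 * tensor([1.0, 2.0])
    t2 = tensor(t1, copy=False)
    assert t2 is t1  # same object; gradient/graph state preserved

    t3 = tensor(t1, copy=False, ndmin=2)
    assert t3.shape == (1, 2)                  # a leading axis is prepended via indexing with ``None``
    assert np.shares_memory(t3.data, t1.data)  # still no copy of the underlying data
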
def astensor(
    t: ArrayLike, dtype: DTypeLikeReals = None, *, constant: Optional[bool] = None
) -> "Tensor":
    """Convert the input to a tensor.

    A tensor `t` is returned unchanged - its gradient and computational
    graph state preserved - if dtype and constant are compatible.
    A copy of the underlying numpy array is created only if dtype is
    incompatible or if a non-constant tensor is being created from a constant.

    Parameters
    ----------
    t : array_like
        Input data, in any form that can be converted to a tensor. This
        includes lists, lists of tuples, tuples, tuples of tuples, tuples
        of lists and ndarrays.

    dtype : data-type, optional
        By default, the data-type is inferred from the input data.

    constant : Optional[bool]
        By default, `constant` is inferred from `t` if `t` is a tensor,
        otherwise it is inferred from the data type:

        Defaults to ``False`` for float-type data.
        Defaults to ``True`` for integer-type data.

        Integer-type tensors must be constant.

    Returns
    -------
    out : Tensor
        Tensor interpretation of `t`. No copy is performed if the input
        is already a tensor with matching dtype and constant-flag.

    Examples
    --------
    Convert a list into a tensor:

    >>> import mygrad as mg
    >>> import numpy as np
    >>> t = [1, 2]
    >>> mg.astensor(t)
    Tensor([1, 2])

    Convert an array into a tensor. No copy of the
    underlying numpy array is created:

    >>> a = np.array([1.0, 2.0])
    >>> mg.astensor(a)
    Tensor([1., 2.])
    >>> a is mg.astensor(a).data
    True

    Existing tensors are not copied and their gradients and
    computational graphs are preserved:

    >>> t1 = 2 * mg.tensor([1, 2])
    >>> t2 = mg.astensor(t1)
    >>> t1 is t2
    True
    >>> t1.creator is t2.creator
    True

    If `dtype` is set, a new tensor is created - with copied data - only
    if dtype does not match:

    >>> t = mg.Tensor([1, 2], dtype=np.float32)
    >>> mg.astensor(t, dtype=np.float32) is t
    True
    >>> mg.astensor(t, dtype=np.float64) is t
    False

    Otherwise, if `constant` is set, a new tensor is created (with no copy
    of the underlying data) only if constant doesn't match.

    >>> t1 = mg.tensor([1.0, 2.0], constant=False)
    >>> mg.astensor(t1, constant=False) is t1
    True
    >>> mg.astensor(t1, constant=True) is t1
    False
    >>> mg.astensor(t1, constant=True).data is t1.data
    True
    """
    return tensor(t, dtype=dtype, constant=constant, copy=False, ndmin=0)

_REGISTERED_UFUNC: Dict[np.ufunc, Type["mygrad_ufunc"]] = {} _REGISTERED_DIFFERENTIABLE_NUMPY_FUNCS: Dict[ Callable[..., np.ndarray], Callable[..., "Tensor"] ] = {} _REGISTERED_BOOL_ONLY_UFUNC: Set[np.ufunc] = { np.isnan, np.isfinite, np.isinf, np.isnat, np.signbit, np.logical_not, np.logical_and, np.logical_or, np.logical_xor, np.greater, np.greater_equal, np.less, np.less_equal, np.equal, np.not_equal, } # These are ufuncs that users might mistake for being differentiable functions; # for this reason we make explicit the fact that only constant tensors are permitted # in these operations. _REGISTERED_CONST_ONLY_UFUNC = { np.floor_divide, np.remainder, np.mod, np.fmod, np.divmod, np.rint, np.sign, np.floor, np.ceil, np.trunc, } _REGISTERED_NO_DIFF_NUMPY_FUNCS: Set[Callable] = { np.allclose, np.bincount, np.can_cast, np.copyto, np.isclose, np.may_share_memory, np.min_scalar_type, np.result_type, np.shares_memory, np.shape, } class implements_numpy_override: """Registers a mygrad-based override for a NumPy function of the same name, via the standard __array_function__ interface. [1]_ Examples -------- >>> @implements_numpy_override() # np.reshape to be overridden ... def reshape(x, shape): ... # a mygrad-based implementation of numpy.reshape ... print("hello world") >>> import numpy as np >>> import mygrad as mg >>> np.reshape(mg.tensor(1.), 2) 'hello world' You can also explicit provide the numpy function explicitly >>> import numpy as np >>> @implements_numpy_override(np.reshape) # np.reshape to be overridden ... def some_function(x, shape): ... pass References ---------- .. [1] https://numpy.org/devdocs/reference/arrays.classes.html?#numpy.class.__array_function__ """ __slots__ = ("numpy_func",) def __init__(self, numpy_func: Optional[Callable] = None): # if None, `numpy_func` is inferred from the name of the decorated function self.numpy_func = numpy_func def __call__(self, wrapped_func: T) -> T: if self.numpy_func is None: try: self.numpy_func = getattr(np, wrapped_func.__name__) except AttributeError: raise AttributeError( f"@implements_numpy_override tried to register an override for the function numpy.{wrapped_func.__name__}, but no " f"such function exists." ) _REGISTERED_DIFFERENTIABLE_NUMPY_FUNCS[self.numpy_func] = wrapped_func return wrapped_func class _ConstantOnly(ValueError): pass def _as_constant_array(t: Union["Tensor", np.ndarray]) -> np.ndarray: """Passes through all non-tensor objects and constant tensors. Raises on non-constant tensors.""" if isinstance(t, Tensor): if t.constant is False: raise _ConstantOnly() return t.data return t class Tensor: """A numpy-array-like object capable of serving as a node in a computational graph that supports back-propagation of derivatives via the chain rule. See the Examples section of the docstring for more details. Like the numpy array, mygrad's tensor stores data as an N-dimensional array and provides an interface accessing, setting, and performing vectorized operations along the various dimensions of this array. Vectorized operations support numpy-style broadcasting semantics. The contents of a tensor can be accessed and written to using all variety of basic and advanced indexing (along with mixtures of the two). Creating a Tensor ----------------- ``mygrad.Tensor`` can be passed any "array-like" object of numerical data. This includes numbers, sequences (e.g. lists), nested sequences, numpy-ndarrays, and other mygrad-tensors. mygrad also provides familiar numpy-style tensor-creation functions (e.g. 
``mygrad.arange``, ``mygrad.linspace``, etc.) >>> import mygrad as mg >>> mg.tensor(2.3) # creating a 0-dimensional tensor Tensor(2.3) >>> mg.tensor(np.array([1.2, 3.0])) # casting a numpy-array to a tensor Tensor([1.2, 3.0]) >>> mg.tensor([[1, 2], [3, 4]]) # creating a 2-dimensional tensor Tensor([[1, 2], [3, 4]]) >>> mg.arange(4) # using numpy-style tensor creation functions Tensor([0, 1, 2, 3]) Creating a non-constant tensor will copy array data: >>> import numpy as np >>> arr = np.arange(10.) >>> t_var = tensor(arr, constant=False) >>> np.shares_memory(arr, t_var) False Creating constant tensor will not make a copy of the array data: >>> t_const = mg.tensor(arr, constant=True) >>> np.shares_memory(arr, t_const) True Forward and Back-Propagation ---------------------------- Let's construct a computational graph consisting of two zero-dimensional tensors, ``x`` and ``y``, which are used to compute an output tensor, ````. This is a "forward pass imperative" style for creating a computational graph - the graph is constructed as we carry out the forward-pass computation. >>> x = mg.tensor(3.0) >>> y = mg.tensor(2.0) >>> ℒ = 2 * x + y ** 2 Invoking ``ℒ.backward()`` signals the computational graph to compute the total-derivative of ``f`` with respect to each one of its dependent variables. I.e. ``x.grad`` will store ``dℒ/dx`` and ``y.grad`` will store ``dℒ/dy``. Thus we have back-propagated a gradient from ``f`` through our graph. Each tensor of derivatives is computed elementwise. That is, if `x = Tensor(x0, x1, x2)`, then dℒ/dx represents `[dℒ/d(x0), dℒ/d(x1), dℒ/d(x2)]` >>> ℒ.backward() # computes df/dx and df/dy >>> x.grad # df/dx array(6.0) >>> y.grad # df/dy array(4.0) >>> ℒ.grad array(1.0) # dℒ/dℒ Once the gradients are computed, the computational graph containing ``x``, ``y``, and ``ℒ`` is cleared automatically. Additionally, involving any of these tensors in a new computational graph will automatically null their gradients. >>> 2 * x >>> x.grad is None True Or, you can use the ``tensor.null_grad()`` method to manually clear a tensor's gradient >>> y.null_grad() Tensor(2.) >>> y.grad is None True Accessing the Underlying NumPy Array ------------------------------------ ``mygrad.Tensor`` is a thin wrapper on ``numpy.ndarray``. A tensor's underlying numpy-array can be accessed via ``.data``: >>> x = mg.tensor([1, 2]) >>> x.data array([1, 2]) **Do not modify this underlying array**. Any in-place modifications made to this array will not be tracked by any computational graph involving that tensor, thus back-propagation through that tensor will likely be incorrect. Producing a "View" of a Tensor ------------------------------ MyGrad's tensors exhibit the same view semantics and memory-sharing relationships as NumPy arrays. I.e. any (non-scalar) tensor produced via basic indexing will share memory with its parent. >>> x = mg.tensor([1., 2., 3., 4.]) >>> y = x[:2] # the view: Tensor([1., 2.]) >>> y.base is x True >>> np.shares_memory(x, y) True Mutating shared data will propagate through views: >>> y *= -1 >>> x Tensor([-1., -2., 3., 4.]) >>> y Tensor([-1., -2.]) And this view relationship will also manifest between the tensors' gradients >>> (x ** 2).backward() >>> x.grad array([-2., -4., 6., 8.]) >>> y.grad array([-2., -4.]) In-Place Operations are not Efficient ===================================== It is important to note that while MyGrad's view semantics promote a rich parity with NumPy, that certain aspects should be avoided in the interest of optimized performance. 
Namely, performing in-place operations on tensors is generally not more efficient than their non-mutating counterparts. This is because MyGrad has to track the state of tensors that are involved in a computational graph. Thus a mutated tensor must have its pre-augmented state stored for future reference; this defeats the performance benefit of writing to an array's memory in-place. This is especially inefficient if you are mutating a tensor involved with multiple views of the same memory( By contrast, producing a view of a tensor _is_ efficient as one would expect). Thus these NumPy-like in-place semantics are supported by MyGrad not for the same performance purposes, but instead to support convenient and familiar code-patterns and to enable one to port NumPy code to MyGrad (or, in the future, inject MyGrad tensors into NumPy!!) and get the exact same behavior. A final note: MyGrad's in-place operations, when run under :func:`~mygrad.no_autodiff` mode, do not incur the extra costs noted above, and thus your code will benefit from the performance benefits of in-place operations. """ __array_priority__ = 15.0 def __array_ufunc__( self, ufunc: Type[np.ufunc], method: str, *inputs: ArrayLike, **kwargs ) -> Union["Tensor", np.ndarray]: """An interface provided by NumPy to override the behavior of its ufuncs [1]_. MyGrad implements its own ufuncs for all differentiable NumPy ufuncs. Non-differentiable numpy ufuncs simply get called on the underlying arrays of tensors and will return ndarrays. The differentiability - or lack thereof - of ufuncs may not be obvious to end users. Thus potentially ambiguous ufuncs (e.g. `numpy.ceil`) will be made to raise on non-constant tensors so that the lack of differentiability is made obvious to the users. This design decision is made in the same spirit as requiring integer-dtype tensors be constant. References ---------- .. [1] https://numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__ Examples -------- NumPy ufuncs that represent differentiable operations are overloaded by MyGrad tensors so that they support backprop >>> import mygrad as mg >>> import numpy as np >>> x = mg.tensor([1., 2.]) This calls ``mygrad.sin`` under the hood. >>> np.sin(x) # returns a tensor Tensor([0.84147098, 0.90929743]) >>> np.sin(x).backward() >>> x.grad # stores d(sin(x))/dx @ x = [1., 2.] array([ 0.54030231, -0.41614684]) Specifying a dtype, a ``where`` mask, an in-place target (via ``out``) as an array or a tensor, are all supported. >>> x = mg.tensor([1., 2.]) >>> y = mg.tensor([-1., -1.]) >>> np.exp(x, where=[False, True], out=y) Tensor([-1. , 7.3890561]) >>> y.backward() >>> x.grad array([0. , 7.3890561]) Non-differentiable NumPy ufuncs simply operate on the ndarrays that are wrapped by MyGrad tensors; these return ndarrays, which will appropriately and explicitly serve as constants elsewhere in a computational graph. 
>>> x = mg.tensor([1., 2.]) >>> np.less_equal(x, 1) array([ True, False]) """ out = kwargs.pop("out", (None,)) if len(out) > 1: # pragma: no cover raise ValueError( "mygrad does not support in-place operations with more that one target" ) (out,) = out out: Optional[Union[np.ndarray, "Tensor"]] try: # differentiable ufunc implemented by mygrad return getattr(_REGISTERED_UFUNC[ufunc], method)(*inputs, **kwargs, out=out) except KeyError: pass # non-differentiable ufuncs get called on numpy arrays stored by tensors if ufunc in _REGISTERED_BOOL_ONLY_UFUNC: caster = asarray elif ufunc in _REGISTERED_CONST_ONLY_UFUNC: # the presence of non-constant tensors will raise caster = _as_constant_array else: # pragma: no cover return NotImplemented try: if out is not None: kwargs["out"] = caster(out) # returns ndarray return getattr(ufunc, method)(*(caster(t) for t in inputs), **kwargs) except _ConstantOnly: raise ValueError( f"{repr(ufunc)} cannot involve non-constant mygrad tensors." ) def __array_function__( self, func: Callable[..., np.ndarray], types, args, kwargs ) -> Union["Tensor", np.ndarray]: if func in _REGISTERED_DIFFERENTIABLE_NUMPY_FUNCS: return _REGISTERED_DIFFERENTIABLE_NUMPY_FUNCS[func](*args, **kwargs) elif func in _REGISTERED_NO_DIFF_NUMPY_FUNCS: return func( *(t.data if isinstance(t, Tensor) else t for t in args), **{ k: (v.data if isinstance(v, Tensor) else v) for k, v in kwargs.items() }, ) else: # pragma: no cover return NotImplemented def __array__(self, dtype: DTypeLike = None) -> np.ndarray: return np.array(self.data, dtype=dtype, copy=False) def __init__( self, x: ArrayLike, *, dtype: DTypeLikeReals = None, constant: Optional[bool] = None, copy: bool = True, ndmin: int = 0, _creator: Optional[Operation] = None, _base: Optional["Tensor"] = None, ): """ Parameters ---------- x : ArrayLike Input data, in any form that can be converted to an array. This includes numbers, sequences, nested sequences, numpy-ndarrays, and mygrad-tensors. dtype : DTypeLikeReals `int`, `float`, or a real-valued numpy data type. By default the data type is inferred from ``x`` via ``numpy.asarray(x)``. constant : Optional[bool] If ``True``, this tensor is treated as a constant, and thus does not facilitate back propagation (i.e. `self.grad` will always return ``None``). Defaults to ``False`` for float-type data. Defaults to ``True`` for integer-type data. Integer-type tensors must be constant. copy : Optional[bool] Determines if the incoming array-data will be copied. ndmin : int, optional Specifies the minimum number of dimensions that the resulting array should have. Ones will be prepended to the shape as needed to meet this requirement. Notes ----- The following are parameters reserved only for internal use: _creator : Optional[mygrad.Operation] The operation-instance whose forward pass produced `self`. Should not be set manually by users. _base : Optional[Tensor] Points to the tensor that ``self`` shares memory with. """ if constant is not None and not isinstance(constant, bool): raise TypeError(f"`constant` must be a boolean value, got: {constant}") self._creator: Optional[Operation] = _creator self.data = np.array(x, dtype=dtype, copy=copy, ndmin=ndmin) # type: np.ndarray dtype = self.data.dtype.type is_float = issubclass(dtype, np.floating) # faster than `numpy.issubdtype` if not is_float and _track.TRACK_GRAPH: # No need to constrain dtypes if we aren't tracking the graph. 
# Also, it is nice to enable complex arithmetic through mygrad # functions that are wrapped in no_autodiff if not issubclass(dtype, CONSTANT_ONLY_DTYPES): raise TypeError( f"Tensor data must be of an floating type, integer type, or boolean type, " f"received {dtype}" ) elif constant is False: raise ValueError("Integer-valued tensors must be treated as constants.") if constant is None: # non-float: default constant -> True # float: default constant -> False constant = not is_float self._constant = constant self._grad = None # type: Union[None, np.ndarray] # track all operations that this tensor participates in self._ops: Set[WeakRef[Operation]] = set() # base points to the initial tensor that owns the memory of this # tensor self._base = _base # type: Optional[Tensor] # stores all of the tensors that are a view of this tensor self._view_children = WeakRefIterable() # type: WeakRefIterable[Tensor] # Used to reflect the view of the gradient associated with that of `self.base`. # This is a means of distinguishing between the gradient set on `self` as # part of backpropagation and the view of the gradient of its base. self._view_grad: Optional[np.ndarray] = None @property def grad(self) -> Optional[np.ndarray]: """ Returns the derivative of ``ℒ`` with respect to this tensor. ``ℒ`` is the terminal node in the compuational graph from which ``ℒ.backward()`` was invoked. If this tensor is a view of another tensor then their gradients will exhibit the same memory-sharing relationship as their data. Returns ------- dℒ/dx: numpy.ndarray The gradient of the terminal node in a computational graph with respect to this tensor. The shape of this numpy array matches ``self.shape`` Examples -------- >>> import mygrad as mg >>> x = mg.Tensor([1.0, 2.0]) Prior to backpropagation tensors have ``None`` set for their gradients. >>> x.grad is None True Now we trigger backpropagation... >>> ℒ = x ** 2 >>> ℒ.backward() and we see that ``x.grad`` stores dℒ/dx >>> x.grad # dℒ/dx array([2., 4.]) Now we will demonstrate the relationship between gradient a view tensor and that of its base. >>> base = mg.Tensor([1.0, 2.0, 3.0]) >>> view = base[:2]; view Tensor([1., 2.]) >>> ℒ = base ** 2 >>> ℒ.backward() Although ``view`` is not directly involved in the computation in ``ℒ``, and thus would not typically store a gradient in due to ``ℒ.backward()``, it shares memory with ``base`` and thus it stores a gradient in correspondence to this "view relationship". I.e. because ``view == base[:2]``, then we expect to find that ``view.grad == base.grad[:2]``. >>> base.grad array([2., 4., 6.]) >>> view.grad array([2., 4.]) >>> view.grad.base is base.grad True The reasoning here is that, because a base tensor and its view share the same array data, then varying an element in that data implies that both the base tensor and the view will change (assuming the variation occurs specifically in a shared region). It follows that the base tensor's gradient must share the same relationship with the view-tensor since these are measures of "cause and effects" associated with varying elements of data (albeit infinitesmaly). """ if self._base is None: return self._grad if self._view_grad is not None and self._view_grad.base is self._base._grad: # view grad has been computed already return self._view_grad if self._base._grad is None or self._creator is None: # ``self`` had its graph, connecting it to its base, cleared. # ``self._view_grad`` can't be computed without this info. 
return None (view_parent,) = self._creator.variables # recursively fetches grad from parent grad = view_parent.grad with _track.no_autodiff: self._view_grad = self._replay_op(grad).data if grad is not None else None return self._view_grad
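
    # A distilled sketch of the view-gradient recursion implemented in the ``grad``
    # property above. The helper name below is hypothetical and the sketch omits the
    # caching that the real property performs; it only illustrates the idea that a
    # view's gradient is obtained by replaying the view-producing op on its parent's
    # gradient, so that ``view.grad`` is itself a view of ``base.grad``.
    @staticmethod
    def _sketch_view_grad(t: "Tensor") -> Optional[np.ndarray]:
        if t.base is None or t.creator is None:
            # a base tensor (or a disconnected view) simply reports its own grad
            return t._grad
        (view_parent,) = t.creator.variables
        parent_grad = Tensor._sketch_view_grad(view_parent)  # recurse toward the base
        if parent_grad is None:
            return None
        with _track.no_autodiff:
            # replay the view op (e.g. the original slice) on the parent's gradient
            return t._replay_op(parent_grad).data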
[docs] def astype( self, dtype: DTypeLikeReals, casting="unsafe", copy: bool = True, *, constant: Optional[bool] = None, ) -> "Tensor": """Copy of the tensor with the specified dtype. The resulting tensor is not involved in any computational graph and has no gradient associated with it. This docstring was adapted from that of ``ndarray.astype``. Parameters ---------- dtype : Union[type, str] The real-valued numeric data type. This can be a numpy dtype or a corresponding string identifier. casting : Literal['no', 'equiv', 'safe', 'same_kind', 'unsafe'] Controls what kind of data casting may occur. Defaults to ‘unsafe’ for backwards compatibility. - ‘no’ means the data types should not be cast at all. - ‘equiv’ means only byte-order changes are allowed. - ‘safe’ means only casts which can preserve values are allowed. - ‘same_kind’ means only safe casts or casts within a kind, like float64 to float32, are allowed. - ‘unsafe’ means any data conversions may be done. copy : bool, optional (default=True) By default, astype always returns a newly allocated array. If this is set to false, and the ``dtype`` and ``constant`` requirements are satisfied, the input tensor is returned instead of a copy. constant : Optional[bool] If specified, determines if the returned tensor is a constant. Otherwise this argument is inferred from the original tensor. Returns ------- Tensor The resulting tensor with the specified data type. References ---------- [1].. Retrieved from: https://numpy.org/doc/stable/reference/generated/numpy.ndarray.astype.html Examples -------- >>> import mygrad as mg >>> import numpy as np >>> x = mg.arange(10); x Tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) Using a string to specify the data type: >>> x.astype("float32") Tensor([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.], dtype=float32) Specifying a numpy data type object, and specifying that the tensor is to be treated as a constant: >>> x.astype(np.int8, constant=True) Tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int8) """ cast_data = self.data.astype(dtype=dtype, casting=casting, copy=copy) if cast_data is self.data and (constant is None or self.constant is constant): return self return type(self)(cast_data, copy=False, constant=constant)
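
    # As the ``astype`` docstring above notes, the result is always detached from the
    # computational graph. A quick hedged sketch (hypothetical helper):
    @staticmethod
    def _sketch_astype_detaches():
        x = Tensor([1.0, 2.0])
        y = (2 * x).astype(np.float32)
        assert y.creator is None  # no graph connection is retained
        assert y.grad is None     # so no gradient can be backpropagated to ``y``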
@classmethod def _op( cls, Op: Type[Operation], *input_vars: ArrayLike, op_args: Optional[Sequence] = None, op_kwargs: Optional[Dict[str, Any]] = None, constant: Optional[bool] = None, out: Optional[Union[np.ndarray, "Tensor"]] = None, ): """Wraps operations performed between tensors: f(a, b, ...). For developer use only. Parameters ---------- Op : Type[Operation] Operation-class, used to perform forward-pass on `input_vars`. input_vars : Tuple[array_like, ...] An arbitrary number of input-tensors. These can take any form that can be converted to an array. This includes numbers, sequences, nested numerical sequences, numpy-ndarrays, and mygrad-tensors. op_args : Optional[Tuple[Any, ...]] Arbitrary positional arguments passed to the operation's forward pass. op_kwargs : Optional[Dict[str, Any]] Arbitrary keyword arguments passed to the operation's forward pass. constant : bool, optional (default=False) If True, the resulting Tensor is a constant. out: Optional[Union[np.ndarray, "Tensor"]] The target where the output (an ndarray) of the operation will be written. Thus this raises if `out` is read-only. There is an exception to this if a tensor is provided, in which case the operation does not write to its underlying memory but rather triggers "in-place semantics" so that the computational graph behaves as if the tensor was mutated. See ``Tensor._in_place_op`` for more details. Returns ------- mygrad.Tensor The tensor-result of the operation's forward-pass.""" if out is not None: if isinstance(out, tuple): if len(out) > 1: # pragma: no cover raise ValueError( "mygrad does not support in-place operations with more that one target" ) (out,) = out if isinstance(out, Tensor): out._in_place_op( Op, *input_vars, op_args=op_args, op_kwargs=op_kwargs, constant=constant, ) return out _uniques_bases_then_arrs = () tensor_vars = tuple( cls(var, constant=True, copy=False) if not isinstance(var, Tensor) else var for var in input_vars ) # cast all input-vars to tensors if _track.TRACK_GRAPH and _mem.MEM_GUARD: # lock memory of array data _uniques_bases_then_arrs = WeakRefIterable( _mem.lock_arr_writeability(x) for x in _mem.unique_arrs_and_bases(tensor_vars) ) if op_args is None: op_args = tuple() if op_kwargs is None: op_kwargs = {} f = Op() try: if out is None: op_out: np.ndarray = f(*tensor_vars, *op_args, **op_kwargs) else: op_out: np.ndarray = f(*tensor_vars, *op_args, **op_kwargs, out=out) except Exception as e: if _track.TRACK_GRAPH and _mem.MEM_GUARD: _mem.release_writeability_lock_on_op(_uniques_bases_then_arrs) raise e if not _track.TRACK_GRAPH: # execute operation without tracking creator or any graph # information return cls( op_out, constant=constant, # constant not determined by graph info copy=False, _creator=None, _base=None, ) # points to parent tensor that op-output is a view of base = None # type: Optional[Tensor] # If output of op is a view - tracks the tensor var that is # the parent of the view parent_var: Optional[Tensor] = None # Determine whether or not op was a view; if so, `base` # points to parent Tensor op_out_base = op_out.base if f.can_return_view and op_out_base is not None: vars_can_share_mem = ( isinstance(var, (np.ndarray, Tensor)) for var in input_vars ) for can_share_mem, parent_var in zip(vars_can_share_mem, tensor_vars): if not can_share_mem: continue parent_data = parent_var.data parent_data_base = parent_data.base if ( (op_out_base is parent_data) or (op_out_base is parent_data_base) or (op_out is parent_data) ): if parent_var._base is not None and 
parent_var._creator is None: parent_var._base = None base = parent_var if parent_var.base is None else parent_var.base break else: parent_var = None for v in input_vars: if isinstance(v, Tensor): # tensor's graph has been cleared, but its base lingers if v._base is not None and v._creator is None: v._base = None if base is None: # non-view ops clear grads v._grad = None v._view_grad = None if base is not None: # we need to be able to replay view-ops for doing in-place operations # on graphs with views f.replay_args = op_args f.replay_kwargs = op_kwargs f.replay_force_constant = constant # record graph information if constant is None: if any(not var.constant for var in tensor_vars): constant = None else: constant = True # record that a variable participated in that op ref_f = ReferenceType(f) # type: WeakRef[Operation] for var in tensor_vars: var._ops.add(ref_f) tensor_out = cls( op_out, constant=constant, copy=False, _creator=f, _base=base, ) if parent_var is not None: parent_var._view_children.append(tensor_out) if _mem.MEM_GUARD: if out is not None and tensor_out.data.base is not None: _mem.lock_arr_writeability(tensor_out.data.base) _uniques_bases_then_arrs.append(tensor_out.data.base) _mem.lock_arr_writeability(tensor_out.data) tensor_refs = _uniques_bases_then_arrs tensor_refs.append(tensor_out.data) finalize(f, _mem.release_writeability_lock_on_op, tensor_refs) return tensor_out def _replay_op(self, *input_vars: ArrayLike) -> "Tensor": """*dev use only* Replays the op that produced `self` - called on the specified input vars""" if self.creator is None: raise DisconnectedView( "``Tensor._replay_op(...)`` was called on a tensor without a creator." "\nPlease report this error at: https://github.com/rsokl/MyGrad/issues" ) return self._op( type(self.creator), *input_vars, op_args=self.creator.replay_args, op_kwargs=self.creator.replay_kwargs, constant=self.creator.replay_force_constant, )
[docs] def backward(self, grad: Optional[ArrayLike] = None): """Trigger backpropagation and compute the derivatives of this tensor. Designating this tensor as the tensor ℒ, compute dℒ/dx for all (non-constant) tensors that preceded ℒ in its computational graph, and store each of these derivatives in ``x.grad`` respectively. Once back-propagation is finished, the present tensor is removed from all computational graphs, and the preceding graph is cleared. If ℒ is a non-scalar tensor (i.e. ``ℒ.ndim`` is greater than 0), then calling ``ℒ.backward()`` will behave as if ℒ was first reduced to a scalar via summation. I.e. it will behave identically to ``ℒ.sum().backward()``; this ensures that each element of any dℒ/dx will represent a derivative of a scalar function. Parameters ---------- grad : Optional[array_like], (must be broadcast-compatible with ``self`` By default, the present tensor is treated as the terminus of the computational graph (ℒ). Otherwise, one can specify a "downstream" derivative, representing ``dℒ/d(self)``. This can be used to effectively connect otherwise separate computational graphs. Examples -------- >>> import mygrad as mg >>> x = mg.tensor(2) >>> y = mg.tensor(3) >>> w = x * y >>> ℒ = 2 * w >>> ℒ.backward() # computes dℒ/dℒ, dℒ/dw, dℒ/dy, and dℒ/dx >>> ℒ.grad # dℒ/df == 1 by identity array(1.) >>> w.grad # dℒ/dw array(2.) >>> y.grad # dℒ/dy = dℒ/dw * dw/dy array(4.) >>> x.grad # dℒ/dx = dℒ/dw * dw/dx array(6.) Calling ``ℒ.backward()`` from a non-scalar tensor is equivalent to first summing that tensor. >>> tensor = mg.tensor([2.0, 4.0, 8.0]) >>> ℒ = tensor * tensor[::-1] # [x0*x2, x1*x1, x2*x0] >>> ℒ.backward() # behaves like ℒ = x0*x2 + x1*x1 + x2*x0 >>> tensor.grad array([16., 8., 4.]) >>> tensor = mg.Tensor([2.0, 4.0, 8.0]) >>> ℒ = tensor * tensor[::-1] >>> ℒ.sum().backward() >>> tensor.grad array([16., 8., 4.]) Specifying a value for ``grad`` >>> x = mg.Tensor(1.) >>> x.backward(2.) >>> x.grad # Would normally be dℒ/dℒ == 1 array(2.) """ if not _track.TRACK_GRAPH: return if self.constant: self.clear_graph() return topo_sorted_tensors: Deque["Tensor"] = deque([]) seen: Set[int] = set() collect_all_tensors_and_clear_grads(self, seen, topo_sorted_tensors) # don't set self._grad yet because there is a grad-clearing step that # occurs during graph creation if grad is not None: # `self` is guaranteed to be a tensor of floats # so we can simply cast `grad` to be the same dtype _grad = asarray(grad, dtype=self.dtype) if _grad.shape != self.shape: try: # See if grad can broadcast to `self` # raises ValueError if not _grad = np.multiply( np.full_like(self.data, fill_value=1.0), _grad, dtype=self.dtype, ) if _grad.shape != self.shape: # mutual broadcasting occurred raise ValueError() except ValueError: raise ValueError( f"`tensor.backward(grad)` was passed a gradient with an incompatible shape.\n" f"`grad` must be broadcast-compatible with `tensor.shape={self.shape}`\n" f"Got `grad.shape={_grad.shape}`" ) else: _grad = np.full_like(self.data, fill_value=1.0) self._grad = _grad if self.creator is not None: for t in topo_sorted_tensors: t._backward() self.clear_graph()
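
    # The ``grad`` argument documented above can be used to stitch two separately
    # built graphs together. A hedged behavioral sketch follows (the helper name is
    # hypothetical; the numbers follow from the chain rule).
    @staticmethod
    def _sketch_chained_backprop():
        x = Tensor(3.0)
        y = 2 * x                    # graph-1: y = 2x
        y_detached = Tensor(y.data)  # deliberately severed from graph-1
        loss = y_detached ** 2       # graph-2: ℒ = y ** 2
        loss.backward()              # dℒ/dy = 2y = 12
        y.backward(y_detached.grad)  # feed dℒ/dy into graph-1 via ``backward(grad)``
        assert x.grad == 24.0        # dℒ/dx = dℒ/dy * dy/dx = 12 * 2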
def _backward(self): """ **For dev-use only** If `self` has accumulated incoming gradients from all operations in the terminal node's computational graph, back-propagate the accumulated gradient to the creator of `self`. Parameters ---------- graph : Set[Operation] The set of all operations relevant to the terminal node of the computational graph, which triggered back-propagation Raises ------ AssertionError Raises if the tensor and its associated gradient possess different shapes. Raises if `_backward` triggered on a tensor with gradient of `None`. """ assert self._grad is not None, ( f"backprop, post grad-accumulation, was triggered " f"on a tensor with no gradient" f"\n{self}" f"\nid {id(self._ops)}" f"\ngrad: {self.grad}" f"\ncreator: {self.creator}" f"\nops: {self._ops}" f"\nbase: {self.base}" ) assert self._grad.shape == self.shape, ( f"A tensor and its associated gradient must possess the same shape. Got:" f"\ntensor-shape: {self.shape}" f"\ngrad-shape: {self._grad.shape}" ) if self._creator is not None: self._creator.backward(self._grad) return
[docs] def null_grad(self, *, _clear_view_info: bool = False) -> "Tensor": """Sets this tensor's gradient to be ``None``. This operation is performed in-place, but a reference to the tensor is returned in order to permit mapping semantics. Also removes any ``base`` reference from disconnected views. Returns ------- self Examples -------- >>> import mygrad as mg >>> x = mg.Tensor(2.) >>> (x ** 2).backward() >>> x.grad array(4.) >>> x.null_grad() # returns a reference of `x` Tensor(2.0) >>> x.grad is None True""" self._view_grad = None self._grad = None if _clear_view_info: if self._base is not None and self._creator is None: self._base = None return self
[docs] def null_gradients(self, clear_graph: bool = True): """ **Deprecated: Tensors will automatically have their computational graphs cleared during backprop. Simply involving a tensor in a new computational graph will null its gradient.** Sets the gradient for this tensor and for all preceding tensors in the computation graph to ``None``. Additionally, the computational graph that terminates in this tensor can also be cleared during this process. Parameters ---------- clear_graph : bool, optional (default=True) If ``True`` clear the computational graph in addition to nulling the gradients. Notes ----- It is advised to clear the computational graph when nulling gradients, i.e. invoke ``null_gradients(clear_graph=True)`` (or simply ``null_gradients()``). This de-references all intermediate operations and tensors in the computational graph and thus permits garbage collection - freeing the memory that was used by the computational graph. Examples -------- >>> import mygrad as mg >>> x = mg.tensor(2) >>> y = mg.tensor(3) >>> w = x * y >>> f = 2 * w >>> f.backward() # computes df/df, df/dw, df/dy, and df/dx >>> any(tensor.grad is None for tensor in (f, w , x, y)) False >>> f.null_gradients() # set tensor.grad to None for all tensors in the graph >>> all(tensor.grad is None for tensor in (f, w , x, y)) True """ import warnings warnings.warn( "`tensor.null_gradients()` is deprecated. Calling it will raise an error " "in future versions of MyGrad. A tensor will automatically " "have its gradient nulled if you use it in a new computational graph. " "Or, you can call `tensor.null_grad()` to null that individual tensor's " "gradient.", FutureWarning, )
[docs] def clear_graph(self): """ Removes the current tensor – and tensors above it – from their shared computational graph. This de-references all operations involved in the graph and the intermediate tensors that were created by it. Arrays whose memory were locked by the computational graph will have their writeability restored. Examples -------- >>> import mygrad as mg >>> import numpy as np >>> x = np.array([1., 2.]) >>> y = mg.multiply(2., x) >>> x.flags.writeable, y.creator (False, <mygrad.math.arithmetic.ops.Multiply at 0x224f89cac48>) >>> y.clear_graph() >>> x.flags.writeable, y.creator (True, None) """ if self._base is not None: # "pull" on grad to force views to update their # gradients from upstream before the graph info # gets cleared _ = self.grad self._view_children.clear() self._ops.clear() if self._creator is None: return creator = self._creator self._creator = None # marks tensor as "visited" during graph-traversal for var in creator.variables: # type: "Tensor" var.clear_graph()
@property def constant(self) -> bool: """If ``True``, this tensor is a constant; it will not propagate any gradient. Additionally, any tensor that is a descendant of constant tensors will also be a constant. Integer-valued tesnors, Python scalars and NumPy arrays are treated as constant tensors when included in MyGrad computational graphs. Returns ------- bool Examples -------- Constant-tensors do not back-propagate gradients: >>> import mygrad as mg >>> x = mg.Tensor([1., 2.], constant=True) >>> y = mg.Tensor([0., 3.], constant=False) >>> f = x * y >>> f.backward() >>> x.grad is None # x has no gradient True >>> y.grad array([1., 2.]) A tensor that is derived solely from constant tensors is also a constant: >>> import numpy as np >>> x = mg.Tensor([1., 2.], constant=True) >>> y = mg.Tensor([0., 3.], constant=True) >>> z = (x + y) ** 2 - np.array([8., 7.]) >>> z.constant True Integer-valued tensors are treated as constants >>> mg.Tensor([1, 2]).constant True """ return self._constant @property def creator(self) -> Optional[Operation]: """The ``Operation`` instance that produced ``self``. Returns ------- creator : Optional[Operation] The operation-instance that created the tensor, or `None`. Examples -------- >>> import mygrad as mg >>> x = mg.Tensor(3) >>> x.creator is None True >>> y = mg.Tensor(2) >>> z = x * y # Multiply(x, y) -> z >>> z.creator <mygrad.math.arithmetic.ops.Multiply at 0x2df5a130438> """ return self._creator def __len__(self) -> int: return len(self.data) def __contains__(self, item) -> bool: return self.data.__contains__(item) def __getitem__(self, item: Index) -> "Tensor": return self._op(GetItem, self, op_args=(item,)) def __iter__(self) -> Iterator["Tensor"]: # In the same way that numpy doesn't let you iterate over 0-dimensional # arrays, don't allow iteration over 0-dimensional arrays. if self.ndim == 0: raise TypeError("iteration over a 0-d tensor") return iter(self[n] for n in range(len(self))) def _in_place_op( self, inplace_op: Type[Operation], *input_vars: ArrayLike, op_args: Optional[Sequence] = None, op_kwargs: Optional[Dict] = None, constant: Optional[bool] = None, ): if _track.TRACK_GRAPH is False: return self._op( inplace_op, *input_vars, op_args=op_args, op_kwargs=op_kwargs, constant=constant, out=self.data, ) # # ********************************************************************************** # The way that in-place updates work in MyGrad is that any tensor that # is about to undergo a mutation gets "cloned". Each resulting "placeholder" # is used to represent that tensor in any non-view operations that the tensor # was participating in. This ensures that the stateful computational graph # is not corrupted by this mutation. # # Once the placeholders have been created, they have permanently replaced the # rolls of their counterparts within the computational graph. Furthermore, they # exist only internally to the computational graph and thus cannot be the # targets of subsequent views or in-place updates. # # At this point, the "original" tensors merely reserve the publicly-available # Tensor-instances (husks) that the users will access. We eventually need to # populate these husks with the appropriate augmented contents and graph-history. # # Thus this method will compute the in-place operation on a new tensor, and # will create a new, internal computational graph involving the base tensor # affected by the mutation and any of its view-children. These tensors represent # the mutated tensors that the users expect to have access to. 
# # We must connect this new computational graph to the preceding one – the one # involving the placeholders; this way we can backpropagate appropriately and # through all influencers. # # Finally we mirror each of these new tensors into the husks of the publicly # -available tensors and reroute the computational graph through them so that # the user sees that all of the relevant tensors have been augmented, and that # they are connected to the appropriate "history" such that backprop occurs # without error or inaccuracy. # # # For illustration, consider the following graph: # # ... x------[square]-- y = x**2 # \ # ---[slice]-- z = view-x # \ # ---[mul]-- w = 3 * z # # Now suppose that we mutate `x` with `x[:] = 0`. This is a simpler case than # mutating a view of `x`, since `x` is already the base tensor. # - This should not affect `y` # - It should affect `view_x` # - It should *not* affect `w`, which depends on `view_x` in a "static" way. # I.e. the value for `w` is already resolved and is not a view of z or x. # # # As prescribed above, we will make the placeholders: px and pz, and we # will reroute the operations that statically depend on the old values of x and z # through these placeholders. # # Next we will have `x` point to a mutated version of itself, in accord with the # in-place update being performed, and we will subsequently recreate any # views of x (i.e. z), based off of this mutated tensor. # # The resulting graph is: # # ---[slice]-- z = view-x # / # -----[set-item] -- x = px.copy()[:]=0 # / # ... px------[square]-- y = px**2 # \ # ---[slice]-- pz = view-px # \ # ---[mul]-- w = 3 * pz # # Note that px and pz are strictly *internal* tensors; they cannot be accessed for # use in any further operations, whereas `x` and `z` are available for further use. # # ********************************************************************************** # # Replace base and all of its views with "placeholder" tensors; # they serve as internal references to all tensors pre-mutation # and will preserve ops relying on the un-mutated tensors. # # These placeholder tensors are never publicly-available and thus cannot # be involved directly in future in-place updates # In Tensor._op, any tensor entering an op has its grad/view-info cleared # We must do this here up front since we need to consume information # about ``self`` self.null_grad(_clear_view_info=True) if self._base is not None and not self._base._view_children: self._base = None graph = _dup.DuplicatingGraph(self if self.base is None else self.base) # Create copy of base so that mutation has no impact on the # state of any ops depending on it or its views mutant_base = graph.base.tensor.copy() mutant_base.data.flags.writeable = ( graph.base.tensor.data.flags.writeable or _mem.array_is_tracked(graph.base.tensor.data) ) # Create view of base in correspondence to relationship # that `self` has to base. 
Mutating this view will mutate # base appropriately inplace_target = mutant_base # stores view-fn sequence from base -> in-place target view_fn_sequence: List[Callable[[np.ndarray], np.ndarray]] = [] with _track.no_autodiff: # get view sequence from base -> in-place target for node in graph.get_path_to_base(self)[::-1][1:]: # skip base # need to point to place-holder replay op to avoid creating # forwards references to downstream tensors f = node.placeholder._replay_op if self.base is not None: # need sequence of view-ops view_fn_sequence.append(_track.no_autodiff(f, to_numpy=True)) inplace_target = f(inplace_target) # Constant info was not propagated through no-autodiff mode. # It must be inferred from the original tensor inplace_target._constant = mutant_base.constant mutant_base_data = mutant_base.data del mutant_base try: with _mem.mem_guard_off: placeholder_mutant_view = ( self._op( # will raise if original data not writeable inplace_op, *(graph.get_placeholder_if_exists(t) for t in input_vars), op_args=op_args, op_kwargs=op_kwargs, constant=constant, out=inplace_target.data, ) ) except Exception as e: graph.restore_old_graph() raise e placeholder_mutant_view._constant = inplace_target._constant if _mem.MEM_GUARD: _mem.force_lock_tensor_and_creators(placeholder_mutant_view) if placeholder_mutant_view.creator.where is not True: # An operation like `multiply(x, y, where=mask, out=z)` occurred. # `placeholder_mutant_view` is the mutated version of `z`. # We need to connect the upstream version of `z` to the computational # graph so that `~mask * dℒ/dz` backprops to it, whereas `~mask * dℒ/dz` # will backprop to `x` and `y`. # # This is basically an alternative to treating # `multiply(x, y, where=mask, out=z)` # like a three-input operation, which adds complexity to the implementation # of every op that supports `where` and `out`. # # old-z --------------------- # | | # multiply(x, y, where=mask, out=z) | # | | # z -------------------- # | | # ApplyMask # | # z with _mem.mem_guard_off: placeholder_mutant_view = type(self)._op( _dup.ApplyMask, placeholder_mutant_view, # gets passed through unchanged # ~mask * grad backprops to upstream placeholder graph[self].placeholder, op_kwargs={ "mask": placeholder_mutant_view.creator.where, }, ) # Connect public base tensor to placeholder graph via the mutated placeholder # tensor `out`. if self.base is None: # The current graph: # base-p --> | inplace | --> vp' # Becomes: # base-p --> | inplace | --> base' # # The base tensor itself was the target of the in-place operation, # thus we need simply mirror original base against the mutant placeholder. # This effectively connects the original base to the placeholder graph mutant_base = placeholder_mutant_view else: # in-place operation occurred on a view; must connect mutated base # to graph and then reproduce downstream views # # The current graph: # vp --> | inplace | --> vp' # # Becomes: # # vp --> | inplace | --> vp' --> | | # | unview | --> base' # base-p -----------------------> | | # # I.e. the mutated base is a combination of the placeholder # base and of the mutant view. 
mutant_base = type(self)._op( _dup.UnView, graph.base.placeholder, placeholder_mutant_view, op_kwargs={ # Copy to avoid upstream placeholder mutant view sharing memory # with downstream mutant base "mutant_base_data": mutant_base_data, "view_fn_sequence": view_fn_sequence, }, ) del placeholder_mutant_view # The original base now points to the augmented array data # and has the InPlaceOp as its creator _dup.mirror_tensor(source=mutant_base, target=graph.base.tensor) del mutant_base # Now that the base-tensor has been incorporated into the graph, # recreate the view-graph and reroute all tensors from previous # graph to their downstream counterparts # # Note that iterating in a topologically-ordered way is critical # here: each parent is updated before creating one of its children # # Iteration is always based off of the placeholders' relative positions # in the graph since this will never be mutated. for node in graph: if node.parent is None: continue view = node.tensor._replay_op(node.parent) _dup.mirror_tensor(source=view, target=node.tensor) node.parent._view_children.append(node.tensor) @property def shape(self) -> Shape: """Tuple of tensor dimension-sizes. Sizes are reported in row-major order. Returns ------- Tuple[int, ...] Examples -------- >>> import mygrad as mg >>> x = mg.Tensor([1, 2, 3, 4]) # axis-0 has size 4 >>> x.shape (4,) >>> y = mg.Tensor([[1, 2, 3], # axis-0 has size 2, axis-1 has size 3 ... [4, 5, 6]]) >>> y.shape (2, 3) The shape attribute can also be set to reshape the tensor in-place >>> y.shape = (1, 6, 1) >>> y Tensor([[[1], [2], [3], [4], [5], [6]]]) See Also -------- mygrad.reshape : similar function Tensor.reshape : similar method""" return self.data.shape @shape.setter def shape(self, newshape: Union[int, Shape]): # Even though this op cannot mutate views, we still must # do graph-replaying here so that views can still reference # this tensor, but with the proper reshaping mediating them. # # E.g. 
# x = arange(10) # shape-(10,) # y = x[:6] # shape-(6,) # x.shape = (2, 5) # shape-(2, 5) # # y.base points to the shape-(2,5) array # even though y is a view of the flat array # # thus we need to play this graph as # (history) # | # placeholder shape-(10,) # |-reshape # x shape-(2,5) # |-reshape # placeholder shape-(10,) # |-getitem # y shape-(4,) if not _track.TRACK_GRAPH: self.data.shape = newshape return if newshape == self.shape: return old_shape = self.shape # raise here if the shape is not compatible self.data.shape = newshape self.data.shape = old_shape # create placeholders for self and all of its view-children graph = _dup.DuplicatingGraph(self) # need to iterate over all nodes now before we tinker # with the view children nodes = tuple(graph) # reshape placeholder of self out = graph.base.placeholder.reshape(newshape) # Store contents of `out` in `self` and replace `out` in # graph with `self` out._base = graph.base.placeholder.base _dup.mirror_tensor(source=out, target=self) _dup.reroute_ops_through(source=out, target=self) del out # although `self` is a view of placeholder, placeholder # is strictly an internal tensor, we won't expose it as # base graph.base.placeholder._view_children.append(self) base = graph.base.placeholder.base if base is not None: # if `self` was a view, we need to update that parent's # view children so that it points to the placeholder creator = graph.base.placeholder.creator.variables[0] creator._view_children = WeakRefIterable( [ w if w is not self else graph.base.placeholder for w in graph.base.placeholder._view_children ] ) # Undo the reshape, and place this as the tensor joining # the reshaped `self` with the views of unshaped `self` unshaped = self.reshape(old_shape) for node in nodes: if node.parent is None: continue # direct what would be views of `self` to be views of `unshaped`, # which translates the mutated shape of `self` to the original # shape used to create the views parent = node.parent if node.parent is not self else unshaped view = node.tensor._replay_op(parent) _dup.mirror_tensor(source=view, target=node.tensor) _dup.reroute_ops_through(source=view, target=node.tensor) parent._view_children.append(node.tensor) def __setitem__(self, key: Index, value: ArrayLike): self._in_place_op(SetItem, self, value, op_args=(key,)) def __add__(self, other: ArrayLike) -> "Tensor": return self._op(Add, self, other) def __iadd__(self, other: ArrayLike) -> "Tensor": self._in_place_op(Add, self, other) return self def __radd__(self, other: ArrayLike) -> "Tensor": return self._op(Add, other, self) def __sub__(self, other: ArrayLike) -> "Tensor": return self._op(Subtract, self, other) def __isub__(self, other: ArrayLike) -> "Tensor": self._in_place_op(Subtract, self, other) return self def __rsub__(self, other: ArrayLike) -> "Tensor": return self._op(Subtract, other, self) def __truediv__(self, other: ArrayLike) -> "Tensor": return self._op(Divide, self, other) def __rtruediv__(self, other: ArrayLike) -> "Tensor": return self._op(Divide, other, self) def __floordiv__(self, other: ArrayLike) -> np.ndarray: return np.floor_divide(self, other) def __rfloordiv__(self, other: ArrayLike) -> np.ndarray: return np.floor_divide(other, self) def __itruediv__(self, other: ArrayLike) -> "Tensor": self._in_place_op(Divide, self, other) return self def __mul__(self, other: ArrayLike) -> "Tensor": return self._op(Multiply, self, other) def __imul__(self, other: ArrayLike) -> "Tensor": self._in_place_op(Multiply, self, other) return self def __rmul__(self, other: 
ArrayLike) -> "Tensor": return self._op(Multiply, other, self) def __matmul__(self, other: ArrayLike) -> "Tensor": return self._op(MatMul, self, other) def __rmatmul__(self, other: ArrayLike) -> "Tensor": return self._op(MatMul, other, self) def __pow__(self, other: ArrayLike): if isinstance(other, Number) or ( isinstance(other, np.ndarray) and other.ndim == 0 ): if other == 1: return self._op(Positive, self) elif other == 2: return self._op(Square, self) return self._op(Power, self, other) def __ipow__(self, other: ArrayLike) -> "Tensor": if isinstance(other, Number) or ( isinstance(other, np.ndarray) and other.ndim == 0 ): if other == 1: self._in_place_op(Positive, self) return self elif other == 2: self._in_place_op(Square, self) return self self._in_place_op(Power, self, other) return self def __rpow__(self, other: ArrayLike): return self._op(Power, other, self) def __neg__(self): return self._op(Negative, self) def __pos__(self): return self._op(Positive, self) def __repr__(self) -> str: return repr(self.data).replace("array", "Tensor").replace("\n", "\n ") def __copy__(self) -> "Tensor": """Produces a copy of ``self`` with ``copy.creator=None``. Copies of the underlying numpy data array and gradient array are created. Returns ------- Tensor """ return self.copy()

[docs]    def copy(self, *, constant: Optional[bool] = None) -> "Tensor":
        """Produces a copy of ``self`` with ``copy.creator=None``.

        Copies of the underlying numpy data array and gradient array are created.

        No information regarding the tensor's participation in the computational
        graph is copied.

        Parameters
        ----------
        constant : Optional[bool]

        Returns
        -------
        Tensor

        Examples
        --------
        >>> import mygrad as mg
        >>> x = mg.Tensor(3.0)
        >>> y = x * 2
        >>> y.backward()
        >>> y_copy = y.copy()
        >>> y_copy
        Tensor(6.)
        >>> y_copy.grad
        array(1.)
        >>> y_copy.creator is None
        True
        """
        copy = Tensor(
            np.copy(self.data),
            constant=(self.constant if constant is None else constant),
        )
        copy._grad = np.copy(self._grad) if self._grad is not None else None
        return copy
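
    # --- Illustrative note (not part of mygrad's source) ----------------------
    # `copy()` duplicates the data (and any existing gradient) but intentionally
    # drops graph information, so the copy starts life detached from its creator:
    #
    #   >>> import mygrad as mg
    #   >>> x = mg.tensor(2.0)
    #   >>> y = x * 3
    #   >>> y_copy = y.copy()
    #   >>> (y.creator is None, y_copy.creator is None)
    #   (False, True)
    # ---------------------------------------------------------------------------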

[docs]    def item(self) -> Union[int, float]:
        """Copy an element of a tensor to a standard Python scalar and return it.

        Note that the returned object does not support back-propagation.

        Returns
        -------
        z : Standard Python scalar object
            A copy of the specified element of the tensor as a suitable
            Python scalar

        Examples
        --------
        >>> import mygrad as mg
        >>> x = mg.Tensor([22.2])
        >>> x.item()
        22.2
        >>> type(x.item())
        float
        """
        if self.size > 1:
            raise ValueError("can only convert a tensor of size 1 to a Python scalar")
        return self.data.item()
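
    # --- Illustrative note (not part of mygrad's source) ----------------------
    # Because `item()` copies the value out into a plain Python scalar, the
    # result is detached from the computational graph; keep working with the
    # tensor itself when gradients are still needed:
    #
    #   >>> import mygrad as mg
    #   >>> loss = mg.tensor(2.0) ** 2
    #   >>> isinstance(loss.item(), float)   # plain float, no backprop through it
    #   True
    # ---------------------------------------------------------------------------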

    def __float__(self) -> float:
        if self.size > 1:
            raise TypeError("can only convert a tensor of size 1 to a Python scalar")
        return float(self.data)

    def __int__(self) -> int:
        if self.size > 1:
            raise TypeError("can only convert a tensor of size 1 to a Python scalar")
        return int(self.data)

    def __index__(self) -> int:
        """Return self converted to an integer, if self is suitable for use as
        an index into a list."""
        return self.data.__index__()
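
    # --- Illustrative note (not part of mygrad's source) ----------------------
    # The scalar-conversion protocol above lets size-1 tensors be used wherever
    # Python expects a number:
    #
    #   >>> import mygrad as mg
    #   >>> float(mg.tensor(2.5))
    #   2.5
    #   >>> ["a", "b", "c"][mg.tensor(1)]   # __index__ permits list indexing
    #   'b'
    # ---------------------------------------------------------------------------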

[docs]    def flatten(self, *, constant: Optional[bool] = None) -> "Tensor":
        """Return a copy of the tensor collapsed into one dimension.

        This docstring was adapted from ``numpy.ndarray.flatten``.

        Parameters
        ----------
        constant : bool, optional(default=False)
            If ``True``, the returned tensor is a constant (it
            does not back-propagate a gradient)

        Returns
        -------
        mygrad.Tensor
            A copy of the input tensor, flattened to one dimension.

        Notes
        -----
        To return a flattened view of the tensor, use ``x.reshape(-1)``.

        Examples
        --------
        >>> import mygrad as mg
        >>> x = mg.Tensor([[1, 2],
        ...                [3, 4]])
        >>> x.flatten()
        Tensor([1, 2, 3, 4])
        """
        return Tensor._op(Flatten, self, constant=constant)
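
    # --- Illustrative sketch (not part of mygrad's source; assumes mygrad's ---
    # view semantics as documented for `Tensor.base` below) --------------------
    # `flatten()` always copies, whereas `reshape(-1)` produces a view that
    # shares memory with the original tensor:
    #
    #   >>> import mygrad as mg
    #   >>> x = mg.tensor([[1.0, 2.0], [3.0, 4.0]])
    #   >>> x.flatten().base is None    # owns its own memory
    #   True
    #   >>> x.reshape(-1).base is x     # view of x
    #   True
    # ---------------------------------------------------------------------------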

    @property
    def base(self) -> Optional["Tensor"]:
        """
        A reference to the base tensor that the present tensor is a view of.

        If this tensor owns its memory, then this returns ``None``.

        Examples
        --------
        The base of a tensor that owns its memory is ``None``:

        >>> import mygrad as mg
        >>> x = mg.arange(5)
        >>> x.base is None
        True

        Slicing creates a view, whose memory is shared with x:

        >>> y = x[2:]
        >>> y.base is x
        True
        >>> y.data.base is x.data
        True

        A view of a view has the same base as its "parent"

        >>> z = y[:]
        >>> z.base is x
        True

        The behavior of ``Tensor.base`` departs from that of ``ndarray.base`` in
        that mygrad will never create an "internal" tensor to serve as a base;
        e.g.

        >>> import numpy as np
        >>> np.reshape(2., (1,)).base
        array(2.)

        >>> mg.reshape(2., (1,)).base is None
        True
        """
        return self._base

    @property
    def size(self) -> int:
        """
        Number of elements in the tensor. i.e., the product of the tensor's
        dimensions.

        Returns
        -------
        int

        Examples
        --------
        >>> import mygrad as mg
        >>> x = mg.zeros((3, 5, 2))  # creates a tensor with 3x5x2 (= 30) elements
        >>> x.size
        30
        """
        return self.data.size

    @property
    def ndim(self) -> int:
        """Number of tensor dimensions. I.e. the number of indices that must be
        supplied to uniquely specify an element in the tensor.

        Returns
        -------
        int

        Examples
        --------
        >>> import mygrad as mg
        >>> x = mg.Tensor([1, 2, 3])
        >>> x.ndim
        1
        >>> x[0]  # a single index identifies an element in `x`
        Tensor(1)

        >>> y = mg.Tensor([[1, 2, 3],
        ...                [4, 5, 6]])
        >>> y.ndim
        2
        >>> y[0, 0]  # two indices are required to identify an element in `y`
        Tensor(1)
        """
        return self.data.ndim

    @property
    def dtype(self) -> np.dtype:
        """Data-type of the tensor's elements.

        Returns
        -------
        numpy dtype object

        Examples
        --------
        >>> import mygrad as mg
        >>> x = mg.Tensor([[0, 1],
        ...                [2, 3]])
        >>> x.dtype
        dtype('int32')
        >>> type(x.dtype)
        <class 'numpy.dtype'>
        """
        return self.data.dtype

    def reshape(
        self, *newshape: Union[int, Shape], constant: Optional[bool] = None
    ) -> "Tensor":
        """Returns a tensor with a new shape, without changing its data.

        This docstring was adapted from ``numpy.reshape``

        Parameters
        ----------
        *newshape : Union[int, Tuple[int, ...]]
            The new shape should be compatible with the original shape. If
            an integer, then the result will be a 1-D tensor of that length.
            One shape dimension can be -1. In this case, the value is
            inferred from the length of the tensor and remaining dimensions.

        constant : bool, optional(default=False)
            If ``True``, the returned tensor is a constant (it
            does not back-propagate a gradient)

        Returns
        -------
        mygrad.Tensor
            ``a`` with its shape changed.  A new tensor is returned.

        Notes
        -----
        ``reshape`` utilizes C-ordering, meaning that it reads & writes elements
        using C-like index ordering; the last axis index changing fastest, and,
        proceeding in reverse order, the first axis index changing slowest.

        Examples
        --------
        >>> import mygrad as mg
        >>> a = mg.Tensor([[1, 2, 3], [4, 5, 6]])
        >>> a.reshape(6)
        Tensor([1, 2, 3, 4, 5, 6])

        >>> a.reshape(3, -1)  # the unspecified value is inferred to be 2
        Tensor([[1, 2],
                [3, 4],
                [5, 6]])
        """
        if not newshape:
            raise TypeError("reshape() takes at least 1 argument (0 given)")
        if hasattr(newshape[0], "__iter__"):
            if len(newshape) > 1:
                raise TypeError("an integer is required")
            newshape = newshape[0]
        return Tensor._op(Reshape, self, op_args=(newshape,), constant=constant)

    @property
    def T(self) -> "Tensor":
        """Same as self.transpose(), except that self is returned if self.ndim < 2
        and a view of the underlying data is utilized whenever possible.

        Returns
        -------
        Tensor

        Examples
        --------
        >>> import mygrad as mg
        >>> y = mg.Tensor([[1, 2, 3],
        ...                [4, 5, 6]])
        >>> y.T
        Tensor([[1, 4],
                [2, 5],
                [3, 6]])
        """
        return self._op(Tensor_Transpose_Property, self)

    def __eq__(self, other: ArrayLike) -> np.ndarray:
        return np.ndarray.__eq__(self.data, asarray(other))

    def __ne__(self, other: ArrayLike) -> np.ndarray:
        return np.ndarray.__ne__(self.data, asarray(other))

    def __lt__(self, other: ArrayLike) -> np.ndarray:
        return np.ndarray.__lt__(self.data, asarray(other))

    def __le__(self, other: ArrayLike) -> np.ndarray:
        return np.ndarray.__le__(self.data, asarray(other))

    def __gt__(self, other: ArrayLike) -> np.ndarray:
        return np.ndarray.__gt__(self.data, asarray(other))

    def __ge__(self, other: ArrayLike) -> np.ndarray:
        return np.ndarray.__ge__(self.data, asarray(other))

    def __imatmul__(self, other):
        raise TypeError(
            "In-place matrix multiplication is not (yet) supported. "
            "Use 'a = a @ b' instead of 'a @= b'"
        )

    def sum(
        self,
        axis: Optional[Union[int, Tuple[int, ...]]] = None,
        keepdims: bool = False,
        *,
        constant: Optional[bool] = None,
    ) -> "Tensor":
        """
        Sum of tensor elements over a given axis.

        Parameters
        ----------
        axis : Optional[int, Tuple[ints, ...]]
            Axis or axes along which a sum is performed.  The default,
            axis=None, will sum all of the elements of the input tensor.  If
            axis is negative it counts from the last to the first axis.

            If axis is a tuple of ints, a sum is performed on all of the axes
            specified in the tuple instead of a single axis or all the axes as
            before.

        keepdims : bool, optional
            If this is set to True, the axes which are reduced are left
            in the result as dimensions with size one. With this option,
            the result will broadcast correctly against the input tensor.

        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        sum_along_axis : mygrad.Tensor
            A Tensor with the same shape as `self`, with the specified
            axis/axes removed. If `self` is a 0-d tensor, or if `axis` is None,
            a 0-dim Tensor is returned.

        See Also
        --------
        mygrad.Tensor.sum : Equivalent method.
        cumsum : Cumulative sum of array elements.
        mean, average

        Notes
        -----
        Arithmetic is modular when using integer types, and no error is
        raised on overflow.

        The sum of an empty tensor is the neutral element 0:

        >>> mg.sum([])
        Tensor(0.0)

        Examples
        --------
        >>> import mygrad as mg
        >>> import numpy as np
        >>> x = mg.tensor([1., 1.])
        >>> x.sum()
        Tensor(2.0)

        >>> x = mg.tensor([0.5, 0.7, 0.2, 1.5])
        >>> x.sum()
        Tensor(2.9)

        >>> x = mg.tensor([[0, 1], [0, 5]])
        >>> x.sum()
        Tensor(6)
        >>> x.sum(axis=0)
        Tensor([0, 6])
        >>> x.sum(axis=1)
        Tensor([1, 5])
        """
        return Tensor._op(
            Sum, self, op_kwargs={"axis": axis, "keepdims": keepdims}, constant=constant
        )

    def prod(
        self,
        axis: Optional[Union[int, Tuple[int, ...]]] = None,
        keepdims: bool = False,
        *,
        constant: Optional[bool] = None,
    ) -> "Tensor":
        """
        Return the product of array elements over given axes.

        Parameters
        ----------
        axis : Optional[Union[int, Tuple[int, ...]]]
            Axis or axes along which to operate. By default, flattened input is
            used.

        keepdims : bool, optional (default=False)
            If this is set to True, the axes which are reduced are left in the
            result as dimensions with size one. With this option, the result
            will broadcast correctly against the input array.

        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        product_along_axis : mygrad.Tensor
            A tensor shaped as `a` but with the specified axis removed.
        """
        return Tensor._op(
            Prod,
            self,
            op_kwargs={"axis": axis, "keepdims": keepdims},
            constant=constant,
        )

    def cumprod(
        self,
        axis: Optional[Union[int, Tuple[int, ...]]] = None,
        *,
        constant: Optional[bool] = None,
    ) -> "Tensor":
        """
        Return the cumulative product of elements along a given axis.

        This docstring was adapted from the official numpy documentation

        Parameters
        ----------
        axis : Optional[int]
            Axis along which the cumulative product is computed. By default
            the input is flattened.

        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        mygrad.Tensor

        Notes
        -----
        Arithmetic is modular when using integer types, and no error is
        raised on overflow.
        """
        return Tensor._op(CumProd, self, op_kwargs={"axis": axis}, constant=constant)

    def cumsum(
        self,
        axis: Optional[Union[int, Tuple[int, ...]]] = None,
        *,
        constant: Optional[bool] = None,
    ) -> "Tensor":
        """
        Return the cumulative sum of the elements along a given axis.

        This docstring was adapted from the official numpy documentation

        Parameters
        ----------
        axis : int, optional
            Axis along which the cumulative sum is computed. The default
            (None) is to compute the cumsum over the flattened array.

        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        mygrad.Tensor
        """
        return Tensor._op(CumSum, self, op_kwargs={"axis": axis}, constant=constant)

    def mean(
        self,
        axis: Optional[Union[int, Tuple[int, ...]]] = None,
        keepdims: bool = False,
        *,
        constant: Optional[bool] = None,
    ) -> "Tensor":
        """
        Mean of tensor elements over a given axis.

        Parameters
        ----------
        axis : Optional[int, Tuple[ints, ...]]
            Axis or axes along which a mean is performed.  The default,
            axis=None, will mean all of the elements of the input tensor.  If
            axis is negative it counts from the last to the first axis.

            If axis is a tuple of ints, a mean is performed on all of the axes
            specified in the tuple instead of a single axis or all the axes as
            before.

        keepdims : bool, optional
            If this is set to True, the axes which are reduced are left
            in the result as dimensions with size one. With this option,
            the result will broadcast correctly against the input tensor.

        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        mean_along_axis : Tensor
            A Tensor with the same shape as `self`, with the specified
            axis/axes removed. If `self` is a 0-d tensor, or if `axis` is None,
            a 0-dim Tensor is returned.
        """
        return Tensor._op(
            Mean,
            self,
            op_kwargs={"axis": axis, "keepdims": keepdims},
            constant=constant,
        )

    def std(
        self,
        axis: Optional[Union[int, Tuple[int, ...]]] = None,
        ddof: int = 0,
        keepdims: bool = False,
        *,
        constant: Optional[bool] = None,
    ) -> "Tensor":
        """
        Compute the standard deviation along the specified axis.

        Returns the standard deviation of the array elements, a measure of the
        spread of a distribution. The standard deviation is computed for the
        flattened array by default, otherwise over the specified axis.

        Parameters
        ----------
        axis : Optional[Union[int, Tuple[int, ...]]]
            Axis or axes along which the standard deviation is computed. The
            default is to compute the standard deviation of the flattened array.

        ddof : int, optional (default=0)
            "Delta Degrees of Freedom": the divisor used in the calculation is
            ``N - ddof``, where ``N`` represents the number of elements. By
            default `ddof` is zero.

        keepdims : bool, optional (default=False)
            If this is set to True, the axes which are reduced are left in the
            result as dimensions with size one. With this option, the result
            will broadcast correctly against the input array.

        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        std : mygrad.Tensor

        Notes
        -----
        The variance is the average of the squared deviations from the mean,
        i.e., ``var = mean(abs(x - x.mean())**2)``.

        The mean is normally calculated as ``x.sum() / N``, where ``N = len(x)``.
        If, however, `ddof` is specified, the divisor ``N - ddof`` is used
        instead.  In standard statistical practice, ``ddof=1`` provides an
        unbiased estimator of the variance of a hypothetical infinite population.
        ``ddof=0`` provides a maximum likelihood estimate of the variance for
        normally distributed variables.
        """
        return Tensor._op(
            StdDev,
            self,
            op_kwargs={"axis": axis, "keepdims": keepdims, "ddof": ddof},
            constant=constant,
        )

    def var(
        self,
        axis: Optional[Union[int, Tuple[int, ...]]] = None,
        ddof: int = 0,
        keepdims: bool = False,
        *,
        constant: Optional[bool] = None,
    ) -> "Tensor":
        """
        Compute the variance along the specified axis.

        Returns the variance of the array elements, a measure of the spread of
        a distribution. The variance is computed for the flattened array by
        default, otherwise over the specified axis.

        Parameters
        ----------
        axis : Optional[int, Tuple[int, ...]]
            Axis or axes along which the variance is computed. The default is
            to compute the variance of the flattened array.

        ddof : int, optional (default=0)
            "Delta Degrees of Freedom": the divisor used in the calculation is
            ``N - ddof``, where ``N`` represents the number of elements. By
            default `ddof` is zero.

        keepdims : bool, optional (default=False)
            If this is set to True, the axes which are reduced are left in the
            result as dimensions with size one. With this option, the result
            will broadcast correctly against the input array.

        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        variance : mygrad.Tensor

        Notes
        -----
        The variance is the average of the squared deviations from the mean,
        i.e., ``var = mean(abs(x - x.mean())**2)``.

        The mean is normally calculated as ``x.sum() / N``, where ``N = len(x)``.
        If, however, `ddof` is specified, the divisor ``N - ddof`` is used
        instead.  In standard statistical practice, ``ddof=1`` provides an
        unbiased estimator of the variance of a hypothetical infinite population.
        ``ddof=0`` provides a maximum likelihood estimate of the variance for
        normally distributed variables.
        """
        return Tensor._op(
            Variance,
            self,
            op_kwargs={"axis": axis, "keepdims": keepdims, "ddof": ddof},
            constant=constant,
        )

    def max(
        self,
        axis: Optional[Union[int, Tuple[int, ...]]] = None,
        keepdims: bool = False,
        *,
        constant: Optional[bool] = None,
    ) -> "Tensor":
        """
        Return the maximum of a tensor or maximum along its axes.

        Parameters
        ----------
        axis : Optional[int, Tuple[int, ...]]
            Axis or axes along which to operate. By default, flattened input is
            used.

        keepdims : bool, optional
            If this is set to True, the axes which are reduced are left
            in the result as dimensions with size one. With this option,
            the result will broadcast correctly against the original `arr`.

        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        max : mygrad.Tensor
            Maximum of `a`. If `axis` is None, the result is a 0-D tensor.

        Examples
        --------
        >>> import mygrad as mg
        >>> import numpy as np
        >>> a = mg.arange(4).reshape((2,2))
        >>> a
        Tensor([[0, 1],
                [2, 3]])
        >>> mg.amax(a)           # Maximum of the flattened array
        Tensor(3)
        >>> mg.amax(a, axis=0)   # Maxima along the first axis
        Tensor([2, 3])
        >>> mg.amax(a, axis=1)   # Maxima along the second axis
        Tensor([1, 3])
        >>> b = mg.arange(5, dtype=float)
        >>> b[2] = np.NaN
        >>> mg.amax(b)
        Tensor(nan)
        """
        return Tensor._op(
            Max,
            self,
            op_kwargs={"axis": axis, "keepdims": keepdims, "dtype": _NoValue},
            constant=constant,
        )

    def min(
        self,
        axis: Optional[Union[int, Tuple[int, ...]]] = None,
        keepdims: bool = False,
        *,
        constant: Optional[bool] = None,
    ) -> "Tensor":
        """
        Return the minimum of a tensor or minimum along its axes.

        Parameters
        ----------
        axis : Optional[int, Tuple[int, ...]]
            Axis or axes along which to operate. By default, flattened input is
            used.

        keepdims : bool, optional
            If this is set to True, the axes which are reduced are left
            in the result as dimensions with size one. With this option,
            the result will broadcast correctly against the original `arr`.

        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        min : mygrad.Tensor
            Minimum of `a`. If `axis` is None, the result is a 0-D tensor.

        Examples
        --------
        >>> import mygrad as mg
        >>> import numpy as np
        >>> a = mg.arange(4).reshape((2,2))
        >>> a
        Tensor([[0, 1],
                [2, 3]])
        >>> mg.amin(a)           # Minimum of the flattened array
        Tensor(0)
        >>> mg.amin(a, axis=0)   # Minima along the first axis
        Tensor([0, 1])
        >>> mg.amin(a, axis=1)   # Minima along the second axis
        Tensor([0, 2])
        >>> b = mg.arange(5, dtype=float)
        >>> b[2] = np.NaN
        >>> mg.amin(b)
        Tensor(nan)
        """
        return Tensor._op(
            Min,
            self,
            op_kwargs={"axis": axis, "keepdims": keepdims, "dtype": _NoValue},
            constant=constant,
        )

    def swapaxes(
        self, axis1: int, axis2: int, *, constant: Optional[bool] = None
    ) -> "Tensor":
        """Interchange two axes of a tensor.

        Parameters
        ----------
        axis1 : int
            First axis.

        axis2 : int
            Second axis.

        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        mygrad.Tensor
        """
        return Tensor._op(SwapAxes, self, op_args=(axis1, axis2), constant=constant)

    def transpose(
        self: ArrayLike, *axes: int, constant: Optional[bool] = None
    ) -> "Tensor":
        """Permute the dimensions of a tensor.

        Parameters
        ----------
        axes : int
            By default, reverse the dimensions, otherwise permute the axes
            according to the values given.

        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        mygrad.Tensor
            `a` with its axes permuted.  A new tensor is returned.

        Examples
        --------
        >>> import mygrad as mg
        >>> a = mg.tensor([[1, 2], [3, 4]])
        >>> a
        Tensor([[1, 2],
                [3, 4]])
        >>> a.transpose()
        Tensor([[1, 3],
                [2, 4]])
        >>> a.transpose((1, 0))
        Tensor([[1, 3],
                [2, 4]])
        >>> a.transpose(1, 0)
        Tensor([[1, 3],
                [2, 4]])
        """
        if not axes:
            axes = None
        elif hasattr(axes[0], "__iter__") or axes[0] is None:
            if len(axes) > 1:
                raise TypeError(
                    f"'{type(axes[0])}' object cannot be interpreted as an integer"
                )
            axes = axes[0]
        return Tensor._op(Transpose, self, op_args=(axes,), constant=constant)

    def moveaxis(
        self,
        source: Union[int, Tuple[int, ...]],
        destination: Union[int, Tuple[int, ...]],
        *,
        constant: Optional[bool] = None,
    ) -> "Tensor":
        """Move axes of a tensor to new positions. Other axes remain in their
        original order.

        Parameters
        ----------
        source : Union[int, Sequence[int]]
            Original positions of the axes to move. These must be unique.

        destination : Union[int, Sequence[int]]
            Destination positions for each of the original axes. These must
            also be unique.

        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        result : mygrad.Tensor
            Array with moved axes. This array is a view of the input array.
        """
        return Tensor._op(
            MoveAxis, self, op_args=(source, destination), constant=constant
        )

    def squeeze(
        self,
        axis: Optional[Union[int, Tuple[int, ...]]] = None,
        *,
        constant: Optional[bool] = None,
    ) -> "Tensor":
        """
        Remove single-dimensional entries from the shape of a tensor.

        This docstring was adapted from ``numpy.squeeze``

        Parameters
        ----------
        axis : Optional[int, Tuple[int, ...]]
            Selects a subset of the single-dimensional entries in the
            shape. If an axis is selected with shape entry greater than
            one, an error is raised.

        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        mygrad.Tensor

        Raises
        ------
        ValueError
            If ``axis`` is not ``None``, and an axis being squeezed is not of
            length 1
        """
        return Tensor._op(Squeeze, self, op_args=(axis,), constant=constant)

    def ravel(self, *, constant: Optional[bool] = None) -> "Tensor":
        """
        Flattens contents of a tensor into a contiguous 1-D array. A copy is
        made only if needed.

        This docstring was adapted from ``numpy.ravel``.

        Parameters
        ----------
        constant : Optional[bool]
            If ``True``, this tensor is treated as a constant, and thus does not
            facilitate back propagation (i.e. ``constant.grad`` will always return
            ``None``).

            Defaults to ``False`` for float-type data.
            Defaults to ``True`` for integer-type data.

            Integer-type tensors must be constant.

        Returns
        -------
        mygrad.Tensor

        Notes
        -----
        ``ravel`` utilizes C-ordering, meaning that it reads & writes elements
        using C-like index ordering; the last axis index changing fastest, and,
        proceeding in reverse order, the first axis index changing slowest.
        """
        return Tensor._op(Ravel, self, constant=constant)

    def argmax(
        self, axis: Optional[int] = None, out: Optional[np.ndarray] = None
    ) -> np.ndarray:
        """Returns the indices of the maximum values along an axis.

        Parameters
        ----------
        axis : int, optional
            By default, the index is into the flattened array, otherwise
            along the specified axis.

        out : numpy.array, optional
            If provided, the result will be inserted into this array. It should
            be of the appropriate shape and dtype.

        Returns
        -------
        numpy.ndarray[int]
        """
        return np.argmax(self.data, axis, out)

    def argmin(
        self, axis: Optional[int] = None, out: Optional[np.ndarray] = None
    ) -> np.ndarray:
        """Returns the indices of the minimum values along an axis.

        Parameters
        ----------
        axis : int, optional
            By default, the index is into the flattened array, otherwise
            along the specified axis.

        out : numpy.array, optional
            If provided, the result will be inserted into this array. It should
            be of the appropriate shape and dtype.

        Returns
        -------
        numpy.ndarray[int]
        """
        return np.argmin(self.data, axis, out)

    def any(
        self,
        axis: Optional[Union[int, Tuple[int, ...]]] = None,
        out: Optional[np.ndarray] = None,
        keepdims: bool = False,
    ) -> np.ndarray:
        """Test whether any array or Tensor element along a given axis evaluates
        to True.

        Returns single boolean if `axis` is ``None``

        This documentation was adapted from ``numpy.any``

        Parameters
        ----------
        axis : None or int or tuple of ints, optional
            Axis or axes along which a logical OR reduction is performed.
            The default (``axis=None``) is to perform a logical OR over all
            the dimensions of the input array. `axis` may be negative, in
            which case it counts from the last to the first axis.

            If this is a tuple of ints, a reduction is performed on multiple
            axes, instead of a single axis or all the axes as before.

        out : ndarray, optional
            Alternate output array in which to place the result.
            It must have the same shape as the expected output and its
            type is preserved (e.g., if it is of type float, then it will
            remain so, returning 1.0 for True and 0.0 for False, regardless
            of the type of `a`). See `ufuncs-output-type` for more details.

        keepdims : bool, optional
            If this is set to True, the axes which are reduced are left
            in the result as dimensions with size one. With this option,
            the result will broadcast correctly against the input array.

            If the default value is passed, then `keepdims` will not be
            passed through to the `any` method of sub-classes of
            `ndarray`, however any non-default value will be.  If the
            sub-class' method does not implement `keepdims` any
            exceptions will be raised.

        Returns
        -------
        any : bool or ndarray
            A new boolean or `ndarray` is returned unless `out` is specified,
            in which case a reference to `out` is returned.

        See Also
        --------
        Tensor.any : equivalent method
        """
        return np.any(self.data, axis=axis, out=out, keepdims=keepdims)

    def clip(
        self,
        a_min: ArrayLike,
        a_max: ArrayLike,
        out: Optional[Union[np.ndarray, "Tensor"]] = None,
        *,
        constant: Optional[bool] = None,
    ) -> "Tensor":  # pragma: no cover
        """Clip (limit) the values in an array.

        Given an interval, values outside the interval are clipped to
        the interval edges.  For example, if an interval of ``[0, 1]``
        is specified, values smaller than 0 become 0, and values larger
        than 1 become 1.

        Equivalent to ``mg.minimum(a_max, mg.maximum(a, a_min))``.

        No check is performed to ensure ``a_min < a_max``.

        This docstring was adapted from that of `numpy.clip`

        Parameters
        ----------
        a_min : Optional[float, ArrayLike]
            Minimum value. If `None`, clipping is not performed on lower
            interval edge. Not more than one of `a_min` and `a_max` may be
            `None`.

        a_max : Optional[float, ArrayLike]
            Maximum value. If `None`, clipping is not performed on upper
            interval edge. Not more than one of `a_min` and `a_max` may be
            `None`. If `a_min` or `a_max` are ArrayLike, then the three
            arrays will be broadcasted to match their shapes.

        out : Optional[Union[ndarray, Tensor]]
            A location into which the result is stored. If provided, it must
            have a shape that the inputs broadcast to. If not provided or None,
            a freshly-allocated tensor is returned.

        constant : bool, optional(default=False)
            If ``True``, the returned tensor is a constant (it
            does not back-propagate a gradient)

        Returns
        -------
        Tensor
            A tensor with the elements of `a`, but where values
            < `a_min` are replaced with `a_min`, and those > `a_max`
            with `a_max`.

        Examples
        --------
        >>> import mygrad as mg
        >>> a = mg.arange(10)
        >>> a
        Tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
        >>> a.clip(1, 8)
        Tensor([1, 1, 2, 3, 4, 5, 6, 7, 8, 8])
        >>> a.clip([3, 4, 1, 1, 1, 4, 4, 4, 4, 4], 8)
        Tensor([3, 4, 2, 3, 4, 5, 6, 7, 8, 8])
        """
        # implementation is set in mygrad.__init__
        ...
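
    # --- Illustrative note (not part of mygrad's source) ----------------------
    # The reductions above (`sum`, `mean`, `std`, `var`, `max`, `min`, ...)
    # return Tensors and back-propagate, whereas `argmax`, `argmin`, and `any`
    # return plain numpy results and so do not participate in the graph.
    # Doctest-style sketch (exact scalar types may vary by platform):
    #
    #   >>> import mygrad as mg
    #   >>> x = mg.tensor([[1.0, 2.0], [3.0, 4.0]])
    #   >>> x.mean().backward()
    #   >>> x.grad                   # d(mean)/dx_ij = 1/4
    #   array([[0.25, 0.25],
    #          [0.25, 0.25]])
    #   >>> type(x.argmax())         # numpy result; no gradient tracking
    #   <class 'numpy.int64'>
    # ---------------------------------------------------------------------------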