Source code for gristmill.optimize

"""Optimizer for the contraction computations."""

import collections
import enum
import heapq
import itertools
import typing
import warnings

from drudge import TensorDef, prod_, Term, Range
from sympy import (
    Integer, Symbol, Expr, IndexedBase, Mul, Indexed, sympify, primitive, Wild
)
from sympy.utilities.iterables import multiset_partitions

from .utils import get_cost_key, add_costs, get_total_size, DSF


#
#  The public driver
#  -----------------
#


[docs]class Strategy(enum.Enum):
    """The optimization strategy for tensor contractions.

    This enumeration type gives possible options for the optimization strategy
    for tensor contractions.  Supported values includes,

    ``GREEDY``
        The contraction will be optimized greedily.  This should only be used
        for large inputs where the other strategies cannot finish within a
        reasonable time.

    ``BEST``
        The global minimum of each tensor contraction will be found by the
        advanced algorithm in gristmill.  And only the optimal contraction(s)
        will be kept for the summation optimization.

    ``SEARCHED``
        The same strategy as ``BEST`` will be attempted for the optimization of
        contractions.  But all evaluations searched in the optimization process
        will be kept and considered in subsequent summation optimizations.

    ``ALL``
        All possible contraction sequences will be considered for all
        contractions.  This can be extremely slow.  But it might be helpful for
        manageable problems.

    """

    GREEDY = 0
    BEST = 1
    SEARCHED = 2
    ALL = 3


[docs]def optimize(
        computs: typing.Iterable[TensorDef], substs=None, interm_fmt='tau^{}',
        simplify=True, strategy=Strategy.SEARCHED
) -> typing.List[TensorDef]:
    """Optimize the valuation of the given tensor contractions.

    This function will transform the given computations, given as tensor
    definitions, into another list computations mathematically equivalent to the
    given computation while requiring less floating-point operations (FLOPs).

    Parameters
    ----------

    computs
        The computations, can be given as an iterable of tensor definitions.

    substs
        A dictionary for making substitutions inside the sizes of ranges.  All
        the ranges need to have size in at most one undetermined variable after
        the substitution so that they can be totally ordered.

    interm_fmt
        The format for the names of the intermediates.

    simplify
        If the input is going to be simplified before processing.  It can be
        disabled when the input is already simplified.

    strategy
        The optimization strategy, as explained in :py:class:`Strategy`.

    """

    substs = {} if substs is None else substs

    if simplify:
        computs = [i.simplify() for i in computs]
    else:
        computs = list(computs)

    if not isinstance(strategy, Strategy):
        raise TypeError('Invalid optimization strategy', strategy)

    opt = _Optimizer(
        computs, substs=substs, interm_fmt=interm_fmt, strategy=strategy
    )

    return opt.optimize()


#
# The internal optimization engine
# --------------------------------
#
# Internal small type definitions
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#


#
# Small type definitions.
#


_Grain = collections.namedtuple('_Grain', [
    'base',
    'exts',
    'terms'
])

#
# The information on collecting a collectible.
#
# Interpretation, after the substitutions given in ``substs``, the ``lr`` factor
# in the evaluation ``eval_`` will be turned into ``coeff`` times the actual
# collectible.
#

_CollectInfo = collections.namedtuple('_CollectInfo', [
    'eval_',
    'lr',
    'coeff',
    'substs',
    'ranges',
    'add_cost'
])

_Ranges = collections.namedtuple('_Ranges', [
    'involved_exts',
    'sums',
    'other_exts'
])

_Collectible = typing.Tuple[Term, ...]

_CollectInfos = typing.Dict[int, _CollectInfo]

_Collectibles = typing.Dict[_Collectible, _CollectInfos]

_Part = collections.namedtuple('_Part', [
    'ref',
    'node'
])


#
# Core evaluation DAG nodes.
#


class _EvalNode:
    """A node in the evaluation graph.
    """

    def __init__(self, base, exts):
        """Initialize the evaluation node.
        """

        self.base = base
        self.exts = exts

        self.evals = []  # type: typing.List[_EvalNode]
        self.total_cost = None
        self.n_refs = 0
        self.generated = False

    def get_substs(self, indices):
        """Get substitutions and symbols requiring exclusion before indexing.
        """

        substs = {}
        excl = set()

        assert len(indices) == len(self.exts)
        for i, j in zip(indices, self.exts):
            dumm = j[0]
            substs[dumm] = i
            excl.add(dumm)
            excl |= i.atoms(Symbol)
            continue

        return substs, excl


class _Sum(_EvalNode):
    """Sum nodes in the evaluation graph."""

    def __init__(self, base, exts, sum_terms):
        """Initialize the node."""
        super().__init__(base, exts)
        self.sum_terms = sum_terms

    def __repr__(self):
        """Form a representation string for the node."""
        return '_Sum(base={}, exts={}, sum_terms={})'.format(
            repr(self.base), repr(self.exts), repr(self.sum_terms)
        )


class _Prod(_EvalNode):
    """Product nodes in the evaluation graph.
    """

    def __init__(self, base, exts, sums, coeff, factors):
        """Initialize the node."""
        super().__init__(base, exts)
        self.sums = sums
        self.coeff = coeff
        self.factors = factors

    def __repr__(self):
        """Form a representation string for the node."""
        return '_Prod(base={}, exts={}, coeff={}, factors={})'.format(
            repr(self.base), repr(self.exts),
            repr(self.coeff), repr(self.factors)
        )


#
# Core optimizer class
# ~~~~~~~~~~~~~~~~~~~~
#


class _Optimizer:
    """Optimizer for tensor contraction computations.

    This internal optimizer can only be used once for one set of input.
    """

    def __init__(self, computs, substs, interm_fmt, strategy):
        """Initialize the optimizer."""

        self._prepare_grist(computs, substs)

        self._interm_fmt = interm_fmt
        self._strategy = strategy

        self._next_internal_idx = 0

        self._interms = {}
        self._interms_canon = {}

        self._res = None

    def optimize(self):
        """Optimize the evaluation of the given computations.
        """

        if self._res is not None:
            return self._res

        res_nodes = [self._form_node(i) for i in self._grist]
        for i in res_nodes:
            self._optimize(i)
            continue

        self._res = self._linearize(res_nodes)
        return self._res

    #
    # User input pre-processing.
    #

    def _prepare_grist(self, computs, substs):
        """Prepare tensor definitions for optimization.
        """

        self._grist = []
        self._drudge = None
        self._range_var = None  # The only variable for range sizes.
        self._excl = set()
        self._input_ranges = {}  # Substituted range to original range.

        # Form pre-grist, basically everything is set except the dummy variables
        # for external indices and summations.
        pre_grist = [
            self._form_pre_grist(comput, substs) for comput in computs
        ]

        # Finalize grist formation by resetting the dummies.
        self._dumms = {
            k: self._drudge.dumms.value[v]
            for k, v in self._input_ranges.items()
        }

        self._grist = [self._reset_dumms(grain) for grain in pre_grist]

        return

    def _form_pre_grist(self, comput, substs):
        """Form grist from a given computation.
        """

        curr_drudge = comput.rhs.drudge
        if self._drudge is None:
            self._drudge = curr_drudge
        elif self._drudge is not curr_drudge:
            raise ValueError(
                'Invalid computations to optimize, containing two drudges',
                (self._drudge, curr_drudge)
            )
        else:
            pass

        # Externals processing.
        exts = self._proc_sums(comput.exts, substs)
        ext_symbs = {i for i, _ in exts}

        # Terms processing.
        terms = []
        for term in comput.rhs_terms:
            if not term.is_scalar:
                raise ValueError(
                    'Invalid term to optimize', term, 'expecting scalar'
                )
            sums = self._proc_sums(term.sums, substs)
            amp = term.amp

            # Add the true free symbols to the exclusion set.
            self._excl |= term.free_vars - ext_symbs
            terms.append(Term(sums, amp, ()))

            continue

        return _Grain(base=comput.base, exts=exts, terms=terms)

    def _proc_sums(self, sums, substs):
        """Process a summation list.

        The ranges will be replaced with substitution sizes.  Relevant members
        of the object will also be updated.  User error will also be reported.
        """

        res = []
        for symb, range_ in sums:

            if not range_.bounded:
                raise ValueError(
                    'Invalid range for optimization', range_,
                    'expecting explicit bound'
                )
            lower, upper = [
                self._check_range_var(i.xreplace(substs), range_)
                for i in [range_.lower, range_.upper]
            ]

            new_range = Range(range_.label, lower=lower, upper=upper)
            if new_range not in self._input_ranges:
                self._input_ranges[new_range] = range_
            elif range_ != self._input_ranges[new_range]:
                raise ValueError(
                    'Invalid ranges', (range_, self._input_ranges[new_range]),
                    'duplicated labels'
                )
            else:
                pass

            res.append((symb, new_range))
            continue

        return tuple(res)

    def _check_range_var(self, expr, range_) -> Expr:
        """Check size expression for valid symbol presence."""

        range_vars = expr.atoms(Symbol)
        if len(range_vars) == 0:
            pass
        elif len(range_vars) == 1:

            range_var = range_vars.pop()
            if self._range_var is None:
                self._range_var = range_var
            elif self._range_var != range_var:
                raise ValueError(
                    'Invalid range', range_, 'unexpected symbol',
                    range_var, 'conflicting with', self._range_var
                )
            else:
                pass
        else:
            raise ValueError(
                'Invalid range', range_, 'containing multiple symbols',
                range_vars
            )

        return expr

    def _reset_dumms(self, grain):
        """Reset the dummies in a grain."""

        exts, ext_substs, dummbegs = Term.reset_sums(
            grain.exts, self._dumms, excl=self._excl
        )
        terms = []
        for term in grain.terms:
            sums, curr_substs, _ = Term.reset_sums(
                term.sums, self._dumms,
                dummbegs=dict(dummbegs), excl=self._excl
            )
            curr_substs.update(ext_substs)
            terms.append(term.map(
                lambda x: x.xreplace(curr_substs), sums=sums
            ))
            continue

        return _Grain(base=grain.base, exts=exts, terms=terms)

    #
    # Optimization result post-processing.
    #

    def _linearize(
            self, optimized: typing.Sequence[_EvalNode]
    ) -> typing.List[TensorDef]:
        """Linearize optimized forms of the evaluation.
        """

        for node in optimized:
            self._set_n_refs(node)
            continue

        interms = []
        res = []
        for node in optimized:
            curr = self._linearize_node(node, interms, keep=True)
            assert curr is not None
            res.append(curr)
            continue

        return self._finalize(itertools.chain(interms, res))

    def _set_n_refs(self, node: _EvalNode):
        """Set reference counts from an evaluation node.

        It is always the first evaluation that is going to be used, all rest
        will be removed to avoid further complications.
        """

        if len(node.evals) == 0:
            self._optimize(node)
        assert len(node.evals) > 0
        del node.evals[1:]
        eval_ = node.evals[0]

        if isinstance(eval_, _Prod):
            refs = [i for i in eval_.factors if self._is_interm_ref(i)]
        elif isinstance(eval_, _Sum):
            refs = eval_.sum_terms
        else:
            assert False

        for i in refs:
            _, ref = self._parse_interm_ref(i)
            dep = ref.base if isinstance(ref, Indexed) else ref
            dep_node = self._interms[dep]
            dep_node.n_refs += 1
            self._set_n_refs(dep_node)
            continue

        return

    def _linearize_node(self, node: _EvalNode, res: list, keep=False):
        """Linearize evaluation rooted in the given node into the result.

        If keep if set to True, the evaluation of the given node will not be
        appended to the result list.
        """

        if node.generated:
            return None

        def_, deps = self._form_def(node)
        for i in deps:
            self._linearize_node(self._interms[i], res)
            continue

        node.generated = True

        if not keep:
            res.append(def_)

        return def_

    def _form_def(self, node: _EvalNode):
        """Form the final definition of an evaluation node."""

        assert len(node.evals) == 1

        if isinstance(node, _Prod):
            return self._form_prod_def(node)
        elif isinstance(node, _Sum):
            return self._form_sum_def(node)
        else:
            assert False

    def _form_prod_def(self, node: _Prod):
        """Form the final definition of a product evaluation node."""

        exts = node.exts
        eval_ = node.evals[0]
        assert isinstance(eval_, _Prod)
        term, deps = self._form_prod_def_term(eval_)
        return _Grain(base=node.base, exts=exts, terms=[term]), deps

    def _form_prod_def_term(self, eval_: _Prod):
        """Form the term in the final definition of a product evaluation node.
        """

        amp = eval_.coeff

        deps = []
        for factor in eval_.factors:

            if self._is_interm_ref(factor):
                dep = factor.base if isinstance(factor, Indexed) else factor
                interm = self._interms[dep]
                if self._is_input(interm):
                    # Inline trivial reference to an input.
                    content = self._get_def(factor)
                    assert len(content) == 1
                    amp *= content[0].amp
                else:
                    deps.append(dep)
                    amp *= factor
            else:
                amp *= factor
        return Term(eval_.sums, amp, ()), deps

    def _form_sum_def(self, node: _Sum):
        """Form the final definition of a sum evaluation node."""

        exts = node.exts
        terms = []
        deps = []

        eval_ = node.evals[0]
        assert isinstance(eval_, _Sum)

        sum_terms = []
        self._inline_sum_terms(eval_.sum_terms, sum_terms)
        for term in sum_terms:

            coeff, ref = self._parse_interm_ref(term)

            # Sum term are guaranteed to be formed from references to products,
            # never directly written in terms of input.
            term_base = ref.base if isinstance(ref, Indexed) else ref
            term_node = self._interms[term_base]

            if term_node.n_refs == 1 or self._is_input(term_node):
                # Inline intermediates only used here and simple input
                # references.

                eval_ = term_node.evals[0]
                assert isinstance(eval_, _Prod)
                indices = ref.indices if isinstance(ref, Indexed) else ()
                term = self._index_prod(eval_, indices)[0]
                factors, term_coeff = term.get_amp_factors(self._interms)

                # Switch back to evaluation node for using the facilities for
                # product nodes.
                new_term, term_deps = self._form_prod_def_term(_Prod(
                    term_node.base, exts, term.sums, coeff * term_coeff, factors
                ))

                terms.append(new_term)
                deps.extend(term_deps)

            else:
                terms.append(Term(
                    (), term, ()
                ))
                deps.append(term_base)
            continue

        return _Grain(base=node.base, exts=exts, terms=terms), deps

    def _inline_sum_terms(self, sum_terms, res):
        """Inline the summation terms from single-reference terms."""

        for sum_term in sum_terms:
            coeff, ref = self._parse_interm_ref(sum_term)
            node = self._interms[
                ref.base if isinstance(ref, Indexed) else ref
            ]
            assert len(node.evals) > 0
            eval_ = node.evals[0]

            if_inline = isinstance(eval_, _Sum) and (
                node.n_refs == 1 or len(eval_.sum_terms) == 1
            )
            if if_inline:
                if len(node.exts) == 0:
                    substs = None
                else:
                    substs = {
                        i[0]: j for i, j in zip(eval_.exts, ref.indices)
                    }

                proced_sum_terms = [
                    (
                        i.xreplace(substs) if substs is not None else sum_term
                    ) * coeff for i in eval_.sum_terms
                ]
                self._inline_sum_terms(proced_sum_terms, res)
                continue
            else:
                res.append(sum_term)
            continue

        return

    def _is_input(self, node: _EvalNode):
        """Test if a product node is just a trivial reference to an input."""
        if isinstance(node, _Prod):
            return len(node.sums) == 0 and len(node.factors) == 1 and (
                not self._is_interm_ref(node.factors[0])
            )
        else:
            return False

    def _finalize(
            self, computs: typing.Iterable[_Grain]
    ) -> typing.List[TensorDef]:
        """Finalize the linearization result.

        Things will be cast to drudge tensor definitions, with intermediates
        holding names formed from the format given by user.
        """

        next_idx = 0

        substs = {}  # For normal substitution of bases.
        repls = []  # For removed shallow intermediates

        def proc_amp(amp):
            """Process the amplitude by making the found substitutions."""
            for i in reversed(repls):
                amp = amp.replace(*i)
                continue
            return amp.xreplace(substs)

        res = []
        for comput in computs:
            base = comput.base
            exts = tuple((s, self._input_ranges[r]) for s, r in comput.exts)
            terms = [
                i.map(proc_amp, sums=tuple(
                    (s, self._input_ranges[r]) for s, r in i.sums
                )) for i in comput.terms
            ]

            if base in self._interms:

                if len(terms) == 1 and len(terms[0].sums) == 0:
                    # Remove shallow intermediates.  The saving might be too
                    # modest to justify the additional memory consumption.
                    repl_lhs = base[tuple(
                        _WILD_FACTORY[i] for i, _ in enumerate(exts)
                    )] if len(exts) > 0 else base
                    repl_rhs = proc_amp(terms[0].amp.xreplace(
                        {v[0]: _WILD_FACTORY[i] for i, v in enumerate(exts)}
                    ))
                    repls.append((repl_lhs, repl_rhs))
                    continue  # No new intermediate added.

                final_base = type(base)(self._interm_fmt.format(next_idx))
                next_idx += 1
                substs[base] = final_base
            else:
                final_base = base

            res.append(TensorDef(
                final_base, exts, self._drudge.create_tensor(terms)
            ))
            continue

        return res

    #
    # Internal support utilities.
    #

    def _get_next_internal(self, symbol=False):
        """Get the base or symbol for the next internal intermediate.
        """
        idx = self._next_internal_idx
        self._next_internal_idx += 1
        cls = Symbol if symbol else IndexedBase
        return cls('gristmillInternalIntermediate{}'.format(idx))

    @staticmethod
    def _write_in_orig_ranges(sums):
        """Write the summations in terms of undecorated bare ranges.

        The labels in the ranges are assumed to be decorated.
        """
        return tuple(
            (i, j.replace_label(j.label[0])) for i, j in sums
        )

    def _canon_terms(self, new_sums, terms):
        """Form a canonical label for a list of terms.

        The new summation list is prepended to the summation list of all terms.
        The coefficient ahead of the canonical form is returned before the
        canonical form.  And the permuted new summation list is also returned
        after the canonical form.  Note that this list contains the original
        dummies given in the new summation list, while the terms has reset new
        dummies.

        Note that the ranges in the new summation list are assumed to be
        decorated with labels earlier than _SUMMED.  In the result, they are
        still in decorated forms and are guaranteed to be permuted in the same
        way for all given terms.  The summations from the terms will be
        internally decorated but written in bare ranges in the final result.

        Note that this is definitely a poor man's version of canonicalization of
        multi-term tensor definitions with external indices.  A lot of cases
        cannot be handled well.  Hopefully it can be replaced with a systematic
        treatment some day in the future.

        """

        new_dumms = {i for i, _ in new_sums}
        coeffs = []

        candidates = collections.defaultdict(list)
        for term in terms:
            term, canon_sums = self._canon_term(new_sums, term)

            factors, coeff = term.amp_factors
            coeffs.append(coeff)

            candidates[
                term.map(lambda x: prod_(factors))
            ].append(canon_sums)
            continue

        # Poor man's canonicalization of external indices.
        #
        # This algorithm is not guaranteed to work.  Here we just choose an
        # ordering of the external indices that is as safe as possible.  But
        # certainly it is not guaranteed to work for all cases.
        #
        # TODO: Fix it!

        chosen = min(candidates.items(), key=lambda x: (
            len(x[1]), -len(x[0].amp.atoms(Symbol) & new_dumms),
            x[0].sort_key
        ))

        canon_new_sums = set(chosen[1])
        if len(canon_new_sums) > 1:
            warnings.warn(
                'Internal deficiency: '
                'summation intermediate may not be fully canonicalized'
            )
        # This could also fail when the chosen term has symmetry among the new
        # summations not present in any other term.  This can be hard to check.

        canon_new_sum = canon_new_sums.pop()
        preferred = chosen[0].amp_factors[1]
        canon_coeff = _get_canon_coeff(coeffs, preferred)

        res_terms = []
        for term in terms:
            canon_term, _ = self._canon_term(canon_new_sum, term, fix_new=True)
            # TODO: Add support for complex conjugation.
            res_terms.append(canon_term.map(lambda x: x / canon_coeff))
            continue

        return canon_coeff, tuple(
            sorted(res_terms, key=lambda x: x.sort_key)
        ), canon_new_sum

    def _canon_term(self, new_sums, term, fix_new=False):
        """Canonicalize a single term.

        Internal method for _canon_terms, not supposed to be directly called.
        """

        term = Term(tuple(itertools.chain(
            (
                (v[0], v[1].replace_label((v[1].label[0], _EXT, i)))
                for i, v in enumerate(new_sums)
            ) if fix_new else new_sums,
            (
                (i, j.replace_label((j.label, _SUMMED)))
                for i, j in term.sums
            )
        )), term.amp, ())
        canoned = term.canon(symms=self._drudge.symms.value)

        canon_sums = canoned.sums
        canon_orig_sums = self._write_in_orig_ranges(canon_sums)

        dumm_reset, _ = canoned.map(
            lambda x: x, sums=canon_orig_sums
        ).reset_dumms(
            dumms=self._dumms, excl=self._excl
        )

        canon_new_sums = []
        term_new_sums = []
        term_sums = []
        i_new = 0
        for i, j in zip(dumm_reset.sums, canon_sums):
            if j[1].label[1] == _SUMMED:
                # Existing summations.
                term_sums.append(i)
            else:
                if fix_new:
                    assert j[0] == new_sums[i_new][0]
                    range_ = new_sums[i_new][1]
                else:
                    range_ = j[1]
                canon_new_sums.append((j[0], range_))
                term_new_sums.append((i[0], range_))
                i_new += 1
            continue

        return dumm_reset.map(lambda x: x, sums=tuple(itertools.chain(
            term_new_sums, term_sums
        ))), tuple(canon_new_sums)

    def _parse_interm_ref(self, sum_term: Expr):
        """Get the coefficient and pure intermediate reference in a reference.

        Despite being SymPy expressions, actually intermediate reference, for
        instance in a term in an summation node, is very rigid.
        """

        if isinstance(sum_term, Mul):
            args = sum_term.args
            assert len(args) == 2
            if self._is_interm_ref(args[1]):
                return args
            else:
                assert self._is_interm_ref(args[0])
                return args[1], args[0]
        else:
            return _UNITY, sum_term

    def _is_interm_ref(self, expr: Expr):
        """Test if an expression is a reference to an intermediate."""
        return (isinstance(expr, Indexed) and expr.base in self._interms) or (
            expr in self._interms
        )

    def _get_def(self, interm_ref: Expr) -> typing.List[Term]:
        """Get the definition of an intermediate reference.

        The intermediate reference need to be a pure intermediate reference
        without any factor.
        """

        if isinstance(interm_ref, Indexed):
            base = interm_ref.base
            indices = interm_ref.indices
        elif isinstance(interm_ref, Symbol):
            base = interm_ref
            indices = ()
        else:
            raise TypeError('Invalid intermediate reference', interm_ref)

        if base not in self._interms:
            raise ValueError('Invalid intermediate base', base)

        node = self._interms[base]

        if isinstance(node, _Sum):
            return self._index_sum(node, indices)
        elif isinstance(node, _Prod):
            return self._index_prod(node, indices)
        else:
            assert False

    def _index_sum(self, node: _Sum, indices) -> typing.List[Term]:
        """Substitute the external indices in the sum node"""

        substs, _ = node.get_substs(indices)

        res = []
        for i in node.sum_terms:
            coeff, ref = self._parse_interm_ref(i.xreplace(substs))
            term = self._get_def(ref)[0].scale(coeff)
            res.append(term)

        return res

    def _index_prod(self, node: _Prod, indices) -> typing.List[Term]:
        """Substitute the external indices in the evaluation node."""

        substs, excl = node.get_substs(indices)

        # TODO: Add handling of sum intermediate reference in factors.
        term = Term(
            node.sums, node.coeff * prod_(node.factors), ()
        ).reset_dumms(
            self._dumms, excl=self._excl | excl
        )[0].map(lambda x: x.xreplace(substs))

        return [term]

    #
    # General optimization.
    #

    def _form_node(self, grain: _Grain):
        """Form an evaluation node from a tensor definition.
        """

        # We assume it is fully simplified and expanded by grist preparation.
        exts = grain.exts
        terms = grain.terms

        if len(terms) == 0:
            assert False  # Should be removed by grist preparation.
        else:
            return self._form_sum_from_terms(grain.base, exts, terms)

    def _optimize(self, node):
        """Optimize the evaluation of the given node.

        The evaluation methods will be filled with, possibly multiple, method of
        evaluations.
        """

        if len(node.evals) > 0:
            return node

        if isinstance(node, _Sum):
            return self._optimize_sum(node)
        elif isinstance(node, _Prod):
            return self._optimize_prod(node)
        else:
            assert False

    def _form_prod_interm(
            self, exts, sums, factors
    ) -> typing.Tuple[Expr, _EvalNode]:
        """Form a product intermediate.

        The factors are assumed to be all non-trivial factors needing
        processing.
        """

        decored_exts = tuple(
            (i, j.replace_label((j.label, _EXT)))
            for i, j in exts
        )
        n_exts = len(decored_exts)
        term = Term(tuple(sums), prod_(factors), ())

        coeff, key, canon_exts = self._canon_terms(
            decored_exts, [term]
        )
        assert len(key) == 1

        if key in self._interms_canon:
            base = self._interms_canon[key]
        else:
            base = self._get_next_internal(n_exts == 0)
            self._interms_canon[key] = base

            key_term = key[0]
            key_exts = self._write_in_orig_ranges(key_term.sums[:n_exts])
            key_sums = key_term.sums[n_exts:]
            key_factors, key_coeff = key_term.get_amp_factors(self._interms)
            interm = _Prod(
                base, key_exts, key_sums, key_coeff, key_factors
            )
            self._interms[base] = interm

        return coeff * base[tuple(
            i for i, _ in canon_exts
        )] if isinstance(base, IndexedBase) else base, self._interms[base]

    def _form_sum_interm(self, exts, terms) -> typing.Tuple[Expr, _EvalNode]:
        """Form a sum intermediate.
        """

        decored_exts = tuple(
            (i, j.replace_label((j.label, _EXT)))
            for i, j in exts
        )
        n_exts = len(decored_exts)

        coeff, canon_terms, canon_exts = self._canon_terms(decored_exts, terms)

        if canon_terms in self._interms_canon:
            base = self._interms_canon[canon_terms]
        else:
            base = self._get_next_internal(n_exts == 0)
            self._interms_canon[canon_terms] = base

            node_exts = None
            node_terms = []
            for term in canon_terms:
                term_exts = self._write_in_orig_ranges(term.sums[:n_exts])
                if node_exts is None:
                    node_exts = term_exts
                else:
                    assert node_exts == term_exts
                node_terms.append(term.map(
                    lambda x: x, sums=term.sums[n_exts:]
                ))
                continue

            node = self._form_sum_from_terms(base, node_exts, node_terms)
            self._interms[base] = node
            self._optimize(node)

        return coeff * base[tuple(
            i for i, _ in canon_exts
        )] if isinstance(base, IndexedBase) else base, self._interms[base]

    def _form_sum_from_terms(self, base, exts, terms):
        """Form a summation node for given the terms.

        No processing is done in this method.
        """
        sum_terms = []
        for term in terms:
            sums = term.sums
            factors, coeff = term.amp_factors
            interm_ref, _ = self._form_prod_interm(exts, sums, factors)
            sum_terms.append(interm_ref * coeff)
            continue

        return _Sum(base, exts, sum_terms)

    #
    # Sum optimization.
    #

    def _optimize_sum(self, sum_node: _Sum):
        """Optimize the summation node."""

        # We first optimize the common terms.
        exts = sum_node.exts
        terms, new_term_idxes = self._optimize_common_terms(sum_node)

        # Now we embark upon the heroic factorization.
        collectibles = collections.defaultdict(dict)  # type: _Collectibles
        while True:

            for idx in new_term_idxes:
                term = terms[idx]

                # Loop over collectibles the new term can offer.
                for i, j in self._find_collectibles(exts, term):
                    infos = collectibles[i]
                    if idx not in infos:
                        # The same term cannot provide the same collectible
                        # twice.
                        infos[idx] = j
                    continue

                continue
            new_term_idxes.clear()

            to_collect, infos, total_cost = self._choose_collectible(
                collectibles
            )
            if to_collect is None:
                break

            new_term_idx = self._collect(terms, infos, total_cost)
            new_term_idxes.append(new_term_idx)

            del collectibles[to_collect]
            for i in infos.keys():
                for j in collectibles.values():
                    if i in j:
                        del j[i]

            continue
        # End Main loop.

        rem_terms = [i for i in terms if i is not None]
        sum_node.evals = [_Sum(
            sum_node.base, sum_node.exts, rem_terms
        )]
        return

    def _optimize_common_terms(self, sum_node: _Sum) -> typing.Tuple[
        typing.List[Expr], typing.List[int]
    ]:
        """Perform optimization of common intermediate references.
        """

        exts_dict = dict(sum_node.exts)

        # Intermediate base -> (indices -> coefficient)
        #
        # This also gather terms with the same reference to deeper nodes.
        interm_refs = collections.defaultdict(
            lambda: collections.defaultdict(lambda: 0)
        )

        for term in sum_node.sum_terms:
            coeff, ref = self._parse_interm_ref(term)
            if isinstance(ref, Symbol):
                base = ref
                indices = ()
            elif isinstance(ref, Indexed):
                base = ref.base
                indices = ref.indices
            else:
                assert False

            interm_refs[base][indices] += coeff
            continue

        # Intermediate referenced only once goes to the result directly and wait
        # to be factored, others wait to be pulled and do not participate in
        # factorization.
        res_terms = []
        res_collectible_idxes = []
        # Indices, coeffs tuple -> base, coeff
        pull_info = collections.defaultdict(list)
        for k, v in interm_refs.items():

            if len(v) == 0:
                assert False
            elif len(v) == 1:
                res_collectible_idxes.append(len(res_terms))
                indices, coeff = v.popitem()
                res_terms.append(
                    (k[indices] if len(indices) > 0 else k) * coeff
                )
            else:
                # Here we use name for sorting directly, since here we cannot
                # have general expressions hence no need to use the expensive
                # sort_key.
                raw = list(sorted(v.items(), key=lambda x: tuple(
                    i.name for i in x[0]
                )))
                leading_coeff = raw[0][1]
                pull_info[tuple(
                    (i, j / leading_coeff) for i, j in raw
                )].append((k, leading_coeff))

        # Now we treat the terms from which new intermediates might be pulled
        # out.
        for k, v in pull_info.items():
            pivot = k[0][0]
            assert k[0][1] == 1
            if len(v) == 1:
                # No need to form a new intermediate.
                base, coeff = v[0]
                pivot_ref = base[pivot] * coeff
            else:
                # We need to form an intermediate here.
                interm_exts = tuple(
                    (i, exts_dict[i]) for i in pivot
                )
                pivot_ref, interm_node = self._form_sum_interm(interm_exts, [
                    term.scale(coeff)
                    for base, coeff in v
                    for term in self._get_def(base[pivot])
                ])
                self._optimize(interm_node)

            for indices, coeff in k:
                substs = {
                    i: j for i, j in zip(pivot, indices)
                }
                res_terms.append(
                    pivot_ref.xreplace(substs) * coeff / k[0][1]
                )
                continue

            continue

        return res_terms, res_collectible_idxes

    def _find_collectibles(self, exts, term):
        """Find the collectibles from a given term.

        Collectibles are going to be yielded as key and infos pairs.
        """

        coeff, ref = self._parse_interm_ref(term)

        res = []  # type: typing.List[typing.Tuple[_Collectible, _CollectInfo]]

        if coeff != 1 and coeff != -1:
            # TODO: Add attempt to collect the coefficient.
            #
            # This could give some minor saving.
            pass

        prod_node = self._interms[
            ref.base if isinstance(ref, Indexed) else ref
        ]
        self._optimize(prod_node)

        if len(prod_node.factors) > 1:
            # Single-factor does not offer collectible,
            # collectible * (something + 1) is so rare in real applications.

            for eval_i in prod_node.evals:
                res.extend(self._find_collectibles_eval(
                    exts, ref, eval_i, prod_node.total_cost
                ))
                continue

        return res

    def _find_collectibles_eval(
            self, exts, ref: Expr, eval_: _Prod, opt_cost
    ):
        """Get the collectibles for a particular evaluations of a product."""

        # To begin, we first need to substitute the external indices in for this
        # particular evaluation inside its ambient.
        total_cost = eval_.total_cost
        assert total_cost is not None
        if len(eval_.exts) == 0:
            assert isinstance(ref, Symbol)
        else:
            assert isinstance(ref, Indexed)
            eval_terms = self._index_prod(eval_, ref.indices)
            assert len(eval_terms) == 1
            eval_term = eval_terms[0]
            factors, coeff = eval_term.get_amp_factors(self._interms)
            eval_ = _Prod(
                _SUBSTED_EVAL_BASE, exts, eval_term.sums, coeff, factors
            )
            eval_.total_cost = total_cost

        sums = eval_.sums
        factors = eval_.factors
        assert len(factors) == 2

        # Each evaluation could give two collectibles.
        res = []
        for lr in range(2):
            factor = factors[lr]
            collectible, ranges, coeff, substs = self._get_collectible_interm(
                exts, sums, factor
            )
            res.append((collectible, _CollectInfo(
                eval_=eval_, lr=lr,
                coeff=coeff, substs=substs, ranges=ranges,
                add_cost=eval_.total_cost - opt_cost
            )))
            continue

        return res

    def _get_collectible_interm(self, exts, sums, interm_ref):
        """Get a collectible from an intermediate reference."""

        terms = self._get_def(interm_ref)
        involved_symbs = interm_ref.atoms(Symbol)

        involved_exts = []
        other_exts = []
        for i, v in enumerate(exts):
            symb, range_ = v
            if symb in involved_symbs:
                involved_exts.append((
                    symb, range_.replace_label((range_.label, _EXT, i))
                ))
            else:
                other_exts.append((symb, range_))  # Undecorated.
            continue

        involved_sums = []
        for i, j in sums:
            # Sums not involved in both should be pushed in.
            assert i in involved_symbs
            involved_sums.append((
                i, j.replace_label((j.label, _SUMMED_EXT))
            ))
            continue

        coeff, key, all_sums = self._canon_terms(
            tuple(itertools.chain(involved_exts, involved_sums)), terms
        )
        ranges = _Ranges(
            involved_exts=self._write_in_orig_ranges(involved_exts),
            sums=self._write_in_orig_ranges(involved_sums),
            other_exts=other_exts
        )

        new_sums = (i for i in all_sums if i[1].label[1] == _SUMMED_EXT)
        return key, ranges, coeff, {
            i[0]: j[0]
            for i, j in zip(involved_sums, new_sums)
        }

    def _choose_collectible(self, collectibles: _Collectibles):
        """Choose the most profitable collectible factor.

        The collectible, its infos, and the final cost of the evaluation after
        the collection will be returned.
        """

        with_saving = (
            i for i in collectibles.items() if len(i[1]) > 1
        )

        optimal = None
        new_total_cost = None
        largest_saving = None
        for collectible, infos in with_saving:
            # Any range is sufficient for the determination of savings.
            raw_saving = self._get_collectible_saving(
                next(iter(infos.values())).ranges
            )
            saving = raw_saving - sum(
                i.add_cost for i in infos.values()
            )
            saving_key = get_cost_key(saving)

            if_save = len(saving_key[1]) > 0 and saving_key[1][0] > 0
            if_better = (
                largest_saving is None or saving_key > largest_saving[1]
            )

            if if_save and if_better:
                largest_saving = (saving, saving_key)
                optimal = (collectible, infos)
                orig_cost = sum(i.eval_.total_cost for i in infos.values())
                new_total_cost = orig_cost - raw_saving

            continue

        if optimal is None:
            return None, None, None
        else:
            return optimal[0], optimal[1], new_total_cost

    @staticmethod
    def _get_collectible_saving(ranges: _Ranges) -> Expr:
        """Get the saving factor for a collectible."""

        other_size = get_total_size(ranges.other_exts)
        sum_size = get_total_size(ranges.sums)
        ext_size = get_total_size(ranges.involved_exts)

        return other_size * add_costs(
            2 * sum_size * ext_size, ext_size, -sum_size
        )

    def _collect(self, terms, collect_infos: _CollectInfos, new_cost):
        """Collect the given collectible factor.

        This function will mutate the given terms list.  Set one of the
        collected terms to the new sum term, whose index is going to be
        returned, with all the rest collected terms set to None.
        """

        residue_terms = []
        residue_exts = None
        new_term_idx = min(collect_infos.keys())
        for k, v in collect_infos.items():

            coeff, _ = self._parse_interm_ref(terms[k])
            eval_ = v.eval_
            coeff *= eval_.coeff * v.coeff  # Three levels of coefficients.

            residue_terms.extend(
                i.map(lambda x: coeff * x) for i in self._get_def(
                    eval_.factors[0 if v.lr == 1 else 1].xreplace(v.substs)
                )
            )

            curr_exts = tuple(
                itertools.chain(v.ranges.other_exts, v.ranges.sums),
            )
            if residue_exts is None:
                residue_exts = curr_exts
            else:
                assert residue_exts == curr_exts

            continue

        new_ref, _ = self._form_sum_interm(residue_exts, residue_terms)

        for k, v in collect_infos.items():
            if k == new_term_idx:
                terms[k] = self._form_collected(terms[k], v, new_ref, new_cost)
            else:
                terms[k] = None

        return new_term_idx

    def _form_collected(
            self, term, info: _CollectInfo, new_ref, new_cost
    ) -> Expr:
        """Form new sum term with some factors collected based on a term.
        """

        eval_ = info.eval_
        collected_factor = eval_.factors[info.lr].xreplace(info.substs)

        interm_coeff, interm = self._parse_interm_ref(new_ref)
        coeff = interm_coeff / info.coeff

        _, orig_ref = self._parse_interm_ref(term)
        orig_node = self._interms[
            orig_ref.base if isinstance(orig_ref, Indexed) else orig_ref
        ]
        orig_exts = orig_node.exts

        base = self._get_next_internal(len(orig_exts) == 0)

        new_node = _Prod(base, orig_exts, eval_.sums, coeff, [
            collected_factor, interm
        ])
        new_node.total_cost = new_cost
        new_node.evals = [new_node]

        self._interms[base] = new_node

        return (
            base[tuple(i for i, _ in orig_exts)]
            if isinstance(base, IndexedBase) else base
        )

    #
    # Product optimization.
    #

    def _optimize_prod(self, prod_node):
        """Optimize the product evaluation node.
        """

        assert len(prod_node.evals) == 0
        n_factors = len(prod_node.factors)

        if n_factors < 2:
            assert n_factors == 1
            prod_node.evals.append(prod_node)
            prod_node.total_cost = self._get_prod_final_cost(
                get_total_size(prod_node.exts),
                get_total_size(prod_node.sums)
            )
            return

        strategy = self._strategy
        evals = prod_node.evals
        optimal_cost = None
        for final_cost, broken_sums, parts_gen in self._gen_factor_parts(
                prod_node
        ):
            def need_break():
                """If we need to break the current loop."""
                if strategy == Strategy.GREEDY:
                    return True
                elif strategy == Strategy.BEST or strategy == Strategy.SEARCHED:
                    return get_cost_key(final_cost) > optimal_cost[0]
                elif strategy == Strategy.ALL:
                    return False
                else:
                    assert False

            if (optimal_cost is not None) and need_break():
                break
            # Else

            for parts in parts_gen:

                # Recurse, two parts.
                assert len(parts) == 2
                for i in parts:
                    self._optimize(i.node)
                    continue

                total_cost = (
                    final_cost
                    + parts[0].node.total_cost
                    + parts[1].node.total_cost
                )
                total_cost_key = get_cost_key(total_cost)

                if_new_optimal = (
                    optimal_cost is None or optimal_cost[0] > total_cost_key
                )
                if if_new_optimal:
                    optimal_cost = (total_cost_key, total_cost)
                    if self._strategy == Strategy.BEST:
                        evals.clear()

                # New optimal is always added.
                def need_add_eval():
                    """If the current evaluation need to be added."""
                    if self._strategy == Strategy.BEST:
                        return total_cost_key == optimal_cost[0]
                    else:
                        return True

                if if_new_optimal or need_add_eval():
                    new_eval = self._form_prod_eval(
                        prod_node, broken_sums, parts
                    )
                    new_eval.total_cost = total_cost
                    evals.append(new_eval)

                continue

        assert len(evals) > 0
        prod_node.total_cost = optimal_cost[1]
        return

    def _gen_factor_parts(self, prod_node: _Prod):
        """Generate all the partitions of factors in a product node."""

        # Compute things invariant to different summations for performance.
        exts = prod_node.exts
        exts_total_size = get_total_size(exts)

        factor_atoms = [
            i.atoms(Symbol) for i in prod_node.factors
        ]
        sum_involve = [
            {j for j, v in enumerate(factor_atoms) if i in v}
            for i, _ in prod_node.sums
        ]

        dumm2index = tuple(
            {v[0]: j for j, v in enumerate(i)}
            for i in [prod_node.exts, prod_node.sums]
        )
        # Indices of external and internal dummies involved by each factors.
        factor_infos = [
            tuple(
                set(i[j] for j in atoms if j in i)
                for i in dumm2index
            )
            for atoms in factor_atoms
        ]

        # Actual generation.
        for broken_size, kept in self._gen_kept_sums(prod_node.sums):
            broken_sums = [i for i, j in zip(prod_node.sums, kept) if not j]
            final_cost = self._get_prod_final_cost(
                exts_total_size, broken_size
            )
            yield final_cost, broken_sums, self._gen_parts_w_kept_sums(
                prod_node, kept, sum_involve, factor_infos
            )
            continue

    @staticmethod
    def _gen_kept_sums(sums):
        """Generate kept summations in increasing size of broken summations.

        The results will be given as boolean array giving if the corresponding
        entry is to be kept.
        """

        sizes = [i.size for _, i in sums]
        n_sums = len(sizes)

        def get_size(kept):
            """Wrap the kept summation with its size."""
            size = sympify(prod_(
                i for i, j in zip(sizes, kept) if not j
            ))
            return get_cost_key(size), size, kept

        init = [True] * n_sums  # Everything is kept.
        queue = [get_size(init)]
        while len(queue) > 0:
            curr = heapq.heappop(queue)
            yield curr[1], curr[2]
            curr_kept = curr[2]
            for i in range(n_sums):
                if curr_kept[i]:
                    new_kept = list(curr_kept)
                    new_kept[i] = False
                    heapq.heappush(queue, get_size(new_kept))
                    continue
                else:
                    break
            continue

    def _gen_parts_w_kept_sums(
            self, prod_node: _Prod, kept, sum_involve, factor_infos
    ):
        """Generate all partitions with given summations kept.

        First we the factors are divided into chunks indivisible according to
        the kept summations.  Then their bipartitions which really break the
        broken sums are generated.
        """

        dsf = DSF(i for i, _ in enumerate(factor_infos))

        for i, j in zip(kept, sum_involve):
            if i:
                dsf.union(j)
            continue

        chunks = dsf.sets
        if len(chunks) < 2:
            return

        for part in self._gen_parts_from_chunks(kept, chunks, sum_involve):
            assert len(part) == 2
            yield tuple(
                self._form_part(prod_node, i, sum_involve, factor_infos)
                for i in part
            )

        return

    @staticmethod
    def _gen_parts_from_chunks(kept, chunks, sum_involve):
        """Generate factor partitions from chunks.

        Here special care is taken to respect the broken summations in the
        result.
        """

        n_chunks = len(chunks)

        for chunks_part in multiset_partitions(n_chunks, m=2):
            factors_part = tuple(set(
                factor_i for chunk_i in chunk_part_i
                for factor_i in chunks[chunk_i]
            ) for chunk_part_i in chunks_part)

            for i, v in enumerate(kept):
                if v:
                    continue
                # Now we have broken sum, it need to be involved by both parts.
                involve = sum_involve[i]
                if any(part.isdisjoint(involve) for part in factors_part):
                    break
            else:
                yield factors_part

    def _form_part(self, prod_node, factor_idxes, sum_involve, factor_infos):
        """Form a partition for the given factors."""

        involved_exts, involved_sums = [
            set.union(*[factor_infos[i][label] for i in factor_idxes])
            for label in [0, 1]
        ]

        factors = [prod_node.factors[i] for i in factor_idxes]
        exts = [
            v
            for i, v in enumerate(prod_node.exts)
            if i in involved_exts
        ]
        sums = []

        for i, v in enumerate(prod_node.sums):
            if sum_involve[i].isdisjoint(factor_idxes):
                continue
            elif sum_involve[i] <= factor_idxes:
                sums.append(v)
            else:
                exts.append(v)
            continue

        ref, node = self._form_prod_interm(exts, sums, factors)
        return _Part(ref=ref, node=node)

    @staticmethod
    def _get_prod_final_cost(exts_total_size, sums_total_size) -> Expr:
        """Compute the final cost for a pairwise product evaluation."""

        if sums_total_size == 1:
            return exts_total_size
        else:
            return _TWO * exts_total_size * sums_total_size

    def _form_prod_eval(
            self, prod_node: _Prod, broken_sums, parts: typing.Tuple[_Part, ...]
    ):
        """Form an evaluation for a product node."""

        assert len(parts) == 2

        coeff = _UNITY
        factors = []
        for i in parts:
            curr_coeff, curr_ref = self._parse_interm_ref(i.ref)
            coeff *= curr_coeff
            factors.append(curr_ref)
            continue

        return _Prod(
            prod_node.base, prod_node.exts, broken_sums,
            coeff * prod_node.coeff, factors
        )


#
# Utility constants.
#


_UNITY = Integer(1)
_NEG_UNITY = Integer(-1)
_TWO = Integer(2)

_EXT = 0
_SUMMED_EXT = 1
_SUMMED = 2

_SUBSTED_EVAL_BASE = Symbol('gristmillSubstitutedEvalBase')


#
# Utility static functions.
#

class _SymbFactory(dict):
    """A small symbol factory."""

    def __missing__(self, key):
        return Symbol('gristmillInternalSymbol{}'.format(key))


_SYMB_FACTORY = _SymbFactory()


class _WildFactory(dict):
    """A small wild symbol factory."""

    def __missing__(self, key):
        return Wild('gristmillInternalWild{}'.format(key))


_WILD_FACTORY = _WildFactory()


def _get_canon_coeff(coeffs, preferred):
    """Get the canonical coefficient from a list of coefficients."""

    coeff, _ = primitive(sum(
        v * _SYMB_FACTORY[i] for i, v in enumerate(coeffs)
    ))

    # The primitive computation does not take phase into account.
    n_neg = 0
    n_pos = 0
    for i in coeffs:
        if i.has(_NEG_UNITY) or i.is_negative:
            n_neg += 1
        else:
            n_pos += 1
        continue
    if n_neg > n_pos:
        phase = _NEG_UNITY
    elif n_pos > n_neg:
        phase = _UNITY
    else:
        preferred_phase = (
            _NEG_UNITY if preferred.has(_NEG_UNITY) or preferred.is_negative
            else _UNITY
        )
        phase = preferred_phase

    return coeff * phase


#
# Optimization result verification
# --------------------------------
#


[docs]def verify_eval_seq(
        eval_seq: typing.Sequence[TensorDef], res: typing.Sequence[TensorDef],
        simplify=False
) -> bool:
    """Verify the correctness of an evaluation sequence for the results.

    The last entries of the evaluation sequence should be in one-to-one
    correspondence with the original form in the ``res`` argument.  This
    function returns ``True`` when the evaluation sequence is symbolically
    equivalent to the given raw form.  When a difference is found,
    ``ValueError`` will be raised with relevant information.

    Note that this function can be very slow for large evaluations.  But it is
    advised to be used for all optimizations in mission-critical tasks.

    Parameters
    ----------

    eval_seq
        The evaluation sequence to verify, can be the output from
        :py:func:`optimize` directly.

    res
        The original result to test the evaluation sequence against.  It can be
        the input to :py:func:`optimize` directly.

    simplify
        If simplification is going to be performed after each step of the
        back-substitution.  It is advised for larger complex evaluations.

    """

    n_res = len(res)
    n_interms = len(eval_seq) - n_res

    substed_eval_seq = []
    defs_dict = {}
    for idx, eval_ in enumerate(eval_seq):
        base = eval_.base
        free_vars = eval_.rhs.free_vars
        curr_defs = [
            defs_dict[i] for i in free_vars if i in defs_dict
        ]
        rhs = eval_.rhs.subst_all(curr_defs, simplify=simplify)
        new_def = TensorDef(base, eval_.exts, rhs)
        substed_eval_seq.append(new_def)

        if idx < n_interms:
            defs_dict[
                base.label if isinstance(base, IndexedBase) else base
            ] = new_def

        continue

    for i, j in zip(substed_eval_seq[-n_res:], res):
        if i.lhs != j.lhs:
            raise ValueError(
                'Unequal left-hand sides', i.lhs, 'with', j.lhs, 'for', j
            )
        diff = (i.rhs - j.rhs).simplify()
        if diff != 0:
            raise ValueError(
                'Unequal definition for ', j.lhs, j
            )
        continue

    return True