Source code for columnflow.selection.matching

# coding: utf-8

"""
Distance-based methods for cleaning object collections against each other (e.g. jets against
leptons using ``delta_r``).
"""

from __future__ import annotations

from columnflow.types import Callable, Union
from columnflow.selection import Selector, SelectionResult, selector
from columnflow.util import maybe_import

np = maybe_import("numpy")
ak = maybe_import("awkward")


def cleaning_factory(
    selector_name: str,
    to_clean: str,
    clean_against: list[str],
    metric: Union[Callable, None] = None,
) -> Selector:
    """
    Factory to generate a function with name *selector_name* that cleans the field *to_clean* in an
    array following the :external+coffea:py:class:`~coffea.nanoevents.NanoAODSchema` against the
    field(s) *clean_against*. First, the necessary column names to construct four-momenta for the
    different object fields are constructed, i.e. ``pt``, ``eta``, ``phi`` and ``e`` for the
    different objects. Finally, the actual selector function is generated, which uses these columns.

    :param selector_name: Name of the :py:class:`~columnflow.selection.Selector` class to be
        initialized.
    :param to_clean: Name of the field to be cleaned (e.g. ``"Jet"``).
    :param clean_against: Names of the fields of object to clean field *to_clean* against
        (e.g. ``["Muon"]``).
    :param metric: Function to use for the cleaning. If None, use
        :external+coffea:py:meth:`~coffea.nanoevents.methods.vector.LorentzVector.delta_r`.
    :return: Instance of :py:class:`~columnflow.selection.Selector`.
    """
    # default of the metric function is the delta_r function
    # of the coffea LorentzVectors
    if metric is None:
        def metric(a, b):
            return a.delta_r(b)

    # keep a handle on the resolved metric under a separate name, since the parameter *metric* of
    # the selector function below shadows the factory argument
    default_metric = metric

    # compile the list of variables that are necessary for the four momenta
    # this list is always the same
    variables_for_lorentzvec = ["pt", "eta", "phi", "e"]

    # sum up all fields that are to be considered, i.e. the field *to_clean*
    # and all fields in *clean_against*
    all_fields = clean_against + [to_clean]

    # construct the set of columns that is necessary for the four momenta in
    # the different fields (and thus also for the current implementation of
    # the cleaning itself) by looping through the fields and variables.
    uses = {
        f"{x}.{var}" for x in all_fields for var in variables_for_lorentzvec
    }

    # additionally, also load the lengths of the different fields
    uses |= {f"n{x}" for x in all_fields}

    # finally, construct selector function itself
    @selector(uses=uses, name=selector_name)
    def func(
        self: Selector,
        events: ak.Array,
        to_clean: str,
        clean_against: list[str],
        metric: Union[Callable, None] = default_metric,
        threshold: float = 0.4,
    ) -> ak.Array:
        """
        Abstract function to perform a cleaning of field *to_clean* against a (list of) field(s)
        *clean_against* based on an arbitrary metric *metric* (e.g.
        :external+coffea:py:meth:`~coffea.nanoevents.methods.vector.LorentzVector.delta_r`). First
        concatenate all fields in *clean_against*, which thus includes all fields that are to be
        used for the comparison of the metric. Then construct the metric for all permutations of the
        different objects using the :external+coffea:doc:`index`
        :external+coffea:py:meth:`~coffea.nanoevents.methods.vector.LorentzVector.nearest`
        implementation. All objects in field *to_clean* are removed if the metric is below the
        *threshold*.

        :param self: :py:class:`columnflow.selection.Selector` instance into which this function is
            embedded.
        :param events: array containing events in the NanoAOD format
        :param to_clean: Name of the field to be cleaned (e.g. ``"Jet"``)
        :param clean_against: Names of the fields of object to clean field *to_clean* against (e.g.
            ``["Muon"]``)
        :param metric: Function to use for the cleaning. Defaults to the metric the factory was
            configured with; if None is passed explicitly, that same factory metric is used.
        :param threshold: Threshold value for decision which objects to keep and which to reject,
            defaults to ``0.4``.
        :return: array of indices of cleaned objects, ordered according to the ``pt`` of the
            objects (descending).
        """
        # an explicit None falls back to the metric configured in the factory instead of being
        # forwarded to *nearest*, which needs a callable
        if metric is None:
            metric = default_metric

        # concatenate the fields that are to be used in the construction
        # of the metric table
        summed_clean_against = ak.concatenate(
            [events[x] for x in clean_against],
            axis=1,
        )

        # load actual NanoEventArray that is to be cleaned
        to_clean_field = events[to_clean]

        # construct metric table for these objects. The metric table contains the minimal value of
        # the metric *metric* for each object in field *to_clean* w.r.t. all objects in
        # *summed_clean_against*. Thus, it has the dimensions nevents x nto_clean, where *nevents*
        # is the number of events in the current chunk of data and *nto_clean* is the length of the
        # field *to_clean*. Note that the argument *threshold* in the *nearest* function must be set
        # to None since the function will perform a selection itself to extract the nearest objects
        # (i.e. applies the selection we want here in reverse)
        _, min_metric = to_clean_field.nearest(
            summed_clean_against,
            metric=metric,
            return_metric=True,
            threshold=None,
        )

        # build a binary mask based on the selection threshold provided by the user. Objects in
        # events without any object to clean against have no nearest partner; coffea's *nearest*
        # is expected to report a missing (None) metric for them, and since nothing overlaps with
        # them they are kept
        keep = ak.fill_none(min_metric > threshold, True)

        # construct final result. Currently, this is the list of indices for
        # clean jets, sorted for pt
        # TODO: return the mask itself instead of the list of indices
        sorted_indices = ak.argsort(to_clean_field.pt, axis=-1, ascending=False)

        # *keep* is aligned with the original object order while *sorted_indices* is ordered by pt,
        # so the mask has to be brought into the same (pt) order before it is applied
        return sorted_indices[keep[sorted_indices]]

    return func


# selector that cleans the "Jet" field against the combined "Muon" and "Electron" fields, using
# the delta R between a jet and a lepton as the distance metric
delta_r_jet_lepton = cleaning_factory(
    selector_name="delta_r_jet_lepton",
    to_clean="Jet",
    clean_against=["Muon", "Electron"],
    metric=lambda jet, lepton: jet.delta_r(lepton),
)


@selector(uses={delta_r_jet_lepton})
def jet_lepton_delta_r_cleaning(
    self: Selector,
    events: ak.Array,
    stats: dict[str, Union[int, float]],
    threshold: float = 0.4,
    **kwargs,
) -> tuple[ak.Array, SelectionResult]:
    """
    Function to apply the selection requirements necessary for a cleaning of jets against leptons.

    The function calls the requirements to clean the field *Jet* against the concatenation of the
    fields *[Muon, Electron]*, i.e. all leptons, and passes the desired threshold for the selection.

    :param self: :py:class:`~columnflow.selection.Selector` instance into which this function is
        embedded.
    :param events: Array containing events in the NanoAOD format
    :param stats: :py:class:`dictionary <dict>` containing selection stats (not used here).
    :param threshold: Threshold value for decision which objects to keep and which to reject.
    :param kwargs: Additional keyword arguments; accepted but not forwarded to the cleaning
        selector.

    :return: Tuple containing the events array and a
        :py:class:`~columnflow.selection.SelectionResult` with indices of cleaned jets in the
        "Jet" object field.
    """
    # delegate to the factory-built selector, which returns the indices of the jets that survive
    # the cleaning against muons and electrons
    clean_jet_indices = self[delta_r_jet_lepton](
        events,
        "Jet",
        ["Muon", "Electron"],
        threshold=threshold,
    )

    # TODO: should not return a new object collection but an array with masks
    return events, SelectionResult(objects={"Jet": clean_jet_indices})