Source code for columnflow.selection.matching

# coding: utf-8

"""
Distance-based methods.
"""

from __future__ import annotations

from columnflow.types import Callable, Union
from columnflow.selection import Selector, SelectionResult, selector
from columnflow.util import maybe_import

np = maybe_import("numpy")
ak = maybe_import("awkward")


def cleaning_factory(
    selector_name: str,
    to_clean: str,
    clean_against: list[str],
    metric: Union[Callable, None] = None,
) -> Selector:
    """
    Factory to generate a function with name *selector_name* that cleans the field *to_clean* in an
    array following the :external+coffea:py:class:`~coffea.nanoevents.NanoAODSchema` against the
    field(s) *clean_against*.

    First, the column names needed to construct four-momenta for the different object fields are
    assembled, i.e. ``pt``, ``eta``, ``phi`` and ``e`` for each field, plus the length columns
    ``n<field>``. Then the actual selector function is generated, which declares these columns as
    its ``uses``.

    :param selector_name: Name of the :py:class:`~columnflow.selection.Selector` class to be
        initialized.
    :param to_clean: Name of the field to be cleaned (e.g. ``"Jet"``).
    :param clean_against: Names of the fields of objects to clean field *to_clean* against
        (e.g. ``["Muon"]``).
    :param metric: Function to use for the cleaning. If None, use
        :external+coffea:py:meth:`~coffea.nanoevents.methods.vector.LorentzVector.delta_r`.
    :return: Instance of :py:class:`~columnflow.selection.Selector`.
    """
    # default of the metric function is the delta_r function of the coffea LorentzVectors
    if metric is None:
        metric = lambda a, b: a.delta_r(b)

    # keep a handle on the factory-level metric, since the generated selector has a parameter of
    # the same name that callers may explicitly set to None
    default_metric = metric

    # compile the list of variables that are necessary for the four momenta,
    # this list is always the same
    variables_for_lorentzvec = ["pt", "eta", "phi", "e"]

    # sum up all fields that are to be considered, i.e. the field *to_clean*
    # and all fields in *clean_against*
    all_fields = clean_against + [to_clean]

    # construct the set of columns that is necessary for the four momenta in the different fields
    # (and thus also for the current implementation of the cleaning itself)
    uses = {
        f"{x}.{var}"
        for x in all_fields
        for var in variables_for_lorentzvec
    }

    # additionally, also load the lengths of the different fields
    uses |= {f"n{x}" for x in all_fields}

    # finally, construct the selector function itself
    @selector(uses=uses, name=selector_name)
    def func(
        self: Selector,
        events: ak.Array,
        to_clean: str,
        clean_against: list[str],
        metric: Union[Callable, None] = metric,
        threshold: float = 0.4,
    ) -> ak.Array:
        """
        Abstract function to perform a cleaning of field *to_clean* against a (list of) field(s)
        *clean_against* based on an arbitrary metric *metric* (e.g.
        :external+coffea:py:meth:`~coffea.nanoevents.methods.vector.LorentzVector.delta_r`).

        First, all fields in *clean_against* are concatenated into a single collection per event.
        Then, for every object in *to_clean*, the minimal value of the metric with respect to that
        collection is obtained from
        :external+coffea:py:meth:`~coffea.nanoevents.methods.vector.LorentzVector.nearest`.
        Objects in field *to_clean* are removed if this minimal metric value is not above
        *threshold*.

        :param self: :py:class:`columnflow.selection.Selector` instance into which this function
            is embedded.
        :param events: Array containing events in the NanoAOD format.
        :param to_clean: Name of the field to be cleaned (e.g. ``"Jet"``).
        :param clean_against: Names of the fields of objects to clean field *to_clean* against
            (e.g. ``["Muon"]``).
        :param metric: Function to use for the cleaning. If None, the metric configured in the
            factory is used (by default
            :external+coffea:py:meth:`~coffea.nanoevents.methods.vector.LorentzVector.delta_r`).
        :param threshold: Threshold value for the decision which objects to keep and which to
            reject, defaults to ``0.4``.
        :return: Array of indices of cleaned objects, ordered by descending ``pt`` of the objects.
        """
        # fall back to the factory-level metric when None is passed explicitly
        if metric is None:
            metric = default_metric

        # concatenate the fields that are to be used in the construction of the metric table
        summed_clean_against = ak.concatenate(
            [events[x] for x in clean_against],
            axis=1,
        )

        # load the actual collection that is to be cleaned
        to_clean_field = events[to_clean]

        # obtain the minimal value of the metric for each object in field *to_clean* w.r.t. all
        # objects in *summed_clean_against*. Note that the argument *threshold* of *nearest* must
        # be None since the function would otherwise perform a selection itself to extract the
        # nearest objects (i.e. apply the selection we want here in reverse)
        _, metric_values = to_clean_field.nearest(
            summed_clean_against,
            metric=metric,
            return_metric=True,
            threshold=None,
        )

        # build a boolean mask (in the original object order) based on the threshold provided by
        # the user. Objects without any candidate to compare against are expected to carry a
        # missing metric value; there is nothing to clean them against, so they are kept
        mask = ak.fill_none(metric_values > threshold, True, axis=-1)

        # construct the final result, i.e. the indices of clean objects sorted by descending pt.
        # The mask refers to the original object order, so it has to be brought into the sorted
        # order before it can be applied to the sorted indices
        # TODO: return the mask itself instead of the list of indices
        sorted_indices = ak.argsort(to_clean_field.pt, axis=-1, ascending=False)
        return sorted_indices[mask[sorted_indices]]

    return func
# selector that cleans the "Jet" field against the combined "Muon" and "Electron" fields,
# using the delta_r separation between the objects as metric
delta_r_jet_lepton = cleaning_factory(
    selector_name="delta_r_jet_lepton",
    to_clean="Jet",
    clean_against=["Muon", "Electron"],
    metric=lambda a, b: a.delta_r(b),
)
@selector(uses={delta_r_jet_lepton})
def jet_lepton_delta_r_cleaning(
    self: Selector,
    events: ak.Array,
    stats: dict[str, Union[int, float]],
    threshold: float = 0.4,
    **kwargs,
) -> tuple[ak.Array, SelectionResult]:
    """
    Selector that cleans jets against leptons.

    The *Jet* field is cleaned against the concatenation of the fields *[Muon, Electron]*, i.e.
    all leptons, by invoking the ``delta_r_jet_lepton`` selector with the requested *threshold*.

    :param events: Array containing events in the NanoAOD format.
    :param stats: :py:class:`dictionary <dict>` containing selection stats (not used here).
    :param threshold: Threshold value for the decision which objects to keep and which to reject.
    :return: Tuple containing the events array and a
        :py:class:`~columnflow.selection.SelectionResult` with indices of cleaned jets in the
        "Jet" object field.
    """
    # fields that together make up the lepton collection to clean against
    lepton_fields = ["Muon", "Electron"]

    # run the dependent cleaning selector to obtain the indices of jets that survive
    jet_cleaner = self[delta_r_jet_lepton]
    kept_jet_indices = jet_cleaner(events, "Jet", lepton_fields, threshold=threshold)

    # TODO: should not return a new object collection but an array with masks
    selection = SelectionResult(objects={"Jet": kept_jet_indices})
    return events, selection