Source code for pyirf.interpolation.quantile_interpolator

import numpy as np
from scipy.interpolate import griddata, interp1d

from .base_interpolators import DiscretePDFInterpolator, PDFNormalization
from .utils import get_bin_width

__all__ = ["QuantileInterpolator"]


def cdf_values(binned_pdf, bin_edges, normalization):
    """
    compute cdf values and assure they are normed to unity
    """
    bin_widths = get_bin_width(bin_edges, normalization)
    cdfs = np.cumsum(binned_pdf * bin_widths, axis=-1)

    # assure the last cdf value is 1, ignore errors for empty pdfs as they are reset to 0 by nan_to_num
    with np.errstate(invalid="ignore"):
        cdfs = np.nan_to_num(cdfs / np.max(cdfs, axis=-1)[..., np.newaxis])

    return cdfs


def ppf_values(bin_mids, cdfs, quantiles):
    """
    Compute ppfs from cdfs and interpolate them to the desired interpolation point

    Parameters
    ----------
    bin_mids: numpy.ndarray, shape=(n_bins)
        Bin-mids for each bin along interpolation axis

    cdfs: numpy.ndarray, shape=(n_points,...,n_bins)
        Corresponding cdf-values for all quantiles

    quantiles: numpy.ndarray, shape=(n_quantiles)
        Quantiles for which the ppf-values should be estimated

    Returns
    -------
    ppfs: numpy.ndarray, shape=(1,...,n_quantiles)
        Corresponding ppf-values for all quantiles at the target interpolation point
    """

    def cropped_interp(cdf):
        """
        create ppf-values through inverse interpolation of the cdf, avoiding repeating 0 and 1 entries
        around the first and last bins as well as repeating values due to zero bins
        in between filled bins. Both cases would result in division by zero errors when computing
        the interpolation polynom.
        """

        # Find last 0 and first 1 entry
        last_0 = np.nonzero(cdf == 0)[0][-1] if cdf[0] == 0 else 0
        first_1 = np.nonzero(cdf == 1.0)[0][0]

        # Predefine selection mask
        selection = np.ones(len(cdf), dtype=bool)

        # Keep only the first of subsequently matching values
        selection[1:] = cdf[1:] != cdf[:-1]

        # Keep only the last 0 and first 1 entry
        selection[:last_0] = False
        selection[last_0] = True
        selection[first_1 + 1 :] = False
        selection[first_1] = True

        # create ppf values from selected bins
        return interp1d(
            cdf[selection],
            bin_mids[selection],
            bounds_error=False,
            fill_value="extrapolate",
        )(quantiles)

    # create ppf values from cdf samples via interpolation of the cdfs
    # return nan for empty pdfs
    ppfs = np.apply_along_axis(
        lambda cdf: cropped_interp(cdf)
        if np.sum(cdf) > 0
        else np.full_like(quantiles, np.nan),
        -1,
        cdfs,
    )
    # nD interpolation of ppf values
    return ppfs


def pdf_from_ppf(bin_edges, interp_ppfs, quantiles):
    """
    Reconstruct pdf from ppf and evaluate at desired points.

    Parameters
    ----------
    bin_edges: numpy.ndarray, shape=(n_bins+1)
        Edges of the bins in which the final pdf should be binned

    interp_ppfs: numpy.ndarray, shape=(1,...,n_quantiles)
        Corresponding ppf-values for all quantiles at the target_point,
        not to be confused with QunatileInterpolators self.ppfs, the ppfs
        computed from the input distributions.

    quantiles: numpy.ndarray, shape=(n_quantiles)
        Quantiles corresponding to the ppf-values in interp_ppfs

    Returns
    -------
    pdf_values: numpy.ndarray, shape=(1,...,n_bins)
        Recomputed, binned pdf at target_point
    """
    # recalculate pdf values through numerical differentiation
    pdf_interpolant = np.nan_to_num(np.diff(quantiles) / np.diff(interp_ppfs, axis=-1))

    # Unconventional solution to make this usable with np.apply_along_axis for readability
    # The ppf bin-mids are computed since the pdf-values are derived through derivation
    # from the ppf-values
    xyconcat = np.concatenate(
        (interp_ppfs[..., :-1] + np.diff(interp_ppfs) / 2, pdf_interpolant), axis=-1
    )

    def interpolate_ppf(xy):
        ppf = xy[: len(xy) // 2]
        pdf = xy[len(xy) // 2 :]
        interpolate = interp1d(ppf, pdf, bounds_error=False, fill_value=(0, 0))
        result = np.nan_to_num(interpolate(bin_edges[:-1]))
        return result

    # Interpolate pdf samples and evaluate at bin edges, weight with the bin_width to estimate
    # correct bin height via the midpoint rule formulation of the trapezoidal rule
    pdf_values = np.apply_along_axis(interpolate_ppf, -1, xyconcat)

    return pdf_values


def norm_pdf(pdf_values, bin_edges, normalization):
    """
    Normalize binned_pdf to a sum of 1
    """
    norm = np.sum(pdf_values, axis=-1)
    width = get_bin_width(bin_edges, normalization)

    # Norm all binned_pdfs to unity that are not empty
    normed_pdf_values = np.divide(
        pdf_values,
        norm[..., np.newaxis] * width,
        out=np.zeros_like(pdf_values),
        where=norm[..., np.newaxis] != 0,
    )

    return normed_pdf_values



[docs]
class QuantileInterpolator(DiscretePDFInterpolator):
    """Interpolator class providing quantile interpoalation."""

    def __init__(
        self,
        grid_points,
        bin_edges,
        binned_pdf,
        quantile_resolution=1e-3,
        normalization=PDFNormalization.AREA,
    ):
        """
        Parameters
        ----------
        grid_points : np.ndarray, shape=(n_points, n_dims)
            Grid points at which interpolation templates exist
        bin_edges : np.ndarray, shape=(n_bins+1)
            Edges of the data binning
        binned_pdf : np.ndarray, shape(n_points, ..., n_bins)
            Content of each bin in bin_edges for
            each point in grid_points. First dimesion has to correspond to number
            of grid_points, the last axis has to correspond to number
            of bins for the quantity that should be interpolated
            (e.g. the Migra axis for EDisp)
        quantile_resolution : float
            Spacing between quantiles
        normalization : PDFNormalization
            How the discrete PDF is normalized

        Raises
        ------
        ValueError:
            When last axis in binned_pdf and number of bins are not equal.

        Notes
        -----
        Also calls __init__ of pyirf.interpolation.BaseInterpolator and
        DiscretePDFInterpolator
        """
        # Remember input shape
        self.input_shape = binned_pdf.shape

        if self.input_shape[-1] != len(bin_edges) - 1:
            raise ValueError(
                "Number of bins along last axis and those specified by bin_edges not matching."
            )

        # include 0-bin at first position in each pdf to avoid edge-effects where the CDF would otherwise
        # start at a value != 0, also extend edges with one bin to the left
        fill_zeros = np.zeros(shape=binned_pdf.shape[:-1])[..., np.newaxis]
        binned_pdf = np.concatenate((fill_zeros, binned_pdf), axis=-1)
        bin_edges = np.append(bin_edges[0] - np.diff(bin_edges)[0], bin_edges)

        # compute quantiles from quantile_resolution
        self.quantiles = np.linspace(
            0, 1, int(np.round(1 / quantile_resolution, decimals=0))
        )

        super().__init__(
            grid_points=grid_points,
            bin_edges=bin_edges,
            binned_pdf=binned_pdf,
            normalization=normalization,
        )

        # Compute CDF values
        self.cdfs = cdf_values(self.binned_pdf, self.bin_edges, self.normalization)

        # compute ppf values at quantiles, determine quantile step of [1]
        self.ppfs = ppf_values(self.bin_mids, self.cdfs, self.quantiles)


[docs]
    def interpolate(self, target_point):
        """
        Takes a grid of binned pdfs for a bunch of different parameters
        and interpolates it to given value of those parameters.
        This function provides an adapted version of the quantile interpolation introduced
        in [1].
        Instead of following this method closely, it implements different approaches to the
        steps shown in Fig. 5 of [1].

        Parameters
        ----------
        target_point: numpy.ndarray, shape=(1, n_dims)
            Value for which the interpolation is performed (target point)

        Returns
        -------
        f_new: numpy.ndarray, shape=(1, ..., n_bins)
            Interpolated and binned pdf

        References
        ----------
        .. [1] B. E. Hollister and A. T. Pang (2013). Interpolation of Non-Gaussian Probability Distributions
               for Ensemble Visualization
               https://engineering.ucsc.edu/sites/default/files/technical-reports/UCSC-SOE-13-13.pdf
        """

        # interpolate ppfs to target point, interpolate quantiles step of [1]
        interpolated_ppfs = griddata(self.grid_points, self.ppfs, target_point)

        # compute pdf values for all bins, evaluate interpolant PDF values step of [1], drop the earlier
        # introduced extra bin
        interpolated_pdfs = pdf_from_ppf(
            self.bin_edges, interpolated_ppfs, self.quantiles
        )[..., 1:]

        # Renormalize pdf to sum of 1
        normed_interpolated_pdfs = norm_pdf(
            interpolated_pdfs, self.bin_edges[1:], self.normalization
        )

        # Re-swap axes and set all nans to zero
        return np.nan_to_num(normed_interpolated_pdfs).reshape(1, *self.input_shape[1:])