Source code for qp.parameterizations.hist.hist

"""This module implements a distribution parameterization sub-class using histograms"""

from __future__ import annotations
import numpy as np

from scipy.stats import rv_continuous
from typing import Mapping, Optional
from numpy.typing import ArrayLike


import warnings

from .hist_utils import (
    evaluate_hist_x_multi_y,
    extract_hist_values,
    extract_hist_samples,
)
from ..base import Pdf_rows_gen
from ...plotting import get_axes_and_xlims, plot_pdf_histogram_on_axes
from ...utils.array import reshape_to_pdf_size, reduce_dimensions

from ...utils.interpolation import interpolate_multi_x_y, interpolate_x_multi_y

from ...core.factory import add_class
from ...core.ensemble import Ensemble



[docs]
class hist_gen(Pdf_rows_gen):
    """Implements distributions parameterized as histograms.

    By default, the input distribution is normalized. If the input data is
    already normalized, you can use the optional parameter ``norm = False``
    to skip the normalization process.

    Parameters
    ----------
    bins : ArrayLike
        The array containing the (n+1) bin boundaries
    pdfs : ArrayLike
        The array containing the (npdf, n) bin values
    norm : bool, optional
        If True, normalizes the input distribution. If False, assumes the
        given distribution is already normalized. By default True.
    warn : bool, optional
            If True, raises warnings if input is not valid PDF data (i.e. if
            data is negative). If False, no warnings are raised. By default True.


    Notes
    -----

    There must be a minimum of 2 bins.

    Converting to this parameterization:

    This table contains the available methods to convert to this parameterization,
    their required arguments, and their method keys. If the key is `None`, this is
    the default conversion method.

    +------------------------+-----------------------------------------------------+------------+
    | Function               | Arguments                                           | Method key |
    +------------------------+-----------------------------------------------------+------------+
    | `.extract_hist_values` | bins (array of bin edges)                           | None       |
    +------------------------+-----------------------------------------------------+------------+
    | `.extract_hist_samples`| bins (array of bin edges),                          | samples    |
    |                        | size (int, optional, number of samples to generate) |            |
    +------------------------+-----------------------------------------------------+------------+

    Implementation notes:

    Inside a given bin `pdf()` will return the `hist_gen.pdfs` value.
    Outside the range of the given bins `pdf()` will return 0.

    Inside a given bin `cdf()` will use a linear interpolation across the bin.
    Outside the range of the given bins `cdf()` will return (0 or 1), respectively.

    The percentage point function `ppf()` will return negative infinity at 0 and positive
    infinity at 1.

    """

    # pylint: disable=protected-access

    name = "hist"
    version = 0

    _support_mask = rv_continuous._support_mask

    def __init__(
        self,
        bins: ArrayLike,
        pdfs: ArrayLike,
        norm: bool = True,
        warn: bool = True,
        *args,
        **kwargs,
    ) -> None:
        """
        Create a new distribution using the given histogram.

        Parameters
        ----------
        bins : ArrayLike
          The array containing the (n+1) bin boundaries
        pdfs : ArrayLike
          The array containing the (npdf, n) bin values
        norm : bool, optional
            If True, normalizes the input distribution. If False, assumes the
            given distribution is already normalized. By default True.
        warn : bool, optional
            If True, raises warnings if input is not valid PDF data (i.e. if
            data is negative). If False, no warnings are raised. By default True.
        """
        self._hbins = np.squeeze(bins)  # make sure bins is 1D
        self._nbins = self._hbins.size - 1
        self._hpdfs = reshape_to_pdf_size(np.asarray(pdfs), -1)

        # make sure that the bins are sorted
        if not np.all(np.diff(self._hbins) >= 0):
            raise ValueError(
                f"Invalid bins: The given bins are not sorted: {self._hbins}"
            )

        # raise warnings if input data is not finite or pdfs are not positive
        self._warn = warn
        if self._warn:
            if not np.all(np.isfinite(self._hbins)):
                warnings.warn(
                    f"The given bins contain non-finite values - {self._hbins}",
                    RuntimeWarning,
                )
            if not np.all(np.isfinite(pdfs)):
                indices = np.where(np.isfinite(pdfs) != True)
                warnings.warn(
                    f"There are non-finite values in the pdfs for the distributions: {indices[0]}",
                    RuntimeWarning,
                )
            if np.any(self._hpdfs < 0):
                indices = np.where(self._hpdfs < 0)
                warnings.warn(
                    f"There are negative values in the pdfs for the distributions: {indices[0]}",
                    RuntimeWarning,
                )

        # check data shapes make sense
        if np.shape(pdfs)[-1] != self._nbins:  # pragma: no cover
            raise ValueError(
                "Number of bins (%i) != number of values (%i)"
                % (self._nbins, np.shape(pdfs)[-1])
            )

        self._hbin_widths = self._hbins[1:] - self._hbins[:-1]
        self._xmin = self._hbins[0]
        self._xmax = self._hbins[-1]

        # normalize the input data if norm is True
        self._norm = norm
        if self._norm:
            self._hpdfs = self.normalize()["pdfs"]

        self._hcdfs = None
        # Set support
        kwargs["shape"] = self._hpdfs.shape  # pdfs.shape
        super().__init__(*args, **kwargs)
        self._addmetadata("bins", self._hbins)
        self._addobjdata("pdfs", self._hpdfs)

    def _compute_cdfs(self):
        copy_shape = np.array(self._hpdfs.shape)
        copy_shape[-1] += 1
        self._hcdfs = np.ndarray(copy_shape)
        self._hcdfs[:, 0] = 0.0
        self._hcdfs[:, 1:] = np.cumsum(self._hpdfs * self._hbin_widths, axis=1)


[docs]
    def normalize(self) -> Mapping[str, np.ndarray[float]]:
        """Normalizes the input distribution values.

        Returns
        -------
        Mapping [str, np.ndarray[float]]
            An (npdf, n) array of pdf values in the n bins for the npdf distributions

        Raises
        ------
        ValueError
            Raised if the sum under the distribution <= 0.
        """
        pdfs_2d = self._hpdfs
        sums = np.sum(pdfs_2d * self._hbin_widths, axis=1)
        if np.any(sums < 0):
            indices = np.where(sums < 0)
            raise ValueError(
                f"The distribution(s) cannot be properly normalized, the sum of the pdfs is < 0 for distributions at index = {indices[0]} "
            )
        elif np.any(sums == 0):
            indices = np.where(sums == 0)
            warnings.warn(
                f"The distributions(s) with indices {indices[0]} have an integral of 0."
            )
        return {"pdfs": (pdfs_2d.T / sums).T}


    @property
    def bins(self) -> np.ndarray[float]:
        """Return the histogram bin edges"""
        return self._hbins

    @property
    def pdfs(self) -> np.ndarray[float]:
        """Return the histogram bin values"""
        return self._hpdfs


[docs]
    def x_samples(self) -> np.ndarray[float]:
        """Return a set of x values that can be used to plot all the PDFs."""
        # TODO: possibly add a bin to the left and right?
        return (self._hbins[:-1] + self._hbins[1:]) / 2


    def _pdf(self, x, row):
        # pylint: disable=arguments-differ
        pdf = evaluate_hist_x_multi_y(x, row, self._hbins, self._hpdfs).ravel()

        # reduce dimension to 0 if there's only one value
        if np.shape(pdf) == (1,) and len(pdf) == 1:
            return pdf[0]
        else:
            return pdf

    def _cdf(self, x, row):
        # pylint: disable=arguments-differ
        if self._hcdfs is None:  # pragma: no cover
            self._compute_cdfs()
        return interpolate_x_multi_y(
            x, row, self._hbins, self._hcdfs, bounds_error=False, fill_value=(0.0, 1.0)
        ).ravel()

    def _ppf(self, x, row):
        # pylint: disable=arguments-differ
        if self._hcdfs is None:  # pragma: no cover
            self._compute_cdfs()
        return interpolate_multi_x_y(
            x,
            row,
            self._hcdfs,
            self._hbins,
            bounds_error=False,
            fill_value=(self._xmin, self._xmax),
        ).ravel()

    def _munp(self, m, *args):
        """compute moments"""
        # pylint: disable=arguments-differ
        # Silence floating point warnings from integration.
        with np.errstate(all="ignore"):
            vals = self.custom_generic_moment(m)
        return vals


[docs]
    def custom_generic_moment(self, m: ArrayLike) -> np.ndarray[float]:
        """Compute the mth moment"""
        m = np.asarray(m)
        dx = self._hbins[1] - self._hbins[0]
        xv = 0.5 * (self._hbins[1:] + self._hbins[:-1])
        return np.sum(xv**m * self._hpdfs, axis=1) * dx


    def _updated_ctor_param(self):
        """
        Set the bins as additional constructor argument
        """
        dct = super()._updated_ctor_param()
        dct["bins"] = self._hbins
        dct["pdfs"] = self._hpdfs
        dct["norm"] = self._norm
        dct["warn"] = self._warn
        return dct


[docs]
    @classmethod
    def get_allocation_kwds(
        cls, npdf: int, **kwargs: str
    ) -> dict[str, tuple[tuple[int, int], str]]:
        """Return the kwds necessary to create an `empty` HDF5 file with ``npdf`` entries
        for iterative write. We only need to allocate the data columns, as
        the metadata will be written when we finalize the file.

        The number of data columns is calculated based on the length or shape of the
        metadata, ``n``. For example, the number of columns is ``nbins-1``
        for a histogram.

        Parameters
        ----------
        npdf : int
            Total number of distributions that will be written out
        kwargs :
            The keys needed to construct the shape of the data to be written.

        Returns
        -------
        dict [ str, tuple [ tuple [int , int], str]]
            A dictionary with a key for the objdata, a tuple with the shape of that data,
            and the data type of the data as a string.
            i.e. ``{objdata_key = ( (npdf, n), "f4" )}``

        Raises
        ------
        ValueError
            Raises an error if the bins is not provided."""

        if "bins" not in kwargs:  # pragma: no cover
            raise ValueError("required argument 'bins' not included in kwargs")
        nbins = len(kwargs["bins"].flatten())
        return dict(pdfs=((npdf, nbins - 1), "f4"))



[docs]
    @classmethod
    def plot_native(cls, pdf: Ensemble, **kwargs):
        """Plot the PDF in a way that is particular to this type of distribution

        For a histogram this shows the bin edges.

        Parameters
        ----------
        axes : Axes
            The axes to plot on. Either this or xlim must be provided.
        xlim : tuple [float, float]
            The x-axis limits. Either this or axes must be provided.
        kwargs :
            Any keyword arguments to pass to matplotlib's `matplotlib.axes.Axes.hist` method.

        Returns
        -------
        axes : Axes
            The plot axes.
        """
        axes, _, kw = get_axes_and_xlims(**kwargs)
        vals = pdf.dist.pdfs[pdf.kwds["row"]]
        return plot_pdf_histogram_on_axes(axes, hist=(pdf.dist.bins, vals), **kw)



[docs]
    @classmethod
    def add_mappings(cls):
        """
        Add this classes mappings to the conversion dictionary
        """
        cls._add_creation_method(cls.create, None)
        cls._add_extraction_method(extract_hist_values, None)
        cls._add_extraction_method(extract_hist_samples, "samples")



[docs]
    @classmethod
    def create_ensemble(
        self,
        bins: ArrayLike,
        pdfs: ArrayLike,
        norm: bool = True,
        warn: bool = True,
        ancil: Optional[Mapping] = None,
    ) -> Ensemble:
        """Creates an Ensemble of distributions parameterized as histograms.


        Parameters
        ----------
        bins : ArrayLike
          The array containing the (n+1) bin boundaries
        pdfs : ArrayLike
          The array containing the (npdf, n) bin values
        norm : bool, optional
            If True, normalizes the input distribution. If False, assumes the
            given distribution is already normalized. By default True.
        warn : bool, optional
            If True, raises warnings if input is not valid PDF data (i.e. if
            data is negative). If False, no warnings are raised. By default True.
        ancil : Optional[Mapping], optional
            A dictionary of metadata for the distributions, where any arrays have
            length npdf, by default None

        Returns
        -------
        Ensemble
            An Ensemble object containing all of the given distributions.

        Examples
        --------

        To create an Ensemble with two distributions and an 'ancil' table that
        provides ids for the distributions, you can use the following code:

        >>> import qp
        >>> import numpy as np
        >>> bins= [0,1,2,3,4,5]
        >>> pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]])
        >>> ancil = {'ids': [105, 108]}
        >>> ens = qp.hist.create_ensemble(bins,pdfs,ancil=ancil)
        >>> ens.metadata
        {'pdf_name': array([b'hist'], dtype='|S4'),
        'pdf_version': array([0]),
        'bins': array([[0, 1, 2, 3, 4, 5]])}

        """

        data = {"bins": bins, "pdfs": pdfs, "norm": norm, "warn": warn}
        return Ensemble(self, data, ancil)




hist = hist_gen
add_class(hist_gen)