Source code for qp.parameterizations.hist.hist

"""This module implements a distribution parameterization sub-class using histograms"""

from __future__ import annotations
import numpy as np

from scipy.stats import rv_continuous
from typing import Mapping, Optional
from numpy.typing import ArrayLike


import warnings

from .hist_utils import (
    evaluate_hist_x_multi_y,
    extract_hist_values,
    extract_hist_samples,
)
from ..base import Pdf_rows_gen
from ...plotting import get_axes_and_xlims, plot_pdf_histogram_on_axes
from ...utils.array import reshape_to_pdf_size, reduce_dimensions

from ...utils.interpolation import interpolate_multi_x_y, interpolate_x_multi_y

from ...core.factory import add_class
from ...core.ensemble import Ensemble


[docs] class hist_gen(Pdf_rows_gen): """Implements distributions parameterized as histograms. By default, the input distribution is normalized. If the input data is already normalized, you can use the optional parameter ``norm = False`` to skip the normalization process. Parameters ---------- bins : ArrayLike The array containing the (n+1) bin boundaries pdfs : ArrayLike The array containing the (npdf, n) bin values norm : bool, optional If True, normalizes the input distribution. If False, assumes the given distribution is already normalized. By default True. warn : bool, optional If True, raises warnings if input is not valid PDF data (i.e. if data is negative). If False, no warnings are raised. By default True. Notes ----- There must be a minimum of 2 bins. Converting to this parameterization: This table contains the available methods to convert to this parameterization, their required arguments, and their method keys. If the key is `None`, this is the default conversion method. +------------------------+-----------------------------------------------------+------------+ | Function | Arguments | Method key | +------------------------+-----------------------------------------------------+------------+ | `.extract_hist_values` | bins (array of bin edges) | None | +------------------------+-----------------------------------------------------+------------+ | `.extract_hist_samples`| bins (array of bin edges), | samples | | | size (int, optional, number of samples to generate) | | +------------------------+-----------------------------------------------------+------------+ Implementation notes: Inside a given bin `pdf()` will return the `hist_gen.pdfs` value. Outside the range of the given bins `pdf()` will return 0. Inside a given bin `cdf()` will use a linear interpolation across the bin. Outside the range of the given bins `cdf()` will return (0 or 1), respectively. The percentage point function `ppf()` will return negative infinity at 0 and positive infinity at 1. """ # pylint: disable=protected-access name = "hist" version = 0 _support_mask = rv_continuous._support_mask def __init__( self, bins: ArrayLike, pdfs: ArrayLike, norm: bool = True, warn: bool = True, *args, **kwargs, ) -> None: """ Create a new distribution using the given histogram. Parameters ---------- bins : ArrayLike The array containing the (n+1) bin boundaries pdfs : ArrayLike The array containing the (npdf, n) bin values norm : bool, optional If True, normalizes the input distribution. If False, assumes the given distribution is already normalized. By default True. warn : bool, optional If True, raises warnings if input is not valid PDF data (i.e. if data is negative). If False, no warnings are raised. By default True. """ self._hbins = np.squeeze(bins) # make sure bins is 1D self._nbins = self._hbins.size - 1 self._hpdfs = reshape_to_pdf_size(np.asarray(pdfs), -1) # make sure that the bins are sorted if not np.all(np.diff(self._hbins) >= 0): raise ValueError( f"Invalid bins: The given bins are not sorted: {self._hbins}" ) # raise warnings if input data is not finite or pdfs are not positive self._warn = warn if self._warn: if not np.all(np.isfinite(self._hbins)): warnings.warn( f"The given bins contain non-finite values - {self._hbins}", RuntimeWarning, ) if not np.all(np.isfinite(pdfs)): indices = np.where(np.isfinite(pdfs) != True) warnings.warn( f"There are non-finite values in the pdfs for the distributions: {indices[0]}", RuntimeWarning, ) if np.any(self._hpdfs < 0): indices = np.where(self._hpdfs < 0) warnings.warn( f"There are negative values in the pdfs for the distributions: {indices[0]}", RuntimeWarning, ) # check data shapes make sense if np.shape(pdfs)[-1] != self._nbins: # pragma: no cover raise ValueError( "Number of bins (%i) != number of values (%i)" % (self._nbins, np.shape(pdfs)[-1]) ) self._hbin_widths = self._hbins[1:] - self._hbins[:-1] self._xmin = self._hbins[0] self._xmax = self._hbins[-1] # normalize the input data if norm is True self._norm = norm if self._norm: self._hpdfs = self.normalize()["pdfs"] self._hcdfs = None # Set support kwargs["shape"] = self._hpdfs.shape # pdfs.shape super().__init__(*args, **kwargs) self._addmetadata("bins", self._hbins) self._addobjdata("pdfs", self._hpdfs) def _compute_cdfs(self): copy_shape = np.array(self._hpdfs.shape) copy_shape[-1] += 1 self._hcdfs = np.ndarray(copy_shape) self._hcdfs[:, 0] = 0.0 self._hcdfs[:, 1:] = np.cumsum(self._hpdfs * self._hbin_widths, axis=1)
[docs] def normalize(self) -> Mapping[str, np.ndarray[float]]: """Normalizes the input distribution values. Returns ------- Mapping [str, np.ndarray[float]] An (npdf, n) array of pdf values in the n bins for the npdf distributions Raises ------ ValueError Raised if the sum under the distribution <= 0. """ pdfs_2d = self._hpdfs sums = np.sum(pdfs_2d * self._hbin_widths, axis=1) if np.any(sums < 0): indices = np.where(sums < 0) raise ValueError( f"The distribution(s) cannot be properly normalized, the sum of the pdfs is < 0 for distributions at index = {indices[0]} " ) elif np.any(sums == 0): indices = np.where(sums == 0) warnings.warn( f"The distributions(s) with indices {indices[0]} have an integral of 0." ) return {"pdfs": (pdfs_2d.T / sums).T}
@property def bins(self) -> np.ndarray[float]: """Return the histogram bin edges""" return self._hbins @property def pdfs(self) -> np.ndarray[float]: """Return the histogram bin values""" return self._hpdfs
[docs] def x_samples(self) -> np.ndarray[float]: """Return a set of x values that can be used to plot all the PDFs.""" # TODO: possibly add a bin to the left and right? return (self._hbins[:-1] + self._hbins[1:]) / 2
def _pdf(self, x, row): # pylint: disable=arguments-differ pdf = evaluate_hist_x_multi_y(x, row, self._hbins, self._hpdfs).ravel() # reduce dimension to 0 if there's only one value if np.shape(pdf) == (1,) and len(pdf) == 1: return pdf[0] else: return pdf def _cdf(self, x, row): # pylint: disable=arguments-differ if self._hcdfs is None: # pragma: no cover self._compute_cdfs() return interpolate_x_multi_y( x, row, self._hbins, self._hcdfs, bounds_error=False, fill_value=(0.0, 1.0) ).ravel() def _ppf(self, x, row): # pylint: disable=arguments-differ if self._hcdfs is None: # pragma: no cover self._compute_cdfs() return interpolate_multi_x_y( x, row, self._hcdfs, self._hbins, bounds_error=False, fill_value=(self._xmin, self._xmax), ).ravel() def _munp(self, m, *args): """compute moments""" # pylint: disable=arguments-differ # Silence floating point warnings from integration. with np.errstate(all="ignore"): vals = self.custom_generic_moment(m) return vals
[docs] def custom_generic_moment(self, m: ArrayLike) -> np.ndarray[float]: """Compute the mth moment""" m = np.asarray(m) dx = self._hbins[1] - self._hbins[0] xv = 0.5 * (self._hbins[1:] + self._hbins[:-1]) return np.sum(xv**m * self._hpdfs, axis=1) * dx
def _updated_ctor_param(self): """ Set the bins as additional constructor argument """ dct = super()._updated_ctor_param() dct["bins"] = self._hbins dct["pdfs"] = self._hpdfs dct["norm"] = self._norm dct["warn"] = self._warn return dct
[docs] @classmethod def get_allocation_kwds( cls, npdf: int, **kwargs: str ) -> dict[str, tuple[tuple[int, int], str]]: """Return the kwds necessary to create an `empty` HDF5 file with ``npdf`` entries for iterative write. We only need to allocate the data columns, as the metadata will be written when we finalize the file. The number of data columns is calculated based on the length or shape of the metadata, ``n``. For example, the number of columns is ``nbins-1`` for a histogram. Parameters ---------- npdf : int Total number of distributions that will be written out kwargs : The keys needed to construct the shape of the data to be written. Returns ------- dict [ str, tuple [ tuple [int , int], str]] A dictionary with a key for the objdata, a tuple with the shape of that data, and the data type of the data as a string. i.e. ``{objdata_key = ( (npdf, n), "f4" )}`` Raises ------ ValueError Raises an error if the bins is not provided.""" if "bins" not in kwargs: # pragma: no cover raise ValueError("required argument 'bins' not included in kwargs") nbins = len(kwargs["bins"].flatten()) return dict(pdfs=((npdf, nbins - 1), "f4"))
[docs] @classmethod def plot_native(cls, pdf: Ensemble, **kwargs): """Plot the PDF in a way that is particular to this type of distribution For a histogram this shows the bin edges. Parameters ---------- axes : Axes The axes to plot on. Either this or xlim must be provided. xlim : tuple [float, float] The x-axis limits. Either this or axes must be provided. kwargs : Any keyword arguments to pass to matplotlib's `matplotlib.axes.Axes.hist` method. Returns ------- axes : Axes The plot axes. """ axes, _, kw = get_axes_and_xlims(**kwargs) vals = pdf.dist.pdfs[pdf.kwds["row"]] return plot_pdf_histogram_on_axes(axes, hist=(pdf.dist.bins, vals), **kw)
[docs] @classmethod def add_mappings(cls): """ Add this classes mappings to the conversion dictionary """ cls._add_creation_method(cls.create, None) cls._add_extraction_method(extract_hist_values, None) cls._add_extraction_method(extract_hist_samples, "samples")
[docs] @classmethod def create_ensemble( self, bins: ArrayLike, pdfs: ArrayLike, norm: bool = True, warn: bool = True, ancil: Optional[Mapping] = None, ) -> Ensemble: """Creates an Ensemble of distributions parameterized as histograms. Parameters ---------- bins : ArrayLike The array containing the (n+1) bin boundaries pdfs : ArrayLike The array containing the (npdf, n) bin values norm : bool, optional If True, normalizes the input distribution. If False, assumes the given distribution is already normalized. By default True. warn : bool, optional If True, raises warnings if input is not valid PDF data (i.e. if data is negative). If False, no warnings are raised. By default True. ancil : Optional[Mapping], optional A dictionary of metadata for the distributions, where any arrays have length npdf, by default None Returns ------- Ensemble An Ensemble object containing all of the given distributions. Examples -------- To create an Ensemble with two distributions and an 'ancil' table that provides ids for the distributions, you can use the following code: >>> import qp >>> import numpy as np >>> bins= [0,1,2,3,4,5] >>> pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ancil = {'ids': [105, 108]} >>> ens = qp.hist.create_ensemble(bins,pdfs,ancil=ancil) >>> ens.metadata {'pdf_name': array([b'hist'], dtype='|S4'), 'pdf_version': array([0]), 'bins': array([[0, 1, 2, 3, 4, 5]])} """ data = {"bins": bins, "pdfs": pdfs, "norm": norm, "warn": warn} return Ensemble(self, data, ancil)
hist = hist_gen add_class(hist_gen)