Source code for qp.core.ensemble

"""Implementation of an Ensemble of distributions."""

from __future__ import annotations
import os
from typing import Mapping, Optional, Union

import json
import h5py
import numpy as np
import tables_io
from tables_io import hdf5
from typing import Mapping, Optional, Union
from numpy.typing import ArrayLike

from ..utils.dictionary import (
    check_array_shapes,
    compare_dicts,
    concatenate_dicts,
    slice_dict,
    reduce_arrays_to_1d,
    make_len_equal,
    expand_dimensions,
)
from ..utils.array import encode_strings, reduce_dimensions
from ..metrics import quick_moment
from ..parameterizations.base import Pdf_gen

# import psutil
# import timeit


[docs] class Ensemble: """An object comprised of one or more distributions with the same parameterization. The Ensemble allows you to perform operations on the group of parameterizations as a whole. An Ensemble has three main data components, the last of which is optional: 1. The metadata: this contains information about the parameterization, and the coordinates of the parameterization. 2. The object data: this contains the data that is unique to each distribution, for example the values that correspond to the coordinates. 3. The ancillary data (optional): this contains data points where there is one data point for each distribution in the ensemble. There can be many of these columns or arrays in the ancillary data table. Parameters ---------- the_class : Pdf_gen subclass The class to use to parameterize the distributions data : Mapping Dictionary with data used to construct the ensemble. The keys required vary for different parameterizations. ancil : Optional[Mapping] Dictionary with ancillary data, by default None method : Optional[str] The key for the creation method to use, by default None Examples -------- >>> import qp >>> import numpy as np >>> data = {'bins': [0,1,2,3,4,5], ... 'pdfs': np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ancil = {'ids': [105, 108]}} >>> ens = qp.Ensemble(qp.hist,data,ancil) >>> ens.metadata {'pdf_name': array([b'hist'], dtype='|S4'), 'pdf_version': array([0]), 'bins': array([[0, 1, 2, 3, 4, 5]])} """ def __init__( self, the_class: Pdf_gen, data: Mapping, ancil: Optional[Mapping] = None, method: Optional[str] = None, ): """Class constructor. The requirements are the class object that the ensemble is to be parameterized as, and the data dictionary. The data dictionary will need different keys depending on what parameterization you have chosen. If you are unsure which keys are required, try ``qp.[parameterization].create_ensemble?``, where [parameterization] is the class of ensemble you wish to create. This will output a docstring with which describes the necessary inputs (and this function can also be used to create an ensemble instead). An ancillary data dictionary can also be provided upon creation. This dictionary should contain arrays that are the same length as the number of distributions in the ensemble. Essentially, this should include arrays of data where each value in the array corresponds to a distribution. Parameters ---------- the_class : Pdf_gen subclass The class to use to parameterize the distributions data : Mapping Dictionary with data used to construct the ensemble. The keys required vary for different parameterizations. ancil : Optional[Mapping] Dictionary with ancillary data, by default None method : Optional[str] The key for the creation method to use, by default None """ # start_time = timeit.default_timer() self._gen_func = the_class.creation_method(method) self._frozen = self._gen_func(**data) self._gen_obj = self._frozen.dist self._gen_class = type(self._gen_obj) self._ancil = None self.set_ancil(ancil) self._gridded = None self._samples = None def __repr__(self) -> str: class_name = type(self).__name__ return f"{class_name}(the_class={self._gen_class.name},shape={self.shape})" def __len__(self) -> int: return self.npdf def __getitem__(self, key: Union[int, slice]) -> Ensemble: """Build an Ensemble object for a sub-set of the distributions in this ensemble Parameters ---------- key : Union [int , slice] Used to slice the data to pick out one distribution from this ensemble Returns ------- ens : Ensemble The ensemble for the requested distribution or slice of distributions """ red_data = {} md = self.metadata md.pop("pdf_name") md.pop("pdf_version") for k, v in md.items(): red_data[k] = np.squeeze(v) if self.npdf > 1: dd = slice_dict(self.objdata, key) elif self.npdf == 1 and key == 0: dd = self.objdata else: raise IndexError( f"Cannot slice Ensemble object with {self.npdf} with given index/slice {key}." ) for k, v in dd.items(): if len(np.shape(v)) < 2: red_data[k] = np.expand_dims(v, 0) else: red_data[k] = v if self._ancil is not None and self.npdf > 1: ancil = slice_dict(self._ancil, key) elif self._ancil is not None and self.npdf == 1: ancil = self._ancil else: ancil = None return Ensemble(self._gen_obj, data=red_data, ancil=ancil) @property def gen_func(self): """Return the function used to create the distribution object for this ensemble""" return self._gen_func @property def gen_class(self): """Return the class used to generate distributions for this ensemble""" return self._gen_class @property def dist(self): """Return the `scipy.stats.rv_continuous` object that generates distributions for this ensemble""" return self._gen_obj @property def kwds(self): """Return the kwds associated to the frozen object for this ensemble""" return self._frozen.kwds @property def gen_obj(self): """Return the `scipy.stats.rv_continuous` object that generates distributions for this ensemble""" return self._gen_obj @property def frozen(self): """Return the `scipy.stats.rv_frozen` object that encapsulates the distributions for this ensemble""" return self._frozen @property def ndim(self) -> int: """Return the number of dimensions of distributions in this ensemble.""" return self._frozen.ndim @property def shape(self) -> tuple: """Return the shape of distributions in this ensemble.""" return self._frozen.shape @property def npdf(self) -> int: """Return the number of distributions in this ensemble.""" return self._frozen.npdf @property def ancil(self) -> Mapping: """Return the ancillary data dictionary for this ensemble.""" return self._ancil
[docs] def x_samples( self, min: float = 0.0, max: float = 5.0, n: Optional[int] = 1000 ) -> np.ndarray[float]: """Return an array of x values that can be used to plot all the distributions in the Ensemble. This is meant to plot the characteristic distribution for an Ensemble of discrete data. For example, for an ensemble of histograms that would be the PDF, and for an ensemble of quantiles that would be the CDF. Analytic parameterizations like `mixmod <qp.mixmod_gen>` or `scipy.stats.norm` will just return a `np.linspace(min,max,n) <numpy.linspace>`, and it's recommended you input the values as the defaults are the same for all analytic distributions. Parameters ---------- min : float, optional The minimum x value to be used if the parameterization doesn't have an `x_samples` method or is analytic, by default 0. max : float, optional The maximum x value to be used if the parameterization doesn't have an `x_samples` method or is analytic, by default 5. n : Optional[int], optional The number of points to be used if the parameterization doesn't have an `x_samples` method or is analytic, by default 1000 Returns ------- xs : np.ndarray[float] The array of points to use. """ try: return self._frozen.dist.x_samples() except: return np.linspace(min, max, n)
[docs] def convert_to(self, to_class: Pdf_gen, **kwargs: str) -> Ensemble: """Convert this ensemble to the given parameterization class. To see the available conversion methods for the your chosen parameterization and their required arguments, check the docstrings for ``qp.to_class``. If the parameterization class doesn't have a conversion methods table, then it will not be possible to convert to that class. Parameters ---------- to_class : Pdf_gen subclass Parameterization class to convert to **kwargs : Keyword arguments that are passed to the output class constructor Other Parameters ---------------- method : str Optional argument to specify a non-default conversion algorithm Returns ------- ens : Ensemble Ensemble of distributions of type class_to using the data from this object Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]])) >>> ens_i = ens_h.convert_to(qp.interp, xvals=np.linspace(0,5,10)) >>> ens_i.metadata {'pdf_name': array([b'interp'], dtype='|S6'), 'pdf_version': array([0]), 'xvals': array([0. , 0.55555556, 1.11111111, 1.66666667, 2.22222222, 2.77777778, 3.33333333, 3.88888889, 4.44444444, 5. ]))} """ kwds = kwargs.copy() method = kwds.pop("method", None) ctor_func = to_class.creation_method(method) class_name = to_class.name if ctor_func is None: # pragma: no cover raise KeyError( "Class named %s does not have a creation_method named %s" % (class_name, method) ) extract_func = to_class.extraction_method(method) if extract_func is None: # pragma: no cover raise KeyError( "Class named %s does not have a extraction_method named %s" % (class_name, method) ) data = extract_func(self, **kwds) return Ensemble(to_class, data=data, method=method)
[docs] def update(self, data: Mapping, ancil: Optional[Mapping] = None) -> None: """Update the frozen distribution object with the given data, and set the ancillary data table with ``ancil`` if given. Parameters ---------- data : Mapping Dictionary with data used to construct the ensemble, including metadata. ancil : Optional[Mapping], optional Optional dictionary that contains data for each of the distributions in the ensemble, by default None. Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([0,0.1,0.1,0.4,0.2])) >>> ens_h.update(data={'bins': np.array([1,2,3,4,5]), 'pdfs': np.array([0.1,0.1,0.4,0.2])}) >>> ens_h.metadata {'pdf_name': array([b'hist'], dtype='|S4'), 'pdf_version': array([0]), 'bins': array([[1, 2, 3, 4, 5]])} """ self._frozen = self._gen_func(**data) self._gen_obj = self._frozen.dist self.set_ancil(ancil) self._gridded = None self._samples = None
[docs] def update_objdata(self, data: Mapping, ancil: Optional[Mapping] = None) -> None: """Updates the objdata in the frozen distribution, and sets the ancillary data table if given. Parameters ---------- data : Mapping Dictionary with the object data that will be used to reconstruct the ensemble ancil : Optional[Mapping], optional Optional dictionary that contains data for each of the distributions in the ensemble, by default None. Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([0,0.1,0.1,0.4,0.2])) >>> ens_h.objdata {'pdfs': array([0. , 0.125, 0.125, 0.5 , 0.25 ])} >>> ens_h.update_objdata(data={'pdfs': np.array([0.05,0.09,0.2,0.3,0.15])}) >>> ens_h.objdata {'pdfs': array([[0.06329114, 0.11392405, 0.25316456, 0.37974684, 0.18987342]])} """ new_data = {} for k, v in self.metadata.items(): if k in ["pdf_name", "pdf_version"]: continue new_data[k] = np.squeeze(v) new_data.update(self.objdata) new_data.update(data) self.update(new_data, ancil)
@property def metadata(self) -> Mapping: """Return the metadata for this ensemble. Metadata are elements that are the same for all the distributions in the ensemble. These include the name and version of the distribution generation class Returns ------- metadata : Mapping The dictionary of the metadata. """ dd = {} dd.update(self._gen_obj.metadata) return dd @property def objdata(self) -> Mapping: """Return the data for this ensemble. These are the elements that differ for each distribution in the ensemble. For example, the data points that correspond to each of the coordinates given in the metadata. Returns ------- objdata : Mapping The object data Notes ----- If the distribution normalized the data (which many do by default), this will return the normalized data and not the original input data. """ dd = {} dd.update(self._frozen.kwds) dd.pop("row", None) dd.update(self._gen_obj.objdata) # if there is only one distribution reshape data as necessary if self.npdf == 1: dd = reduce_arrays_to_1d(dd) return dd
[docs] def set_ancil(self, ancil: Mapping) -> None: """Set the ancillary data dictionary. The arrays in this dictionary must have one row for each of the distributions, which means that the length of these arrays (or the first dimension) must be the same as the number of distributions in the ensemble. Parameters ---------- ancil : Mapping The ancillary data dictionary. Raises ------ IndexError If the length of the arrays in ancil does not match the number of distributions in the Ensemble. Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]])) >>> ancil = {'ids': np.array([5,7])} >>> ens_h.set_ancil(ancil) >>> ens_h.ancil {'ids': array([5, 7])} """ check_array_shapes(ancil, self.npdf) self._ancil = ancil
[docs] def add_to_ancil(self, to_add: Mapping) -> None: # pragma: no cover """Add additional columns to the ancillary data dictionary. The ancil dictionary must already exist. If it does not, use `set_ancil`. If any of these columns have the same name as already existing ancillary data columns, the new columns will overwrite the old ones. Parameters ---------- to_add : Mapping The columns to add to the ancillary data dict Raises ------ IndexError If the length of the arrays in to_add does not match the number of distributions in the Ensembles Examples -------- >>> import qp >>> import numpy as np >>> ancil = {'ids': np.array([5,7])} >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]), ancil=ancil) >>> ens_h.add_to_ancil({'means':np.array([0.2,0.25])}) >>> ens_h.ancil {'ids': array([5, 7]), 'means': array[0.2,0.25]} """ check_array_shapes(to_add, self.npdf) self._ancil.update(to_add)
[docs] def append(self, other_ens: Ensemble) -> None: """Append another ensemble to this ensemble. The ensembles must be of the same parameterization, or this will not work. They must also have the same metadata, so for example if they are both histograms they must also have the same bins. Both ensembles must have an ancillary data dictionary in order for them to be appended to each other. If one ensemble has an ancillary data dictionary and the other does not, this will set the ancillary data dictionary to `None`. Parameters ---------- other_ens : Ensemble The ensemble to append to this one. Raises ------ KeyError Raised if the two ensembles do not have matching metadata. Examples -------- >>> import qp >>> import numpy as np >>> ens_1 = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([0,0.1,0.1,0.4,0.2])) >>> ens_2 = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([0.5,0.15,0.25,0.45,0.1])) >>> ens_1.append(ens_2) >>> ens_1.npdf 2 """ if not compare_dicts([self.metadata, other_ens.metadata]): # pragma: no cover raise KeyError("Metadata does not match, can not append") full_objdata = concatenate_dicts([self.objdata, other_ens.objdata]) if self._ancil is not None and other_ens.ancil is not None: # pragma: no cover full_ancil = concatenate_dicts([self.ancil, other_ens.ancil]) else: full_ancil = None self.update_objdata(full_objdata, full_ancil)
[docs] def build_tables(self, encode: bool = False, ext: Optional[str] = None) -> Mapping: """Returns a dictionary of dictionaries of numpy arrays for the meta data, object data, and the ancillary data (if it exists) for this ensemble. Parameters ---------- encode : bool If True and `ext` is 'hdf5', will encode any string columns in the `ancil` table, by default False. ext : str, optional If set to 'hdf5' when `encode` is True, will encode any string columns in the `ancil` table, by default None. Returns ------- data : Mapping, `tables_io.TableDict-like` The dictionary with the data. Has the keys: ``meta`` for metadata, ``data`` for object data, and optionally ``ancil`` for ancillary data. """ meta = make_len_equal(self.metadata) dd = dict(meta=meta, data=self.objdata) # expand out the objdata to 2D arrays if there's only 1 distribution if self.npdf == 1: new_objdata = expand_dimensions(dd["data"], self.npdf, self.shape[1]) dd.update(dict(data=new_objdata)) if self.ancil is not None: # encode any string columns if the file will be hdf5 if encode == True and ext == "hdf5": ancil_tmp = encode_strings(self.ancil) dd["ancil"] = ancil_tmp else: dd["ancil"] = self.ancil return dd
[docs] def norm(self): """Normalizes the input distribution data if it represents a PDF and can be normalized. Raises ------ AttributeError Raised if the parameterization doesn't have a normalization method. """ # get normalized data values try: normed = self._gen_obj.normalize() except AttributeError as err: raise AttributeError( "This parameterization does not have a normalization function." ) from err except RuntimeError as err: raise err # update ensemble objdata with normalized values d_keys = list(self.objdata.keys()) # add in any data from current ensemble which was unchanged for key in d_keys: if not key in normed.keys(): normed[key] = self.objdata[key] self.update_objdata(data=normed, ancil=self.ancil)
[docs] def mode(self, grid: ArrayLike) -> ArrayLike: """Return the mode of each ensemble distribution, evaluated on the given grid. Parameters ---------- grid : ArrayLike Grid on which to evaluate distribution Returns ------- mode : ArrayLike The modes of the distributions evaluated on grid, with shape (npdf, 1) """ new_grid, griddata = self.gridded(grid) return np.expand_dims(new_grid[np.argmax(griddata, axis=1)], -1)
[docs] def gridded(self, grid: ArrayLike) -> tuple[ArrayLike, ArrayLike]: """Build, cache and return the PDF values at the given grid points. If the given grid matches the already cached grid, then this just returns the cached value. Parameters ---------- grid : ArrayLike The grid points to evaluate the PDF at. Returns ------- gridded : tuple [ ArrayLike, ArrayLike ] (grid, pdf_values) """ if self._gridded is None or not np.array_equal(self._gridded[0], grid): self._gridded = (grid, self.pdf(grid)) return self._gridded
[docs] def write_to(self, filename: str) -> None: """Write this ensemble to a file. The file type can be any of the those supported by tables_io. File type is indicated by the suffix of the file name given. Allowed formats are: 'hdf5','h5','hf5','hd5','fits','fit','pq','parq','parquet' If writing to parquet files, a file will be written for the metadata, the object data, and the ancillary data if it exists, where the identifying key is added to the filename. Parameters ---------- filename : str Examples -------- >>> import qp >>> import numpy as np >>> ens_1 = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([0,0.1,0.1,0.4,0.2])) >>> ens_1.write_to("hist-ensemble.hdf5") """ basename, ext = os.path.splitext(filename) tables = self.build_tables(encode=True, ext=ext[1:]) tables_io.write(tables, basename, ext[1:])
[docs] def to_json(self) -> dict[str, str]: """Convert this ensemble to a json string """ tables = self.build_tables() # fix the type to make json happier tables['meta']['pdf_name'] = np.array([tables['meta']['pdf_name'][0].decode()]) json_dict = tables_io.convert(tables, tables_io.types.JSON_STRING) return json_dict
[docs] def pdf(self, x: ArrayLike) -> ArrayLike: """ Evaluates the probability density function (PDF) for each of the distributions in the ensemble Parameters ---------- x : ArrayLike Location(s) at which to evaluate the PDF for each distribution. Returns ------- pdf : ArrayLike The PDF value(s) at the given location(s). Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.pdf(np.linspace(3,6,6)) array([[0.5 , 0.5 , 0.25 , 0.25 , 0. , 0. ], [0.37974684, 0.37974684, 0.18987342, 0.18987342, 0. , 0. ]]) """ pdf = self._frozen.pdf(x) # reduce dimensionality if possible if self.npdf == 1: pdf = reduce_dimensions(pdf, x) return pdf
# return self._frozen.pdf(x)
[docs] def logpdf(self, x: ArrayLike) -> ArrayLike: """ Evaluates the log of the probability density function (PDF) for each of the distributions in the ensemble. Parameters ---------- x : ArrayLike Location(s) at which to do the evaluations Returns ------- logpdf : ArrayLike The log of the PDF at the given location(s) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.logpdf(np.linspace(3,6,6)) array([[-0.69314718, -0.69314718, -1.38629436, -1.38629436, -inf, -inf], [-0.96825047, -0.96825047, -1.66139765, -1.66139765, -inf, -inf]]) """ logpdf = self._frozen.logpdf(x) # reduce dimensionality if possible if self.npdf == 1: logpdf = reduce_dimensions(logpdf, x) return logpdf
[docs] def cdf(self, x: ArrayLike) -> ArrayLike: """ Evaluates the cumulative distribution function (CDF) for each of the distributions in the ensemble. Parameters ---------- x : ArrayLike Location(s) at which to do the evaluations Returns ------- cdf : ArrayLike The CDF at the given location(s) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.cdf(np.linspace(3,6,6)) array([[0.25 , 0.55 , 0.8 , 0.95 , 1. , 1. ], [0.43037975, 0.65822785, 0.84810127, 0.96202532, 1. , 1. ]]) """ cdf = self._frozen.cdf(x) # reduce dimensionality if possible if self.npdf == 1: cdf = reduce_dimensions(cdf, x) return cdf
[docs] def logcdf(self, x: ArrayLike) -> ArrayLike: """ Evaluates the log of the cumulative distribution function (CDF) for each of the distributions in the ensemble. Parameters ---------- x : ArrayLike Location(s) at which to do the evaluations Returns ------- cdf : ArrayLike The log of the CDF at the given location(s) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.logcdf(np.linspace(3,6,6)) array([[-1.38629436, -0.597837 , -0.22314355, -0.05129329, 0. , 0. ], [-0.84308733, -0.41820413, -0.16475523, -0.03871451, 0. , 0. ]]) """ logcdf = self._frozen.logcdf(x) # reduce dimensionality if possible if self.npdf == 1: logcdf = reduce_dimensions(logcdf, x) return logcdf
[docs] def ppf(self, q: ArrayLike) -> ArrayLike: """ Evaluates the percentage point function (PPF) for each of the distributions in the ensemble.. Parameters ---------- q : ArrayLike Location(s) at which to do the evaluations Returns ------- ppf : ArrayLike The PPF at the given location(s) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.ppf(0.5) array([[3.5 ], [3.18333333]]) """ ppf = self._frozen.ppf(q) # reduce dimensionality if possible if self.npdf == 1: ppf = reduce_dimensions(ppf, q) return ppf
[docs] def sf(self, q: ArrayLike) -> ArrayLike: """ Evaluates the survival fraction (SF) for each of the distributions in the ensemble. Parameters ---------- q : ArrayLike Location(s) at which to evaluate the distributions Returns ------- sf : ArrayLike The SF at the given location(s) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.sf(0.5) array([[1. ], [0.96835443]]) """ sf = self._frozen.sf(q) # reduce dimensionality if possible if self.npdf == 1: sf = reduce_dimensions(sf, q) return sf
[docs] def logsf(self, q: ArrayLike) -> ArrayLike: """Evaluates the log of the survival function (SF) for each of the distributions in the ensemble. Parameters ---------- q : ArrayLike Location(s) at which to evaluate the distributions Returns ------- sf : ArrayLike The log of the SF at the given location(s) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.logsf(0.5) array([[ 0. ], [-0.03215711]]) """ logsf = self._frozen.logsf(q) # reduce dimensionality if possible if self.npdf == 1: logsf = reduce_dimensions(logsf, q) return logsf
[docs] def isf(self, q: ArrayLike) -> ArrayLike: """ Evaluates the inverse of the survival fraction (SF) for each of the distributions in the ensemble. Parameters ---------- q : ArrayLike Location(s) at which to evaluate the distributions Returns ------- sf : ArrayLike The inverse of the survival fraction at the given location(s) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.isf(0.5) array([[3.5 ], [3.18333333]]) """ isf = self._frozen.isf(q) # reduce dimensionality if possible if self.npdf == 1: isf = reduce_dimensions(isf, q) return isf
[docs] def rvs( self, size: int = 1, random_state: Union[None, int, np.random.Generator] = None, ) -> ArrayLike: """ Generate samples from the distributions in this ensemble. The returned samples are of shape (npdf, size), where size is the number of samples per distribution. Parameters ---------- size : int, optional Number of samples to return, by default 1. random_state : int, numpy.random.Generator, None, optional The random state to use. Can be provided with a random seed for consistency. By default None. Returns ------- samples : ArrayLike The array of samples for each distribution in the ensemble, shape (npdf,size) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.rvs(size=2) array([[3.12956247, 3.72090937], [4.96783836, 3.24016123]]) """ return self._frozen.rvs( size=(self._frozen.npdf, size), random_state=random_state )
[docs] def stats(self, moments: str = "mv") -> tuple[ArrayLike, ...]: """ Return some statistics for each of the distributions in this ensemble. The moments to be returned are determined by the string given to `moments`, where each letter represents a specific moment. The options are: "m" = mean, "v" = variance, "s" = (Fisher's) skew, "k" = (Fisher's) kurtosis. Parameters ---------- moments : str, optional Which moments to include, by default "mv" Returns ------- stats : tuple[ArrayLike, ... ] A sequence of arrays of the moments requested, where the shape of the arrays is (npdf, 1) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.stats() (array([[3.375 ], [3.01898734]]), array([[0.859375 ], [1.23698125]])) """ return self._frozen.stats(moments=moments)
[docs] def median(self) -> ArrayLike: """Return the median for each of the distributions in this ensemble. Returns ------- medians : ArrayLike The median for each distribution, returns a float if there is only one distribution, or the shape of the array is (npdf, 1) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.median() array([[3.5 ], [3.18333333]]) """ median = self._frozen.median() # reduce dimensionality if possible if self.npdf == 1: median = reduce_dimensions(median, 1) return median
[docs] def mean(self) -> ArrayLike: """Return the mean for each of the distributions in this ensemble. Returns ------- means : ArrayLike The mean for each distribution, returns a float if there is only one distribution, or the shape of the array is (npdf, 1) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.mean() array([[3.375 ], [3.01898734]]) """ mean = self._frozen.mean() # reduce dimensionality if possible if self.npdf == 1: mean = reduce_dimensions(mean, 1) return mean
[docs] def var(self) -> ArrayLike: """Return the variance for each of the distributions in this ensemble. Returns ------- variances : ArrayLike The variance for each distribution, returns a float if there is only one distribution, or the shape of the array is (npdf, 1) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.var() array([[0.859375 ], [1.23698125]]) """ var = self._frozen.var() # reduce dimensionality if possible if self.npdf == 1: var = reduce_dimensions(var, 1) return var
[docs] def std(self) -> ArrayLike: """Return the standard deviation for each of the distributions in this ensemble. Returns ------- stds : ArrayLike The standard deviations for each distribution, the shape of the array is (npdf, 1) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.std() array([[0.92702481], [1.11219659]]) """ std = self._frozen.std() # reduce dimensionality if possible if self.npdf == 1: std = reduce_dimensions(std, 1) return std
[docs] def moment(self, n: int) -> ArrayLike: """Return the nth moment for each of the distributions in this ensemble. Parameters ---------- n : int The order of the moment Returns ------- moments : ArrayLike The nth moment for each distribution, the shape of the array is (npdf, 1) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.moment(2) array([[12.25 ], [10.35126582]]) """ moment = self._frozen.moment(n) # reduce dimensionality if possible if self.npdf == 1: moment = reduce_dimensions(moment, 1) return moment
[docs] def entropy(self) -> ArrayLike: """Return the differential entropy for each of the distributions in this ensemble. Returns ------- entropy : ArrayLike The entropy for each distribution, the shape of the array is (npdf, 1) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.entropy() array([[1.21300757], [1.45307405]]) """ entropy = self._frozen.entropy() # reduce dimensionality if possible if self.npdf == 1: entropy = reduce_dimensions(entropy, 1) return entropy
# def pmf(self, k): # """ Return the kth pmf for this ensemble """ # return self._frozen.pmf(k) # def logpmf(self, k): # """ Return the log of the kth pmf for this ensemble """ # return self._frozen.logpmf(k)
[docs] def interval(self, alpha: ArrayLike) -> tuple[ArrayLike, ...]: """ Return the intervals corresponding to a confidence level of `alpha` for each of the distributions in this ensemble. Parameters ---------- alpha : ArrayLike The array of values to return intervals for. These should be the probability that a random variable will be drawn from the returned range. Each value should be in the range [0,1]. Returns ------- interval : tuple[ArrayLike, ...] A tuple of the arrays containing the intervals for each distribution, where the shape of the arrays is (npdf, len(alpha)) Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.interval(alpha=[0,0.5,0.9]) (array([[1.4 , 3. , 3.5 ], [0.79 , 2.2875 , 3.18333333]]), array([[3.5 , 4. , 4.8 ], [3.18333333, 3.84166667, 4.73666667]])) """ return self._frozen.interval(alpha)
[docs] def histogramize(self, bins: ArrayLike) -> tuple[ArrayLike]: """ Computes integrated histogram bin values for all distributions in the ensemble. Parameters ---------- bins : ArrayLike Array of N+1 endpoints of N bins Returns ------- histogram: tuple[ArrayLike, ArrayLike] The first array in the tuple is the bin edges that were input. The second array in the tuple is an (npdf, N) array of the values in the bins. Examples -------- >>> import qp >>> import numpy as np >>> ens_h = qp.hist.create_ensemble(bins= np.array([0,1,2,3,4,5]), ... pdfs = np.array([[0,0.1,0.1,0.4,0.2],[0.05,0.09,0.2,0.3,0.15]]) >>> ens_h.histogramize(bins=np.array([1,2,3,4,5])) (array([1, 2, 3, 4, 5]), array([[0.125 , 0.125 , 0.5 , 0.25 ], [0.11392405, 0.25316456, 0.37974684, 0.18987342]])) """ return self._frozen.histogramize(bins)
[docs] def integrate( self, limits: tuple[Union[float, ArrayLike], Union[float, ArrayLike]] ) -> ArrayLike: """ Computes the integral under the probability distribution functions (PDFs) of the distributions in the ensemble between the given limits. Parameters ---------- limits : tuple[Union[float, ArrayLike], Union[float, ArrayLike]] A tuple with the limits of integration, where the first object in the tuple is the lower limit, and the second object is the upper limit. The limit objects can be floats or arrays, where the number of limits is the length of those arrays, or `nlimits`. Returns ------- integral: ArrayLike Value of the integral(s), with the shape (npdf, nlimits) """ return self.cdf(limits[1]) - self.cdf(limits[0])
[docs] def mix_mod_fit(self, comps=5): # pragma: no cover """ Fits the parameters of a given functional form to an approximation Parameters ---------- comps : int, optional Number of components to consider using : str, optional Which existing approximation to use, defaults to first approximation vb : bool Report progress Returns ------- self.mix_mod: list [ `qp.Composite` ] List of `qp.Composite` objects approximating the PDFs Notes ----- Currently only supports mixture of Gaussians """ raise NotImplementedError("mix_mod_fit %i" % comps)
[docs] def moment_partial(self, n: int, limits: tuple, dx: float = 0.01) -> ArrayLike: """Return the nth moment over a particular range for each of the distributions in this ensemble. Parameters ---------- n : int The order of the moment to return limits : tuple The range over which to calculate the moment, where the second number is the upper limit. dx : float, optional The distance between grid points when calculating, by default 0.01 Returns ------- ArrayLike Array of the moments for each of the distributions, with shape (npdf,) """ D = int((limits[-1] - limits[0]) / dx) grid = np.linspace(limits[0], limits[1], D) # dx = (limits[-1] - limits[0]) / (D - 1) P_eval = self.gridded(grid)[1] grid_to_n = grid**n return quick_moment(P_eval, grid_to_n, dx)
[docs] def plot( self, key: Union[int, slice] = 0, **kwargs: str, ): """Plot the selected distribution as a curve. Parameters ---------- key : int or slice, optional The index or slice of the distribution or distributions from this ensemble to plot, by default 0. Other Parameters ---------------- axes : Axes The axes to plot on. Either this or xlim must be provided. xlim : (float, float) The x-axis limits. Either this or axes must be provided. kwargs : Any keyword arguments to pass to matplotlib's axes.plot() method. Returns ------- axes : Axes The plot axes """ return self._gen_class.plot(self[key], **kwargs)
[docs] def plot_native(self, key: Union[int, slice] = 0, **kwargs: str): """Plot the selected distribution in the default format for this parameterization. To find what arguments are required for specific parameterizations, you can check the docstrings of ``qp.[parameterization].plot_native``, where ``[parameterization]`` is the parameterization class for the current ensemble. Parameters ---------- key : int or slice, optional The index or slice of the distribution or distributions from this ensemble to plot, by default 0. kwargs : The keyword arguments to pass to the parameterization's plot_native method. Returns ------- axes : Axes The plot axes """ return self._gen_class.plot_native(self[key], **kwargs)
def _get_allocation_kwds(self, npdf: int) -> Mapping: tables = self.build_tables() keywords = {} for group, tab in tables.items(): if group != "meta": keywords[group] = {} for key, array in tab.items(): shape = list(array.shape) shape[0] = npdf keywords[group][key] = (shape, array.dtype) return keywords
[docs] def initializeHdf5Write( self, filename: str, npdf: int, comm=None ) -> tuple[dict[str, h5py.File | h5py.Group], h5py.File]: """Set up the output write for an ensemble, but set size to npdf rather than the size of the ensemble, as the "initial chunk" will not contain the full data Parameters ---------- filename : str Name of the file to create npdf : int Total number of distributions that the file will contain, usually larger then the size of the current ensemble comm : MPI communicator Optional MPI communicator to allow parallel writing Returns ------- group : dict[str, h5py.File | h5py.Group] A dictionary of the groups to write to. fout : h5py.File The output file object that has been created. """ kwds = self._get_allocation_kwds(npdf) group, fout = hdf5.initialize_HDF5_write(filename, comm=comm, **kwds) return group, fout
[docs] def writeHdf5Chunk( self, fname: "h5py.File" | "h5py.Group", start: int, end: int ) -> None: """Write a chunk of the ensemble data to file. This will write the data for the distributions in the slice from [start:end] to the file. This includes the ancillary data table. Parameters ---------- fname : h5py.File | h5py.Group The file or group object to write to start : int Starting index of data to write in the h5py file end : int Ending index of data to write in the h5py file """ odict = self.build_tables(encode=True, ext="hdf5").copy() odict.pop("meta") hdf5.write_dict_to_HDF5_chunk(fname, odict, start, end)
[docs] def finalizeHdf5Write(self, filename: "h5py.File" | "h5py.Group") -> None: """Write ensemble metadata to the output file and close the file. Parameters ---------- filename : h5py.File | h5py.Group The file or group object to complete writing and close. """ mdata = make_len_equal(self.metadata) hdf5.finalize_HDF5_write(filename, "meta", **mdata)