Source code for qp.metrics.pit

import logging

import numpy as np
from scipy import stats

import qp
from qp.metrics.array_metrics import quick_anderson_ksamp
from qp.metrics.metrics import calculate_outlier_rate

DEFAULT_QUANTS = np.linspace(0, 1, 100)


class PIT:
    """PIT(qp_ens, true_vals, eval_grid=DEFAULT_QUANTS)

    Probability Integral Transform

    Parameters
    ----------
    qp_ens : Ensemble
        A collection of N distribution objects
    true_vals : [float]
        An array-like sequence of N float values representing the known true
        value for each distribution
    eval_grid : [float], optional
        A strictly increasing array-like sequence in the range [0,1],
        by default DEFAULT_QUANTS

    Returns
    -------
    PIT object
        An object with an Ensemble containing the PIT distribution, and a
        full set of PIT samples.

    See the usage sketch at the end of this module for a worked example.
    """

    def __init__(self, qp_ens, true_vals, eval_grid=DEFAULT_QUANTS):
        """We will create a quantile Ensemble to store the PIT distribution,
        but also store the full set of PIT samples as ancillary data of the
        (single PDF) ensemble.

        Parameters
        ----------
        qp_ens : Ensemble
            A collection of N distribution objects
        true_vals : [float]
            An array-like sequence of N float values representing the known
            true value for each distribution
        eval_grid : [float], optional
            A strictly increasing array-like sequence in the range [0,1],
            by default DEFAULT_QUANTS
        """
        self._pit_samps = self._gather_pit_samples(qp_ens, true_vals)

        n_pit = np.min([len(self._pit_samps), len(eval_grid)])
        if n_pit < len(eval_grid):
            logging.warning(
                "Number of pit samples is smaller than the evaluation grid size. "
                "Will create a new evaluation grid with size = number of pit samples"
            )
            eval_grid = np.linspace(0, 1, n_pit)

        data_quants = np.quantile(self._pit_samps, eval_grid)
        self._pit = self._produce_output_ensemble(data_quants, eval_grid)

    @property
    def pit_samps(self):
        """Returns the PIT samples, i.e. ``CDF(true_vals)`` for each
        distribution in the Ensemble used to initialize the PIT object.

        Returns
        -------
        np.array
            An array of floats
        """
        return self._pit_samps

    @property
    def pit(self):
        """Return the PIT Ensemble object.

        Returns
        -------
        qp.Ensemble
            An Ensemble containing 1 qp.quant distribution.
        """
        return self._pit

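    # ------------------------------------------------------------------
    # PIT meta-metrics (added note): the methods below summarize how close
    # the PIT distribution is to uniform (Anderson-Darling, Cramer-von
    # Mises, Kolmogorov-Smirnov) and what fraction of the PIT values land
    # in the extreme tails.
    # ------------------------------------------------------------------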
    def calculate_pit_meta_metrics(self):
        """Convenience method that will calculate all of the PIT meta metrics
        and return them as a dictionary.

        Returns
        -------
        dictionary
            The collection of PIT statistics
        """
        pit_meta_metrics = {}

        pit_meta_metrics["ad"] = self.evaluate_PIT_anderson_ksamp()
        pit_meta_metrics["cvm"] = self.evaluate_PIT_CvM()
        pit_meta_metrics["ks"] = self.evaluate_PIT_KS()
        pit_meta_metrics["outlier_rate"] = self.evaluate_PIT_outlier_rate()

        return pit_meta_metrics

    def evaluate_PIT_anderson_ksamp(self, pit_min=0.0, pit_max=1.0):
        """Use scipy.stats.anderson_ksamp to compute the Anderson-Darling
        statistic for the cdf(truth) values by comparing with a uniform
        distribution between 0 and 1. Up to the current version (1.9.3),
        scipy.stats.anderson does not support uniform distributions as the
        reference for the 1-sample test, therefore we create a uniform
        "distribution" and pass it as the second value in the list of
        parameters to the scipy implementation of k-sample Anderson-Darling.
        For details see:
        https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.anderson_ksamp.html

        Parameters
        ----------
        pit_min : float, optional
            Minimum PIT value to accept, by default 0.
        pit_max : float, optional
            Maximum PIT value to accept, by default 1.

        Returns
        -------
        array
            An array of objects with attributes `statistic`, `critical_values`,
            and `significance_level`. For details see:
            https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.anderson_ksamp.html
        """
        # Remove the CDF values that are outside the min/max range
        trimmed_pit_values = self._trim_pit_values(pit_min, pit_max)

        uniform_yvals = np.linspace(pit_min, pit_max, len(trimmed_pit_values))

        return quick_anderson_ksamp(trimmed_pit_values, uniform_yvals)

    def evaluate_PIT_CvM(self):
        """Calculate the Cramer-von Mises statistic by comparing
        self._pit_samps against a uniform distribution using
        scipy.stats.cramervonmises. For more details see:
        https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.cramervonmises.html

        Returns
        -------
        object
            An object with attributes `statistic` and `pvalue`.
            For details see:
            https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.cramervonmises.html
        """
        return stats.cramervonmises(self._pit_samps, stats.uniform.cdf)

    def evaluate_PIT_KS(self):
        """Calculate the Kolmogorov-Smirnov statistic using scipy.stats.kstest.
        For more details see:
        https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kstest.html

        Returns
        -------
        object
            An object with attributes `statistic` and `pvalue`.
            For details see:
            https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kstest.html
        """
        return stats.kstest(self._pit_samps, stats.uniform.cdf)

    def evaluate_PIT_outlier_rate(self, pit_min=0.0001, pit_max=0.9999):
        """Compute the fraction of PIT outliers by evaluating the CDF of the
        distribution in the PIT Ensemble at `pit_min` and `pit_max`.

        Parameters
        ----------
        pit_min : float, optional
            Lower bound for outliers, by default 0.0001
        pit_max : float, optional
            Upper bound for outliers, by default 0.9999

        Returns
        -------
        float
            The fraction of outliers in this distribution given the min and
            max bounds.
        """
        return calculate_outlier_rate(self._pit, pit_min, pit_max)[0]

    @classmethod
    def _gather_pit_samples(cls, qp_ens, true_vals):
        pit_samples = np.squeeze(qp_ens.cdf(np.vstack(true_vals)))

        # These two lines set all `NaN` values to 0. This may or may not make sense.
        # Alternatively, if it's better to simply remove the `NaN` values, this can
        # be done efficiently in `__init__` with `data_quants = np.nanquantile(...)`.
        sample_mask = np.isfinite(pit_samples)
        pit_samples[~sample_mask] = 0
        if not np.all(sample_mask):  # pragma: no cover
            logging.warning(
                "Some PIT samples were `NaN`. They have been replaced with 0."
            )

        return pit_samples

    @classmethod
    def _produce_output_ensemble(cls, data_quants, eval_grid):
        # Remove duplicate values as well as values outside the range (0,1)
        _, unique_indices = np.unique(data_quants, return_index=True)
        unique_data_quants = data_quants[unique_indices]
        unique_eval_grid = eval_grid[unique_indices]
        quant_mask = cls._create_quant_mask(unique_data_quants)

        return qp.Ensemble(
            qp.quant,
            data=dict(
                quants=unique_eval_grid[quant_mask],
                locs=np.atleast_2d(unique_data_quants[quant_mask]),
            ),
        )

    @classmethod
    def _create_quant_mask(cls, data_quants):
        """Create a numpy mask such that, when applied, only values greater
        than 0 and less than 1 are kept. While this function is fairly simple,
        separating it into a small helper method makes testing much easier.

        Parameters
        ----------
        data_quants : np.array [float]
            An array of values

        Returns
        -------
        np.array [bool]
            The boolean mask
        """
        return np.bitwise_and(data_quants > 0.0, data_quants < 1)

    def _trim_pit_values(self, cdf_min, cdf_max):
        """Remove and report any cdf(x) values that are outside the min/max range.

        Parameters
        ----------
        cdf_min : float
            The minimum cdf(x) value to accept
        cdf_max : float
            The maximum cdf(x) value to accept

        Returns
        -------
        clean : [float]
            The list of PIT values within the min/max range
        """
        # Create a boolean mask for PIT values between cdf_min and cdf_max
        mask = (self._pit_samps >= cdf_min) & (self._pit_samps <= cdf_max)

        # Keep the PIT values that are within the min/max range
        pits_clean = self._pit_samps[mask]

        # Determine how many PIT values were dropped and warn the user.
        diff = len(self._pit_samps) - len(pits_clean)
        if diff > 0:
            logging.warning("Removed %d PITs from the sample.", diff)

        return pits_clean
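

# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original module).
# It assumes qp's scipy-wrapped Gaussian parameterization (`qp.stats.norm`)
# for building the input Ensemble; the grid sizes, scatter, and parameter
# values below are illustrative choices, not qp defaults.
# ---------------------------------------------------------------------------
if __name__ == "__main__":  # pragma: no cover
    rng = np.random.default_rng(42)
    n_dist = 1000

    # Known true values, one per distribution in the Ensemble.
    true_vals = rng.uniform(0.0, 3.0, size=n_dist)

    # Build an Ensemble of N Gaussians whose means scatter around the truth.
    locs = (true_vals + rng.normal(0.0, 0.1, size=n_dist)).reshape(-1, 1)
    scales = np.full((n_dist, 1), 0.1)
    ens = qp.Ensemble(qp.stats.norm, data=dict(loc=locs, scale=scales))

    # Construct the PIT object and inspect its outputs.
    pit = PIT(ens, true_vals)
    print(pit.pit_samps[:5])                 # raw CDF(true_vals) samples
    meta = pit.calculate_pit_meta_metrics()  # keys: "ad", "cvm", "ks", "outlier_rate"
    print(meta["ks"].statistic, meta["outlier_rate"])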