import logging
import numpy as np
from scipy import stats
import qp
from qp.metrics.array_metrics import quick_anderson_ksamp
from qp.metrics.metrics import calculate_outlier_rate
DEFAULT_QUANTS = np.linspace(0, 1, 100)
[docs]class PIT:
"""PIT(qp_ens, true_vals, eval_grid=DEFAULT_QUANTS)
Probability Integral Transform
Parameters
----------
qp_ens : Ensemble
A collection of N distribution objects
true_vals : [float]
An array-like sequence of N float values representing the known true value for each distribution
eval_grid : [float], optional
A strictly increasing array-like sequence in the range [0,1], by default DEFAULT_QUANTS
Returns
-------
PIT object
An object with an Ensemble containing the PIT distribution, and a full set of PIT samples.
"""
def __init__(self, qp_ens, true_vals, eval_grid=DEFAULT_QUANTS):
"""We will create a quantile Ensemble to store the PIT distribution, but also store the
full set of PIT samples as ancillary data of the (single PDF) ensemble.
Parameters
----------
qp_ens : Ensemble
A collection of N distribution objects
true_vals : [float]
An array-like sequence of N float values representing the known true value for each distribution
eval_grid : [float], optional
A strictly increasing array-like sequence in the range [0,1], by default DEFAULT_QUANTS
"""
self._pit_samps = self._gather_pit_samples(qp_ens, true_vals)
n_pit = np.min([len(self._pit_samps), len(eval_grid)])
if n_pit < len(eval_grid):
logging.warning(
"Number of pit samples is smaller than the evaluation grid size. "
"Will create a new evaluation grid with size = number of pit samples"
)
eval_grid = np.linspace(0, 1, n_pit)
data_quants = np.quantile(self._pit_samps, eval_grid)
self._pit = self._produce_output_ensemble(data_quants, eval_grid)
@property
def pit_samps(self):
"""Returns the PIT samples. i.e. ``CDF(true_vals)`` for each distribution
in the Ensemble used to initialize the PIT object.
Returns
-------
np.array
An array of floats
"""
return self._pit_samps
@property
def pit(self):
"""Return the PIT Ensemble object
Returns
-------
qp.Ensemble
An Ensemble containing 1 qp.quant distribution.
"""
return self._pit
[docs] def evaluate_PIT_anderson_ksamp(self, pit_min=0.0, pit_max=1.0):
"""Use scipy.stats.anderson_ksamp to compute the Anderson-Darling statistic
for the cdf(truth) values by comparing with a uniform distribution between 0 and 1.
Up to the current version (1.9.3), scipy.stats.anderson does not support
uniform distributions as reference for 1-sample test, therefore we create a uniform
"distribution" and pass it as the second value in the list of parameters to the scipy
implementation of k-sample Anderson-Darling.
For details see: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.anderson_ksamp.html
Parameters
----------
pit_min : float, optional
Minimum PIT value to accept, by default 0.
pit_max : float, optional
Maximum PIT value to accept, by default 1.
Returns
-------
array
A array of objects with attributes `statistic`, `critical_values`, and `significance_level`.
For details see:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.anderson_ksamp.html
"""
# Removed the CDF values that are outside the min/max range
trimmed_pit_values = self._trim_pit_values(pit_min, pit_max)
uniform_yvals = np.linspace(pit_min, pit_max, len(trimmed_pit_values))
return quick_anderson_ksamp(trimmed_pit_values, uniform_yvals)
[docs] def evaluate_PIT_CvM(self):
"""Calculate the Cramer von Mises statistic using scipy.stats.cramervonmises using self._pit_samps
compared to a uniform distribution. For more details see:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.cramervonmises.html
Returns
-------
array
A array of objects with attributes `statistic` and `pvalue`
For details see:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.cramervonmises.html
"""
return stats.cramervonmises(self._pit_samps, stats.uniform.cdf)
[docs] def evaluate_PIT_KS(self):
"""Calculate the Kolmogorov-Smirnov statistic using scipy.stats.kstest. For more details see:
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kstest.html
Returns
-------
array
A array of objects with attributes `statistic` and `pvalue`.
For details see: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kstest.html
"""
return stats.kstest(self._pit_samps, stats.uniform.cdf)
[docs] def evaluate_PIT_outlier_rate(self, pit_min=0.0001, pit_max=0.9999):
"""Compute fraction of PIT outliers by evaluating the CDF of the distribution in the PIT Ensemble
at `pit_min` and `pit_max`.
Parameters
----------
pit_min : float, optional
Lower bound for outliers, by default 0.0001
pit_max : float, optional
Upper bound for outliers, by default 0.9999
Returns
-------
float
The percentage of outliers in this distribution given the min and max bounds.
"""
return calculate_outlier_rate(self._pit, pit_min, pit_max)[0]
@classmethod
def _gather_pit_samples(cls, qp_ens, true_vals):
pit_samples = np.squeeze(qp_ens.cdf(np.vstack(true_vals)))
# These two lines set all `NaN` values to 0. This may or may not make sense
# Alternatively if it's better to simply remove the `NaN`, this can be done
# efficiently on line 61 with `data_quants = np.nanquantile(...)`.`
sample_mask = np.isfinite(pit_samples)
pit_samples[~sample_mask] = 0
if not np.all(sample_mask): #pragma: no cover
logging.warning(
"Some PIT samples were `NaN`. They have been replacd with 0."
)
return pit_samples
@classmethod
def _produce_output_ensemble(cls, data_quants, eval_grid):
# Remove duplicates values as well as values outside the range (0,1)
_, unique_indices = np.unique(data_quants, return_index=True)
unique_data_quants = data_quants[unique_indices]
unique_eval_grid = eval_grid[unique_indices]
quant_mask = cls._create_quant_mask(unique_data_quants)
return qp.Ensemble(
qp.quant,
data=dict(
quants=unique_eval_grid[quant_mask],
locs=np.atleast_2d(unique_data_quants[quant_mask]),
),
)
@classmethod
def _create_quant_mask(cls, data_quants):
"""Create a numpy mask such that, when applied only values greater than
0 and less than 1.0 are kept. While this function is fairly simple,
separating it into a small helper method makes testing much easier.
Parameters
----------
data_quants : np.array [float]
An array of values.
Returns
-------
np.array [bool]
The boolean mask
"""
return np.bitwise_and(data_quants > 0.0, data_quants < 1)
def _trim_pit_values(self, cdf_min, cdf_max):
"""Remove and report any cdf(x) that are outside the min/max range.
Parameters
----------
cdf_min : float
The minimum cdf(x) value to accept
cdf_max : float
The maximum cdf(x) value to accept
Returns
-------
clean: [float]
The list of PIT values within the min/max range.
"""
# Create truth mask for pit values between cdf_min and pit max
mask = (self._pit_samps >= cdf_min) & (self._pit_samps <= cdf_max)
# Keep pit values that are within the min/max range
pits_clean = self._pit_samps[mask]
# Determine how many pit values were dropped and warn the user.
diff = len(self._pit_samps) - len(pits_clean)
if diff > 0:
logging.warning("Removed %d PITs from the sample.", diff)
return pits_clean