qp practical example

Alex Malz, Phil Marshall, Eric Charles

In this notebook we use the qp module to study some photo-Z PDFs.

Setup, reading ensemble

Imports

First let’s import the packages we will need for this notebook

import numpy as np
import os

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline

import qp

Data files

Now lets download the data files we will need, if we haven’t already

# base_url = 'https://slac.stanford.edu/~echarles/qp_example'
# if not os.path.exists('qp_test_ensemble.hf5'):
#     os.system('curl -o %s -OL %s/%s' % ('qp_test_ensemble.hf5', base_url, 'qp_test_ensemble.hf5'))

Reading ensemble

Now we read the ensemble, note that we only need the name of the data file, the name of the metadata file is assumed.

#ens = qp.read('qp_test_ensemble.hf5')
QP_DIR = os.path.abspath(os.path.dirname(qp.__file__))
data_file = os.path.join(QP_DIR, 'data', 'test.hdf5')
ens = qp.read(data_file)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
File ~/checkouts/readthedocs.org/user_builds/qp/envs/main/lib/python3.10/site-packages/tables_io/io_utils/read.py:233, in read_native(filepath, fmt, keys, allow_missing_keys, slice_dict, **kwargs)
    232 try:
--> 233     return read_HDF5_to_dicts(filepath, keys=keys, slice_dict=slice_dict)
    234 except Exception as e:

File ~/checkouts/readthedocs.org/user_builds/qp/envs/main/lib/python3.10/site-packages/tables_io/io_utils/read.py:738, in read_HDF5_to_dicts(filepath, keys, slice_dict)
    718 """
    719 Reads `numpy.array` objects into an `OrderedDict` from an hdf5 file. If a list of keys is given,
    720 will only read those specific datasets.
   (...)
    736     The data
    737 """
--> 738 fin = h5py.File(filepath)
    739 l_out = []

File ~/checkouts/readthedocs.org/user_builds/qp/envs/main/lib/python3.10/site-packages/h5py/_hl/files.py:555, in File.__init__(self, name, mode, driver, libver, userblock_size, swmr, rdcc_nslots, rdcc_nbytes, rdcc_w0, track_order, fs_strategy, fs_persist, fs_threshold, fs_page_size, page_buf_size, min_meta_keep, min_raw_keep, locking, alignment_threshold, alignment_interval, meta_block_size, track_times, **kwds)
    552     fcpl = make_fcpl(track_order=track_order, track_times=track_times,
    553                      fs_strategy=fs_strategy, fs_persist=fs_persist,
    554                      fs_threshold=fs_threshold, fs_page_size=fs_page_size)
--> 555     fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
    557 if isinstance(libver, tuple):

File ~/checkouts/readthedocs.org/user_builds/qp/envs/main/lib/python3.10/site-packages/h5py/_hl/files.py:232, in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)
    231         flags |= h5f.ACC_SWMR_READ
--> 232     fid = h5f.open(name, flags, fapl=fapl)
    233 elif mode == 'r+':

File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()

File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()

File h5py/h5f.pyx:106, in h5py.h5f.open()

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = '/home/docs/checkouts/readthedocs.org/user_builds/qp/envs/main/lib/python3.10/site-packages/qp/data/test.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
Cell In[3], line 4
      2 QP_DIR = os.path.abspath(os.path.dirname(qp.__file__))
      3 data_file = os.path.join(QP_DIR, 'data', 'test.hdf5')
----> 4 ens = qp.read(data_file)

File ~/checkouts/readthedocs.org/user_builds/qp/envs/main/lib/python3.10/site-packages/qp/core/factory.py:423, in Factory.read(self, filename, fmt, read_slice)
    420 else:
    421     slice_dict = None
--> 423 tables = tables_io.read(
    424     filename,
    425     NUMPY_DICT,
    426     fmt=fmt,
    427     keys=keys,
    428     allow_missing_keys=allow_missing_keys,
    429     slice_dict=slice_dict,
    430 )  # pylint: disable=no-member
    432 # set up file_fmt to have the file extension information
    433 if fmt is not None:

File ~/checkouts/readthedocs.org/user_builds/qp/envs/main/lib/python3.10/site-packages/tables_io/io_utils/read.py:145, in read(filepath, tType, fmt, keys, allow_missing_keys, slice_dict, **kwargs)
     50 def read(
     51     filepath: str,
     52     tType: Union[int, str, None] = None,
   (...)
     57     **kwargs,
     58 ):
     59     """Reads in a given file to either a `Table-like` format if there is one table within the file,
     60     or a `TableDict-like` format if there are multiple tables or files. Uses :py:func:`read_native` to read the file.
     61 
   (...)
    143 
    144     """
--> 145     odict = read_native(filepath, fmt, keys, allow_missing_keys, slice_dict, **kwargs)
    147     if len(odict) == 1:
    148         # For special keys, use the table alone without an enclosing dictionary.
    149         single_dict_key = list(odict.keys())[0]

File ~/checkouts/readthedocs.org/user_builds/qp/envs/main/lib/python3.10/site-packages/tables_io/io_utils/read.py:235, in read_native(filepath, fmt, keys, allow_missing_keys, slice_dict, **kwargs)
    233         return read_HDF5_to_dicts(filepath, keys=keys, slice_dict=slice_dict)
    234     except Exception as e:
--> 235         raise RuntimeError(
    236             read_native_error_message(
    237                 filepath, fType, fmt, keys, allow_missing_keys, **kwargs
    238             )
    239             + f" \n because of error: \n {e}"
    240         ) from e
    241 if fType == NUMPY_FITS:
    242     try:

RuntimeError: numpyHdf5 file could not be read in with the following arguments: 
 filepath: '/home/docs/checkouts/readthedocs.org/user_builds/qp/envs/main/lib/python3.10/site-packages/qp/data/test.hdf5', fmt: 'None', keys: None, allow_missing_keys: False, and **kwargs: {} 
 because of error: 
 [Errno 2] Unable to synchronously open file (unable to open file: name = '/home/docs/checkouts/readthedocs.org/user_builds/qp/envs/main/lib/python3.10/site-packages/qp/data/test.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

Exploration

This will show use that the

# Confirm that we have read the ensembles
print("Ensemble = ", ens)
# Print some simple information about the ensemble
print("Rep      = ", ens.gen_class.name)
print("NPDF     = ", ens.npdf)
print("Metadata = ", ens.metadata())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 2
      1 # Confirm that we have read the ensembles
----> 2 print("Ensemble = ", ens)
      3 # Print some simple information about the ensemble
      4 print("Rep      = ", ens.gen_class.name)

NameError: name 'ens' is not defined

Plotting Example

Now we are going to plot some PDFs from the ensemble Note that the first call to the plot specifies the x-axis limits, but does not specify the key (i.e., which PDF in the ensemble), so that defaults to 0 (i.e., the first PDF).

axes = ens.plot(xlim=(0., 0.5), label="PDF 0")
_ = ens.plot(key=1, axes=axes, label="PDF 1")
_ = ens.plot(key=1300, axes=axes, label="PDF 1300")
legend = axes.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 axes = ens.plot(xlim=(0., 0.5), label="PDF 0")
      2 _ = ens.plot(key=1, axes=axes, label="PDF 1")
      3 _ = ens.plot(key=1300, axes=axes, label="PDF 1300")

NameError: name 'ens' is not defined

Timing benchmarks

Now we are going to extract some information from the ensemble and time how long it takes to do so

# These are the grid points and quantiles at which we will extract values
test_xvals = ens.gen_obj.xvals
test_quantiles = np.linspace(0, 1, 51)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 2
      1 # These are the grid points and quantiles at which we will extract values
----> 2 test_xvals = ens.gen_obj.xvals
      3 test_quantiles = np.linspace(0, 1, 51)

NameError: name 'ens' is not defined
%%time
# Time the pdf evaluation for 20000 PDFs
pdfs = ens.pdf(test_xvals)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
# Time the cdf (Cumulative distribution function) evaluation for 20000 PDFs
cdfs = ens.cdf(test_xvals)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
ppfs = ens.ppf(test_quantiles)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:1

NameError: name 'ens' is not defined
%%time
# Time the sf (survival fraction, 1-cdf) evaluation for 20000 PDFs
sfs = ens.sf(test_xvals)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
# Time the isf (inverse survival fraction) evaluation for 20000 PDFs
isfs = ens.isf(test_quantiles)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
# Time the generation of 100 samples for each of the 20000 PDFs
samples = ens.rvs(size=100)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined

Changing the grid for the representation

%%time
# Convert to a grid using 51 grid points
ens_g51 = qp.convert(ens, 'interp', xvals=np.linspace(0, 3, 51))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
# Convert to a grid using 21 grid points
ens_g21 = qp.convert(ens, 'interp', xvals=np.linspace(0, 1, 21))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
key = 0
axes_g = ens.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_g51.plot(key, axes=axes_g, label="g51")
_ = ens_g21.plot(key, axes=axes_g, label="g21")
leg_g = axes_g.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 2
      1 key = 0
----> 2 axes_g = ens.plot(key, xlim=(0, 0.5), label="orig")
      3 _ = ens_g51.plot(key, axes=axes_g, label="g51")
      4 _ = ens_g21.plot(key, axes=axes_g, label="g21")

NameError: name 'ens' is not defined

Conversion to other represenations

quantile representaion

%%time
# Convert using 51 quantiles
ens_q51 = qp.convert(ens, 'quant', quants=np.linspace(0.01, 0.99, 51))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
# Convert using 21 quantiles
ens_q21 = qp.convert(ens, 'quant', quants=np.linspace(0.01, 0.99, 21))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
key = 0
axes_q = ens.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_q51.plot(key, axes=axes_q, label="q51")
_ = ens_q21.plot(key, axes=axes_q, label="q21")
leg_q = axes_q.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[18], line 2
      1 key = 0
----> 2 axes_q = ens.plot(key, xlim=(0, 0.5), label="orig")
      3 _ = ens_q51.plot(key, axes=axes_q, label="q51")
      4 _ = ens_q21.plot(key, axes=axes_q, label="q21")

NameError: name 'ens' is not defined

Histogram representation

%%time
# Convert to a histogram using 51 bins
ens_h51 = qp.convert(ens, 'hist', bins=np.linspace(0, 3.0, 51))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
# Convert to a histogram using 21 bins
ens_h21 = qp.convert(ens, 'hist', bins=np.linspace(0, 3.0, 21))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
key = 0
axes_h = ens.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_h51.plot(key, axes=axes_h, label="h51")
_ = ens_h21.plot(key, axes=axes_h, label="h21")
leg_h = axes_h.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[21], line 2
      1 key = 0
----> 2 axes_h = ens.plot(key, xlim=(0, 0.5), label="orig")
      3 _ = ens_h51.plot(key, axes=axes_h, label="h51")
      4 _ = ens_h21.plot(key, axes=axes_h, label="h21")

NameError: name 'ens' is not defined

Other representations

qp also includes spline-based and Gaussian mixture represenations. The conversion to these forms much slower, so we will first reduce the base ensemble from 20000 PDFs to 100 PDFs

ens_red = ens[np.arange(100)]
print("Reduced ensemble has %i PDFs" % (ens_red.npdf))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[22], line 1
----> 1 ens_red = ens[np.arange(100)]
      2 print("Reduced ensemble has %i PDFs" % (ens_red.npdf))

NameError: name 'ens' is not defined

Spline representation

We can convert to the spline representation a few different ways. This particular method specifies that we should evaluate each PDF at a grid of points and then use those to construct the spline represenation. We do this for 2 different grids. Note how much slower this conversion function is that the ones above.

%%time
# Convert to a histogram using 51 grid points
ens_s51 = qp.convert(ens_red, 'spline', xvals=np.linspace(0, 3.0, 51), method="xy")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens_red' is not defined
ens_s21 = qp.convert(ens_red, 'spline', xvals=np.linspace(0, 3.0, 21), method="xy")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[24], line 1
----> 1 ens_s21 = qp.convert(ens_red, 'spline', xvals=np.linspace(0, 3.0, 21), method="xy")

NameError: name 'ens_red' is not defined
key = 0
axes_s = ens_red.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_s51.plot(key, axes=axes_s, label="s51")
_ = ens_s21.plot(key, axes=axes_s, label="s21")
leg_s = axes_s.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[25], line 2
      1 key = 0
----> 2 axes_s = ens_red.plot(key, xlim=(0, 0.5), label="orig")
      3 _ = ens_s51.plot(key, axes=axes_s, label="s51")
      4 _ = ens_s21.plot(key, axes=axes_s, label="s21")

NameError: name 'ens_red' is not defined

Gaussian mixture model

%%time
# Convert to a gaussian mixture using 301 sample points and 3 components
ens_m3 = qp.convert(ens_red, 'mixmod', xvals=np.linspace(0, 3.0, 301), ncomps=3)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens_red' is not defined
%%time
# Convert to a gaussian mixture using 301 sample points and 3 components
ens_m5 = qp.convert(ens_red, 'mixmod', xvals=np.linspace(0, 3.0, 301), ncomps=5)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens_red' is not defined
key = 0
axes_m = ens_red.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_m3.plot(key, axes=axes_m, label="m3")
_ = ens_m5.plot(key, axes=axes_m, label="m5")
leg_m = axes_m.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[28], line 2
      1 key = 0
----> 2 axes_m = ens_red.plot(key, xlim=(0, 0.5), label="orig")
      3 _ = ens_m3.plot(key, axes=axes_m, label="m3")
      4 _ = ens_m5.plot(key, axes=axes_m, label="m5")

NameError: name 'ens_red' is not defined
key = 1
axes_m1 = ens_red.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_m3.plot(key, axes=axes_m1, label="m3")
_ = ens_m5.plot(key, axes=axes_m1, label="m5")
leg_m1 = axes_m1.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[29], line 2
      1 key = 1
----> 2 axes_m1 = ens_red.plot(key, xlim=(0, 0.5), label="orig")
      3 _ = ens_m3.plot(key, axes=axes_m1, label="m3")
      4 _ = ens_m5.plot(key, axes=axes_m1, label="m5")

NameError: name 'ens_red' is not defined