qp practical example

Alex Malz, Phil Marshall, Eric Charles

In this notebook we use the qp module to study some photo-Z PDFs.

Setup, reading ensemble

Imports

First let’s import the packages we will need for this notebook

import numpy as np
import os

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline

import qp

Data files

Now lets download the data files we will need, if we haven’t already

# base_url = 'https://slac.stanford.edu/~echarles/qp_example'
# if not os.path.exists('qp_test_ensemble.hf5'):
#     os.system('curl -o %s -OL %s/%s' % ('qp_test_ensemble.hf5', base_url, 'qp_test_ensemble.hf5'))

Reading ensemble

Now we read the ensemble, note that we only need the name of the data file, the name of the metadata file is assumed.

#ens = qp.read('qp_test_ensemble.hf5')
QP_DIR = os.path.abspath(os.path.dirname(qp.__file__))
data_file = os.path.join(QP_DIR, 'data', 'test.hdf5')
ens = qp.read(data_file)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
File ~/checkouts/readthedocs.org/user_builds/qp/envs/stable/lib/python3.10/site-packages/tables_io/io_utils/read.py:239, in read_native(filepath, fmt, keys, allow_missing_keys, slice_dict, **kwargs)
    238 try:
--> 239     return read_HDF5_to_dicts(filepath, keys=keys, slice_dict=slice_dict)
    240 except Exception as e:

File ~/checkouts/readthedocs.org/user_builds/qp/envs/stable/lib/python3.10/site-packages/tables_io/io_utils/read.py:741, in read_HDF5_to_dicts(filepath, keys, slice_dict)
    721 """
    722 Reads `numpy.array` objects into an `OrderedDict` from an hdf5 file. If a list of keys is given,
    723 will only read those specific datasets.
   (...)
    739     The data
    740 """
--> 741 fin = h5py.File(filepath)
    742 l_out = []

File ~/checkouts/readthedocs.org/user_builds/qp/envs/stable/lib/python3.10/site-packages/h5py/_hl/files.py:555, in File.__init__(self, name, mode, driver, libver, userblock_size, swmr, rdcc_nslots, rdcc_nbytes, rdcc_w0, track_order, fs_strategy, fs_persist, fs_threshold, fs_page_size, page_buf_size, min_meta_keep, min_raw_keep, locking, alignment_threshold, alignment_interval, meta_block_size, track_times, **kwds)
    552     fcpl = make_fcpl(track_order=track_order, track_times=track_times,
    553                      fs_strategy=fs_strategy, fs_persist=fs_persist,
    554                      fs_threshold=fs_threshold, fs_page_size=fs_page_size)
--> 555     fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
    557 if isinstance(libver, tuple):

File ~/checkouts/readthedocs.org/user_builds/qp/envs/stable/lib/python3.10/site-packages/h5py/_hl/files.py:232, in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)
    231         flags |= h5f.ACC_SWMR_READ
--> 232     fid = h5f.open(name, flags, fapl=fapl)
    233 elif mode == 'r+':

File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()

File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()

File h5py/h5f.pyx:106, in h5py.h5f.open()

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = '/home/docs/checkouts/readthedocs.org/user_builds/qp/envs/stable/lib/python3.10/site-packages/qp/data/test.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

The above exception was the direct cause of the following exception:

RuntimeError                              Traceback (most recent call last)
Cell In[3], line 4
      2 QP_DIR = os.path.abspath(os.path.dirname(qp.__file__))
      3 data_file = os.path.join(QP_DIR, 'data', 'test.hdf5')
----> 4 ens = qp.read(data_file)

File ~/checkouts/readthedocs.org/user_builds/qp/envs/stable/lib/python3.10/site-packages/qp/core/factory.py:424, in Factory.read(self, filename, fmt, read_slice)
    421 else:
    422     slice_dict = None
--> 424 tables = tables_io.read(
    425     filename,
    426     NUMPY_DICT,
    427     fmt=fmt,
    428     keys=keys,
    429     allow_missing_keys=allow_missing_keys,
    430     slice_dict=slice_dict,
    431 )  # pylint: disable=no-member
    433 # set up file_fmt to have the file extension information
    434 if fmt is not None:

File ~/checkouts/readthedocs.org/user_builds/qp/envs/stable/lib/python3.10/site-packages/tables_io/io_utils/read.py:151, in read(filepath, tType, fmt, keys, allow_missing_keys, slice_dict, **kwargs)
     56 def read(
     57     filepath: str,
     58     tType: Union[int, str, None] = None,
   (...)
     63     **kwargs,
     64 ):
     65     """Reads in a given file to either a `Table-like` format if there is one table within the file,
     66     or a `TableDict-like` format if there are multiple tables or files. Uses :py:func:`read_native` to read the file.
     67 
   (...)
    149 
    150     """
--> 151     odict = read_native(filepath, fmt, keys, allow_missing_keys, slice_dict, **kwargs)
    153     if len(odict) == 1:
    154         # For special keys, use the table alone without an enclosing dictionary.
    155         single_dict_key = list(odict.keys())[0]

File ~/checkouts/readthedocs.org/user_builds/qp/envs/stable/lib/python3.10/site-packages/tables_io/io_utils/read.py:241, in read_native(filepath, fmt, keys, allow_missing_keys, slice_dict, **kwargs)
    239         return read_HDF5_to_dicts(filepath, keys=keys, slice_dict=slice_dict)
    240     except Exception as e:
--> 241         raise RuntimeError(
    242             read_native_error_message(
    243                 filepath, fType, fmt, keys, allow_missing_keys, **kwargs
    244             )
    245             + f" \n because of error: \n {e}"
    246         ) from e
    247 if fType == NUMPY_FITS:
    248     try:

RuntimeError: numpyHdf5 file could not be read in with the following arguments: 
 filepath: '/home/docs/checkouts/readthedocs.org/user_builds/qp/envs/stable/lib/python3.10/site-packages/qp/data/test.hdf5', fmt: 'None', keys: None, allow_missing_keys: False, and **kwargs: {} 
 because of error: 
 [Errno 2] Unable to synchronously open file (unable to open file: name = '/home/docs/checkouts/readthedocs.org/user_builds/qp/envs/stable/lib/python3.10/site-packages/qp/data/test.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

Exploration

This will show use that the

# Confirm that we have read the ensembles
print("Ensemble = ", ens)
# Print some simple information about the ensemble
print("Rep      = ", ens.gen_class.name)
print("NPDF     = ", ens.npdf)
print("Metadata = ", ens.metadata())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[4], line 2
      1 # Confirm that we have read the ensembles
----> 2 print("Ensemble = ", ens)
      3 # Print some simple information about the ensemble
      4 print("Rep      = ", ens.gen_class.name)

NameError: name 'ens' is not defined

Plotting Example

Now we are going to plot some PDFs from the ensemble Note that the first call to the plot specifies the x-axis limits, but does not specify the key (i.e., which PDF in the ensemble), so that defaults to 0 (i.e., the first PDF).

axes = ens.plot(xlim=(0., 0.5), label="PDF 0")
_ = ens.plot(key=1, axes=axes, label="PDF 1")
_ = ens.plot(key=1300, axes=axes, label="PDF 1300")
legend = axes.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 1
----> 1 axes = ens.plot(xlim=(0., 0.5), label="PDF 0")
      2 _ = ens.plot(key=1, axes=axes, label="PDF 1")
      3 _ = ens.plot(key=1300, axes=axes, label="PDF 1300")

NameError: name 'ens' is not defined

Timing benchmarks

Now we are going to extract some information from the ensemble and time how long it takes to do so

# These are the grid points and quantiles at which we will extract values
test_xvals = ens.gen_obj.xvals
test_quantiles = np.linspace(0, 1, 51)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[6], line 2
      1 # These are the grid points and quantiles at which we will extract values
----> 2 test_xvals = ens.gen_obj.xvals
      3 test_quantiles = np.linspace(0, 1, 51)

NameError: name 'ens' is not defined
%%time
# Time the pdf evaluation for 20000 PDFs
pdfs = ens.pdf(test_xvals)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
# Time the cdf (Cumulative distribution function) evaluation for 20000 PDFs
cdfs = ens.cdf(test_xvals)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
ppfs = ens.ppf(test_quantiles)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:1

NameError: name 'ens' is not defined
%%time
# Time the sf (survival fraction, 1-cdf) evaluation for 20000 PDFs
sfs = ens.sf(test_xvals)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
# Time the isf (inverse survival fraction) evaluation for 20000 PDFs
isfs = ens.isf(test_quantiles)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
# Time the generation of 100 samples for each of the 20000 PDFs
samples = ens.rvs(size=100)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined

Changing the grid for the representation

%%time
# Convert to a grid using 51 grid points
ens_g51 = qp.convert(ens, 'interp', xvals=np.linspace(0, 3, 51))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
# Convert to a grid using 21 grid points
ens_g21 = qp.convert(ens, 'interp', xvals=np.linspace(0, 1, 21))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
key = 0
axes_g = ens.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_g51.plot(key, axes=axes_g, label="g51")
_ = ens_g21.plot(key, axes=axes_g, label="g21")
leg_g = axes_g.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[15], line 2
      1 key = 0
----> 2 axes_g = ens.plot(key, xlim=(0, 0.5), label="orig")
      3 _ = ens_g51.plot(key, axes=axes_g, label="g51")
      4 _ = ens_g21.plot(key, axes=axes_g, label="g21")

NameError: name 'ens' is not defined

Conversion to other represenations

quantile representaion

%%time
# Convert using 51 quantiles
ens_q51 = qp.convert(ens, 'quant', quants=np.linspace(0.01, 0.99, 51))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
# Convert using 21 quantiles
ens_q21 = qp.convert(ens, 'quant', quants=np.linspace(0.01, 0.99, 21))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
key = 0
axes_q = ens.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_q51.plot(key, axes=axes_q, label="q51")
_ = ens_q21.plot(key, axes=axes_q, label="q21")
leg_q = axes_q.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[18], line 2
      1 key = 0
----> 2 axes_q = ens.plot(key, xlim=(0, 0.5), label="orig")
      3 _ = ens_q51.plot(key, axes=axes_q, label="q51")
      4 _ = ens_q21.plot(key, axes=axes_q, label="q21")

NameError: name 'ens' is not defined

Histogram representation

%%time
# Convert to a histogram using 51 bins
ens_h51 = qp.convert(ens, 'hist', bins=np.linspace(0, 3.0, 51))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
%%time
# Convert to a histogram using 21 bins
ens_h21 = qp.convert(ens, 'hist', bins=np.linspace(0, 3.0, 21))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens' is not defined
key = 0
axes_h = ens.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_h51.plot(key, axes=axes_h, label="h51")
_ = ens_h21.plot(key, axes=axes_h, label="h21")
leg_h = axes_h.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[21], line 2
      1 key = 0
----> 2 axes_h = ens.plot(key, xlim=(0, 0.5), label="orig")
      3 _ = ens_h51.plot(key, axes=axes_h, label="h51")
      4 _ = ens_h21.plot(key, axes=axes_h, label="h21")

NameError: name 'ens' is not defined

Other representations

qp also includes spline-based and Gaussian mixture represenations. The conversion to these forms much slower, so we will first reduce the base ensemble from 20000 PDFs to 100 PDFs

ens_red = ens[np.arange(100)]
print("Reduced ensemble has %i PDFs" % (ens_red.npdf))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[22], line 1
----> 1 ens_red = ens[np.arange(100)]
      2 print("Reduced ensemble has %i PDFs" % (ens_red.npdf))

NameError: name 'ens' is not defined

Spline representation

We can convert to the spline representation a few different ways. This particular method specifies that we should evaluate each PDF at a grid of points and then use those to construct the spline represenation. We do this for 2 different grids. Note how much slower this conversion function is that the ones above.

%%time
# Convert to a histogram using 51 grid points
ens_s51 = qp.convert(ens_red, 'spline', xvals=np.linspace(0, 3.0, 51), method="xy")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens_red' is not defined
ens_s21 = qp.convert(ens_red, 'spline', xvals=np.linspace(0, 3.0, 21), method="xy")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[24], line 1
----> 1 ens_s21 = qp.convert(ens_red, 'spline', xvals=np.linspace(0, 3.0, 21), method="xy")

NameError: name 'ens_red' is not defined
key = 0
axes_s = ens_red.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_s51.plot(key, axes=axes_s, label="s51")
_ = ens_s21.plot(key, axes=axes_s, label="s21")
leg_s = axes_s.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[25], line 2
      1 key = 0
----> 2 axes_s = ens_red.plot(key, xlim=(0, 0.5), label="orig")
      3 _ = ens_s51.plot(key, axes=axes_s, label="s51")
      4 _ = ens_s21.plot(key, axes=axes_s, label="s21")

NameError: name 'ens_red' is not defined

Gaussian mixture model

%%time
# Convert to a gaussian mixture using 301 sample points and 3 components
ens_m3 = qp.convert(ens_red, 'mixmod', xvals=np.linspace(0, 3.0, 301), ncomps=3)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens_red' is not defined
%%time
# Convert to a gaussian mixture using 301 sample points and 3 components
ens_m5 = qp.convert(ens_red, 'mixmod', xvals=np.linspace(0, 3.0, 301), ncomps=5)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <timed exec>:2

NameError: name 'ens_red' is not defined
key = 0
axes_m = ens_red.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_m3.plot(key, axes=axes_m, label="m3")
_ = ens_m5.plot(key, axes=axes_m, label="m5")
leg_m = axes_m.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[28], line 2
      1 key = 0
----> 2 axes_m = ens_red.plot(key, xlim=(0, 0.5), label="orig")
      3 _ = ens_m3.plot(key, axes=axes_m, label="m3")
      4 _ = ens_m5.plot(key, axes=axes_m, label="m5")

NameError: name 'ens_red' is not defined
key = 1
axes_m1 = ens_red.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_m3.plot(key, axes=axes_m1, label="m3")
_ = ens_m5.plot(key, axes=axes_m1, label="m5")
leg_m1 = axes_m1.figure.legend()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[29], line 2
      1 key = 1
----> 2 axes_m1 = ens_red.plot(key, xlim=(0, 0.5), label="orig")
      3 _ = ens_m3.plot(key, axes=axes_m1, label="m3")
      4 _ = ens_m5.plot(key, axes=axes_m1, label="m5")

NameError: name 'ens_red' is not defined