qp practical example
Alex Malz, Phil Marshall, Eric Charles
In this notebook we use the qp module to study some photo-Z PDFs.
Setup, reading ensemble
Imports
First let’s import the packages we will need for this notebook
import numpy as np
import os
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline
import qp
Data files
Now lets download the data files we will need, if we haven’t already
# base_url = 'https://slac.stanford.edu/~echarles/qp_example'
# if not os.path.exists('qp_test_ensemble.hf5'):
# os.system('curl -o %s -OL %s/%s' % ('qp_test_ensemble.hf5', base_url, 'qp_test_ensemble.hf5'))
Reading ensemble
Now we read the ensemble, note that we only need the name of the data file, the name of the metadata file is assumed.
#ens = qp.read('qp_test_ensemble.hf5')
QP_DIR = os.path.abspath(os.path.dirname(qp.__file__))
data_file = os.path.join(QP_DIR, 'data', 'test.hdf5')
ens = qp.read(data_file)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
File ~/checkouts/readthedocs.org/user_builds/qp/envs/latest/lib/python3.10/site-packages/tables_io/io_utils/read.py:233, in read_native(filepath, fmt, keys, allow_missing_keys, slice_dict, **kwargs)
232 try:
--> 233 return read_HDF5_to_dicts(filepath, keys=keys, slice_dict=slice_dict)
234 except Exception as e:
File ~/checkouts/readthedocs.org/user_builds/qp/envs/latest/lib/python3.10/site-packages/tables_io/io_utils/read.py:738, in read_HDF5_to_dicts(filepath, keys, slice_dict)
718 """
719 Reads `numpy.array` objects into an `OrderedDict` from an hdf5 file. If a list of keys is given,
720 will only read those specific datasets.
(...)
736 The data
737 """
--> 738 fin = h5py.File(filepath)
739 l_out = []
File ~/checkouts/readthedocs.org/user_builds/qp/envs/latest/lib/python3.10/site-packages/h5py/_hl/files.py:555, in File.__init__(self, name, mode, driver, libver, userblock_size, swmr, rdcc_nslots, rdcc_nbytes, rdcc_w0, track_order, fs_strategy, fs_persist, fs_threshold, fs_page_size, page_buf_size, min_meta_keep, min_raw_keep, locking, alignment_threshold, alignment_interval, meta_block_size, track_times, **kwds)
552 fcpl = make_fcpl(track_order=track_order, track_times=track_times,
553 fs_strategy=fs_strategy, fs_persist=fs_persist,
554 fs_threshold=fs_threshold, fs_page_size=fs_page_size)
--> 555 fid = make_fid(name, mode, userblock_size, fapl, fcpl, swmr=swmr)
557 if isinstance(libver, tuple):
File ~/checkouts/readthedocs.org/user_builds/qp/envs/latest/lib/python3.10/site-packages/h5py/_hl/files.py:232, in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)
231 flags |= h5f.ACC_SWMR_READ
--> 232 fid = h5f.open(name, flags, fapl=fapl)
233 elif mode == 'r+':
File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()
File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()
File h5py/h5f.pyx:106, in h5py.h5f.open()
FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = '/home/docs/checkouts/readthedocs.org/user_builds/qp/envs/latest/lib/python3.10/site-packages/qp/data/test.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
The above exception was the direct cause of the following exception:
RuntimeError Traceback (most recent call last)
Cell In[3], line 4
2 QP_DIR = os.path.abspath(os.path.dirname(qp.__file__))
3 data_file = os.path.join(QP_DIR, 'data', 'test.hdf5')
----> 4 ens = qp.read(data_file)
File ~/checkouts/readthedocs.org/user_builds/qp/envs/latest/lib/python3.10/site-packages/qp/core/factory.py:423, in Factory.read(self, filename, fmt, read_slice)
420 else:
421 slice_dict = None
--> 423 tables = tables_io.read(
424 filename,
425 NUMPY_DICT,
426 fmt=fmt,
427 keys=keys,
428 allow_missing_keys=allow_missing_keys,
429 slice_dict=slice_dict,
430 ) # pylint: disable=no-member
432 # set up file_fmt to have the file extension information
433 if fmt is not None:
File ~/checkouts/readthedocs.org/user_builds/qp/envs/latest/lib/python3.10/site-packages/tables_io/io_utils/read.py:145, in read(filepath, tType, fmt, keys, allow_missing_keys, slice_dict, **kwargs)
50 def read(
51 filepath: str,
52 tType: Union[int, str, None] = None,
(...)
57 **kwargs,
58 ):
59 """Reads in a given file to either a `Table-like` format if there is one table within the file,
60 or a `TableDict-like` format if there are multiple tables or files. Uses :py:func:`read_native` to read the file.
61
(...)
143
144 """
--> 145 odict = read_native(filepath, fmt, keys, allow_missing_keys, slice_dict, **kwargs)
147 if len(odict) == 1:
148 # For special keys, use the table alone without an enclosing dictionary.
149 single_dict_key = list(odict.keys())[0]
File ~/checkouts/readthedocs.org/user_builds/qp/envs/latest/lib/python3.10/site-packages/tables_io/io_utils/read.py:235, in read_native(filepath, fmt, keys, allow_missing_keys, slice_dict, **kwargs)
233 return read_HDF5_to_dicts(filepath, keys=keys, slice_dict=slice_dict)
234 except Exception as e:
--> 235 raise RuntimeError(
236 read_native_error_message(
237 filepath, fType, fmt, keys, allow_missing_keys, **kwargs
238 )
239 + f" \n because of error: \n {e}"
240 ) from e
241 if fType == NUMPY_FITS:
242 try:
RuntimeError: numpyHdf5 file could not be read in with the following arguments:
filepath: '/home/docs/checkouts/readthedocs.org/user_builds/qp/envs/latest/lib/python3.10/site-packages/qp/data/test.hdf5', fmt: 'None', keys: None, allow_missing_keys: False, and **kwargs: {}
because of error:
[Errno 2] Unable to synchronously open file (unable to open file: name = '/home/docs/checkouts/readthedocs.org/user_builds/qp/envs/latest/lib/python3.10/site-packages/qp/data/test.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
Exploration
This will show use that the
# Confirm that we have read the ensembles
print("Ensemble = ", ens)
# Print some simple information about the ensemble
print("Rep = ", ens.gen_class.name)
print("NPDF = ", ens.npdf)
print("Metadata = ", ens.metadata())
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[4], line 2
1 # Confirm that we have read the ensembles
----> 2 print("Ensemble = ", ens)
3 # Print some simple information about the ensemble
4 print("Rep = ", ens.gen_class.name)
NameError: name 'ens' is not defined
Plotting Example
Now we are going to plot some PDFs from the ensemble
Note that the first call to the plot specifies the x-axis limits, but does not specify the key (i.e., which PDF in the ensemble), so that defaults to 0 (i.e., the first PDF).
axes = ens.plot(xlim=(0., 0.5), label="PDF 0")
_ = ens.plot(key=1, axes=axes, label="PDF 1")
_ = ens.plot(key=1300, axes=axes, label="PDF 1300")
legend = axes.figure.legend()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[5], line 1
----> 1 axes = ens.plot(xlim=(0., 0.5), label="PDF 0")
2 _ = ens.plot(key=1, axes=axes, label="PDF 1")
3 _ = ens.plot(key=1300, axes=axes, label="PDF 1300")
NameError: name 'ens' is not defined
Timing benchmarks
Now we are going to extract some information from the ensemble and time how long it takes to do so
# These are the grid points and quantiles at which we will extract values
test_xvals = ens.gen_obj.xvals
test_quantiles = np.linspace(0, 1, 51)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[6], line 2
1 # These are the grid points and quantiles at which we will extract values
----> 2 test_xvals = ens.gen_obj.xvals
3 test_quantiles = np.linspace(0, 1, 51)
NameError: name 'ens' is not defined
%%time
# Time the pdf evaluation for 20000 PDFs
pdfs = ens.pdf(test_xvals)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens' is not defined
%%time
# Time the cdf (Cumulative distribution function) evaluation for 20000 PDFs
cdfs = ens.cdf(test_xvals)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens' is not defined
%%time
ppfs = ens.ppf(test_quantiles)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:1
NameError: name 'ens' is not defined
%%time
# Time the sf (survival fraction, 1-cdf) evaluation for 20000 PDFs
sfs = ens.sf(test_xvals)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens' is not defined
%%time
# Time the isf (inverse survival fraction) evaluation for 20000 PDFs
isfs = ens.isf(test_quantiles)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens' is not defined
%%time
# Time the generation of 100 samples for each of the 20000 PDFs
samples = ens.rvs(size=100)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens' is not defined
Changing the grid for the representation
%%time
# Convert to a grid using 51 grid points
ens_g51 = qp.convert(ens, 'interp', xvals=np.linspace(0, 3, 51))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens' is not defined
%%time
# Convert to a grid using 21 grid points
ens_g21 = qp.convert(ens, 'interp', xvals=np.linspace(0, 1, 21))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens' is not defined
key = 0
axes_g = ens.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_g51.plot(key, axes=axes_g, label="g51")
_ = ens_g21.plot(key, axes=axes_g, label="g21")
leg_g = axes_g.figure.legend()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[15], line 2
1 key = 0
----> 2 axes_g = ens.plot(key, xlim=(0, 0.5), label="orig")
3 _ = ens_g51.plot(key, axes=axes_g, label="g51")
4 _ = ens_g21.plot(key, axes=axes_g, label="g21")
NameError: name 'ens' is not defined
Conversion to other represenations
quantile representaion
%%time
# Convert using 51 quantiles
ens_q51 = qp.convert(ens, 'quant', quants=np.linspace(0.01, 0.99, 51))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens' is not defined
%%time
# Convert using 21 quantiles
ens_q21 = qp.convert(ens, 'quant', quants=np.linspace(0.01, 0.99, 21))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens' is not defined
key = 0
axes_q = ens.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_q51.plot(key, axes=axes_q, label="q51")
_ = ens_q21.plot(key, axes=axes_q, label="q21")
leg_q = axes_q.figure.legend()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[18], line 2
1 key = 0
----> 2 axes_q = ens.plot(key, xlim=(0, 0.5), label="orig")
3 _ = ens_q51.plot(key, axes=axes_q, label="q51")
4 _ = ens_q21.plot(key, axes=axes_q, label="q21")
NameError: name 'ens' is not defined
Histogram representation
%%time
# Convert to a histogram using 51 bins
ens_h51 = qp.convert(ens, 'hist', bins=np.linspace(0, 3.0, 51))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens' is not defined
%%time
# Convert to a histogram using 21 bins
ens_h21 = qp.convert(ens, 'hist', bins=np.linspace(0, 3.0, 21))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens' is not defined
key = 0
axes_h = ens.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_h51.plot(key, axes=axes_h, label="h51")
_ = ens_h21.plot(key, axes=axes_h, label="h21")
leg_h = axes_h.figure.legend()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[21], line 2
1 key = 0
----> 2 axes_h = ens.plot(key, xlim=(0, 0.5), label="orig")
3 _ = ens_h51.plot(key, axes=axes_h, label="h51")
4 _ = ens_h21.plot(key, axes=axes_h, label="h21")
NameError: name 'ens' is not defined
Other representations
qp also includes spline-based and Gaussian mixture represenations. The conversion to these forms much slower, so we will first reduce the base ensemble from 20000 PDFs to 100 PDFs
ens_red = ens[np.arange(100)]
print("Reduced ensemble has %i PDFs" % (ens_red.npdf))
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[22], line 1
----> 1 ens_red = ens[np.arange(100)]
2 print("Reduced ensemble has %i PDFs" % (ens_red.npdf))
NameError: name 'ens' is not defined
Spline representation
We can convert to the spline representation a few different ways. This particular method specifies that we should evaluate each PDF at a grid of points and then use those to construct the spline represenation. We do this for 2 different grids. Note how much slower this conversion function is that the ones above.
%%time
# Convert to a histogram using 51 grid points
ens_s51 = qp.convert(ens_red, 'spline', xvals=np.linspace(0, 3.0, 51), method="xy")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens_red' is not defined
ens_s21 = qp.convert(ens_red, 'spline', xvals=np.linspace(0, 3.0, 21), method="xy")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[24], line 1
----> 1 ens_s21 = qp.convert(ens_red, 'spline', xvals=np.linspace(0, 3.0, 21), method="xy")
NameError: name 'ens_red' is not defined
key = 0
axes_s = ens_red.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_s51.plot(key, axes=axes_s, label="s51")
_ = ens_s21.plot(key, axes=axes_s, label="s21")
leg_s = axes_s.figure.legend()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[25], line 2
1 key = 0
----> 2 axes_s = ens_red.plot(key, xlim=(0, 0.5), label="orig")
3 _ = ens_s51.plot(key, axes=axes_s, label="s51")
4 _ = ens_s21.plot(key, axes=axes_s, label="s21")
NameError: name 'ens_red' is not defined
Gaussian mixture model
%%time
# Convert to a gaussian mixture using 301 sample points and 3 components
ens_m3 = qp.convert(ens_red, 'mixmod', xvals=np.linspace(0, 3.0, 301), ncomps=3)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens_red' is not defined
%%time
# Convert to a gaussian mixture using 301 sample points and 3 components
ens_m5 = qp.convert(ens_red, 'mixmod', xvals=np.linspace(0, 3.0, 301), ncomps=5)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
File <timed exec>:2
NameError: name 'ens_red' is not defined
key = 0
axes_m = ens_red.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_m3.plot(key, axes=axes_m, label="m3")
_ = ens_m5.plot(key, axes=axes_m, label="m5")
leg_m = axes_m.figure.legend()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[28], line 2
1 key = 0
----> 2 axes_m = ens_red.plot(key, xlim=(0, 0.5), label="orig")
3 _ = ens_m3.plot(key, axes=axes_m, label="m3")
4 _ = ens_m5.plot(key, axes=axes_m, label="m5")
NameError: name 'ens_red' is not defined
key = 1
axes_m1 = ens_red.plot(key, xlim=(0, 0.5), label="orig")
_ = ens_m3.plot(key, axes=axes_m1, label="m3")
_ = ens_m5.plot(key, axes=axes_m1, label="m5")
leg_m1 = axes_m1.figure.legend()
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[29], line 2
1 key = 1
----> 2 axes_m1 = ens_red.plot(key, xlim=(0, 0.5), label="orig")
3 _ = ens_m3.plot(key, axes=axes_m1, label="m3")
4 _ = ens_m5.plot(key, axes=axes_m1, label="m5")
NameError: name 'ens_red' is not defined