From 8e1eff634a33a4d2bc31844150fbb5d6b8f15fe7 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Wed, 22 Aug 2018 10:52:50 +0200 Subject: [PATCH 1/3] Rollup: New sampler API -> support for emcee (#68) * start InferenceFile -> BaseInferenceFile * rename hdf.py base_hdf.py * add parse_parameters function * add module for base mcmc io * make _read_samples_data the abstract method * added read_samples_data to base_mcmc * add emcee file handling * replace read/write functions with io in BaseSampler * add checkpoint requirement; rename samples raw_samples * start updating emcee * move emcee_pt to it's own module * add base_mcmc (needs work) * start changing the base sampler api * add write_metadata to models * move setting up checkpoint and run interval to sampler methods * rearrange read/write functions; add checkpoint and finalize methods; add run method to base_mcmc * fix whitespace * add acl support * update executable * add finalize to emcee, fix typos * change write_posterior to expect filename, not file * change burn in module to just have functions * start to define burn in support class * move burn in class to burn_in module; add evaluate * add write burn in to io * add from_config for burn-in class * more support for burn-in, calculation of independent samples * add thin_start/interval/end to the hdf file attrs * fix typos, whitespace in burn_in * fix whitespace, typos in base_hdf * rename EnsembleMCMCIO to MCMCIO; fix whitespace * fix typo * fix whitespace * write filetype to inference hdf files; provide a loadfile function * fix some import errors * remove sampler_class from io to avoid circular imports * fix bugs * fix bugs, move niterations/nsamples into config file * add halfchain, posterior_step, min_iterations back to burn_in * fix bugs to get acl working post burn in * fix bugs in nacl burn in test * write more information to the logging messages * fix bugs in min_iterations burn-in test * fix more bugs * fix pep8 issues * fix bugs for running with data * whitespace --- bin/gwin | 343 +++--------- gwin/burn_in.py | 621 +++++++++++---------- gwin/io/__init__.py | 187 ++++++- gwin/io/base_hdf.py | 659 +++++++++++++++++++++++ gwin/io/base_mcmc.py | 251 +++++++++ gwin/io/emcee.py | 75 +++ gwin/io/hdf.py | 801 ---------------------------- gwin/models/base.py | 14 + gwin/models/base_data.py | 16 + gwin/models/gaussian_noise.py | 28 + gwin/option_utils.py | 82 --- gwin/sampler/__init__.py | 41 +- gwin/sampler/base.py | 976 ++++++--------------------------- gwin/sampler/base_mcmc.py | 564 ++++++++++++++++++++ gwin/sampler/emcee.py | 978 +++++----------------------------- gwin/sampler/emcee_pt.py | 756 ++++++++++++++++++++++++++ 16 files changed, 3250 insertions(+), 3142 deletions(-) create mode 100644 gwin/io/base_hdf.py create mode 100644 gwin/io/base_mcmc.py create mode 100644 gwin/io/emcee.py delete mode 100644 gwin/io/hdf.py create mode 100644 gwin/sampler/base_mcmc.py create mode 100644 gwin/sampler/emcee_pt.py diff --git a/bin/gwin b/bin/gwin index 2d0439b..cacded7 100644 --- a/bin/gwin +++ b/bin/gwin @@ -1,6 +1,6 @@ #!/usr/bin/env python -# Copyright (C) 2016 Christopher M. Biwer +# Copyright (C) 2016 Christopher M. 
Biwer, Collin Capano # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by the @@ -32,18 +32,38 @@ from pycbc.waveform import generator import gwin from gwin import (__version__, burn_in, option_utils) -from gwin.io.hdf import InferenceFile -from gwin.option_utils import validate_checkpoint_files from gwin.calibration import Recalibrate # command line usage parser = argparse.ArgumentParser(usage=__file__ + " [--options]", description=__doc__) - -# version option parser.add_argument("--version", action="version", version=__version__, help="Prints version information.") - +parser.add_argument("--verbose", action="store_true", default=False, + help="Print logging messages.") +# output options +parser.add_argument("--output-file", type=str, required=True, + help="Output file path.") +parser.add_argument("--force", action="store_true", default=False, + help="If the output-file already exists, overwrite it. " + "Otherwise, an OSError is raised.") +parser.add_argument("--save-backup", action="store_true", + default=False, + help="Don't delete the backup file after the run has " + "completed.") +# parallelization options +parser.add_argument("--nprocesses", type=int, default=1, + help="Number of processes to use. If not given then only " + "a single core will be used.") +parser.add_argument("--use-mpi", action='store_true', default=False, + help="Use MPI to parallelize the sampler") +parser.add_argument("--samples-file", default=None, + help="Use an iteration from an InferenceFile as the " + "initial proposal distribution. The same " + "number of walkers and the same [variable_params] " + "section in the configuration file should be used. " + "The priors must allow encompass the initial " + "positions from the InferenceFile being read.") # add data options parser.add_argument("--instruments", type=str, nargs="+", help="IFOs, eg. H1 L1.") @@ -57,57 +77,8 @@ parser.add_argument("--psd-end-time", type=float, default=None, parser.add_argument("--seed", type=int, default=0, help="Seed to use for the random number generator that " "initially distributes the walkers. Default is 0.") -parser.add_argument("--samples-file", default=None, - help="Use an iteration from an InferenceFile as the " - "initial proposal distribution. The same " - "number of walkers and the same [variable_params] " - "section in the configuration file should be used. " - "The priors must allow encompass the initial " - "positions from the InferenceFile being read.") - -# add sampler options -option_utils.add_sampler_option_group(parser) - # add config options option_utils.add_config_opts_to_parser(parser) - -# output options -parser.add_argument("--output-file", type=str, required=True, - help="Output file path.") -parser.add_argument("--force", action="store_true", default=False, - help="If the output-file already exists, overwrite it. " - "Otherwise, an OSError is raised.") -parser.add_argument("--save-strain", action="store_true", default=False, - help="Save the conditioned strain time series to the " - "output file. If gate-overwhitened, this is done " - "before all gates have been applied.") -parser.add_argument("--save-stilde", action="store_true", default=False, - help="Save the conditioned strain frequency series to " - "the output file. 
This is done after all gates have " - "been applied.") -parser.add_argument("--save-psd", action="store_true", default=False, - help="Save the psd of each ifo to the output file.") -parser.add_argument("--checkpoint-interval", type=int, default=None, - help="Number of iterations to take before saving new " - "samples to file, calculating ACL, and updating " - "burn-in estimate.") -parser.add_argument("--resume-from-checkpoint", action="store_true", - default=False, - help="Automatically load results from checkpoint/backup " - "file.") -parser.add_argument("--save-backup", action="store_true", - default=False, - help="Don't delete the backup file after the run has " - "completed.") -parser.add_argument("--checkpoint-fast", action="store_true", - help="Do not calculate ACL after each checkpoint, only at " - "the end. Not applicable if n-independent-samples " - "have been specified.") - -# verbose option -parser.add_argument("--verbose", action="store_true", default=False, - help="Print logging messages.") - # add module pre-defined options fft.insert_fft_option_group(parser) opt.insert_optimization_option_group(parser) @@ -131,41 +102,6 @@ scheme.verify_processing_options(opts, parser) #strain.verify_strain_options(opts, parser) weave.verify_weave_options(opts, parser) -# check for the output file -if os.path.exists(opts.output_file) and not opts.force: - raise OSError("output-file already exists; use --force if you wish to " - "overwrite it.") - -# check for backup file(s) -checkpoint_file = opts.output_file + '.checkpoint' -backup_file = opts.output_file + '.bkup' -checkpoint_valid = validate_checkpoint_files(checkpoint_file, backup_file) - -# determine what to do with checkpoints -if checkpoint_valid and not opts.resume_from_checkpoint and not opts.force: - raise OSError("valid checkpoint file {} found, but " - "resume-from-checkpoint not on. 
If you wish to overwrite " - "use --force; otherwise, use --resume-from-checkpoint") -if not opts.resume_from_checkpoint and opts.force: - checkpoint_valid = False - -# check for how many iterations to run -max_iterations = opts.niterations -if opts.niterations is not None and opts.n_independent_samples is not None: - raise ValueError("Must specify either niterations or n-independent-" - "samples, not both") -elif opts.niterations is not None: - get_nsamples = opts.niterations -elif opts.n_independent_samples is not None: - if opts.checkpoint_interval is None: - raise ValueError("n-independent-samples requires a checkpoint-" - "interval; see help") - get_nsamples = opts.n_independent_samples -else: - raise ValueError("Must specify niterations or n-independent-samples; " - "see --help") - - # set seed numpy.random.seed(opts.seed) logging.info("Using seed %i", opts.seed) @@ -199,8 +135,9 @@ with ctx: # get ifo-specific instances of calibration model if cp.has_section('calibration'): logging.info("Initializing calibration model") - recalibration = {ifo: Recalibrate.from_config(cp, ifo, section='calibration') for - ifo in opts.instruments} + recalibration = {ifo: Recalibrate.from_config(cp, ifo, + section='calibration') + for ifo in opts.instruments} model_args['recalibration'] = recalibration # get gates for templates @@ -213,206 +150,66 @@ with ctx: # construct class that will return the natural logarithm of likelihood model = gwin.models.read_from_config(cp, **model_args) - burn_in_eval = burn_in.BurnIn(opts.burn_in_function, - min_iterations=opts.min_burn_in) - logging.info("Setting up sampler") - # create sampler that will run - sampler = option_utils.sampler_from_cli(opts, model) - - # save information about this data and settings - if not checkpoint_valid: - with InferenceFile(checkpoint_file, "w") as fp: - # save command line and data - logging.info("Creating and writing data to output file") - fp.write_data( - strain_dict=strain_dict if opts.save_strain else None, - stilde_dict=stilde_dict if opts.save_stilde else None, - psd_dict=psd_dict if opts.save_psd else None, - low_frequency_cutoff_dict=low_frequency_cutoff_dict) - - # save injection parameters - if opts.injection_file: - for ifo in opts.instruments: - logging.info("Writing %s injections to output file", ifo) - if ifo in opts.injection_file.keys(): - inj_file = opts.injection_file[ifo] - elif len(opts.injection_file) == 1: - inj_file = opts.injection_file.values()[0] - else: - logging.warn("Could not find injections for %s", ifo) - continue - fp.write_injections(opts.injection_file.values()[0], ifo) - # copy to backup - shutil.copy(checkpoint_file, backup_file) - - # write the command line, resume point - for fn in [checkpoint_file, backup_file]: - with InferenceFile(fn, "a") as fp: - fp.write_command_line() - if checkpoint_valid: - fp.write_resume_point() + # Create sampler that will run. + # Note: the pool is created at this point. This means that, + # unless you enjoy angering your cluster admins, + # NO SAMPLES FILE IO SHOULD BE DONE PRIOR TO THIS POINT!!! + sampler = gwin.sampler.load_from_config( + cp, model, nprocesses=opts.nprocesses, use_mpi=opts.use_mpi) + + # set up output/checkpoint file + # Note: PyCBC's multi-ifo parser uses key:ifo for + # the injection file, even though we will use the same + # injection file all detectors. This + # should be fixed in a future version of PyCBC. Once it is, + # update this. Until then, just use the first file. 
+ if opts.injection_file: + injection_file = opts.injection_file.values()[0] # None if not set + else: + injection_file = None + sampler.setup_output(opts.output_file, force=opts.force, + injection_file=injection_file) - # set the walkers initial positions from a pre-existing InferenceFile - # or a specific initial distribution listed in the configuration file - # or else use the prior distributions to set initial positions - logging.info("Setting walkers initial conditions for varying parameters") + # Figure out where to get the initial conditions from: a samples file, + # the checkpoint file, the prior, or an initial prior. samples_file = opts.samples_file # use the checkpoint file instead if resume from checkpoint - if opts.resume_from_checkpoint and checkpoint_valid: - samples_file = checkpoint_file + if not sampler.new_checkpoint: + samples_file = sampler.checkpoint_file if samples_file is not None: logging.info("Initial positions taken from last iteration in %s", samples_file) - samples_file = InferenceFile(samples_file, "r") init_prior = None - elif len(cp.get_subsections("initial")): - initial_dists = distributions.read_distributions_from_config( - cp, section="initial") - constraints = distributions.read_constraints_from_config(cp, - constraint_section="initial_constraint") - init_prior = distributions.JointDistribution(sampler.variable_params, - *initial_dists, **{"constraints" : constraints}) else: - init_prior = None - sampler.set_p0(samples_file=samples_file, prior=init_prior) - - # if getting samples from file then put sampler and random number generator - # back in its former state - if samples_file is not None: - sampler.set_state_from_file(samples_file) - samples_file.close() - - # run sampler's burn in if it is in the list of burn in functions - if "use_sampler" in burn_in_eval.burn_in_functions: - # remove the sampler's burn in so we don't run more than once - burn_in_eval.burn_in_functions.pop("use_sampler") - # we'll only do this if we don't have a valid checkpoint: since the - # checkpoint happens after the sampler's burn in, the sampler's burn in - # must have already run if we have a valid checkpoint file - if not checkpoint_valid: - with InferenceFile(checkpoint_file, "a") as fp: - logging.info("Running sampler's burn in function") - burnidx, is_burned_in = burn_in.use_sampler(sampler, fp) - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - # write the burn in results - logging.info("Writing burn in samples to file") - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - # write to backup file - with InferenceFile(backup_file, "a") as fp: - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - - - # get the starting number of samples: - # nsamples keeps track of the number of samples we've obtained (if - # --n-independent-samples is used, this is the number of independent - # samples; otherwise, this is the number of iterations); - # start is the number of iterations that the file already contains (either - # due to sampler burn-in, or a previous checkpoint) - try: - with InferenceFile(checkpoint_file, "r") as fp: - start = fp.niterations - except KeyError: - start = 0 - if opts.n_independent_samples is not None: - try: - with InferenceFile(checkpoint_file, "r") as fp: - nsamples = fp.n_independent_samples - except AttributeError: - nsamples = start - else: - nsamples = start - # to ensure iterations are counted properly, he 
sampler's lastclear should - # be the same as start - sampler.lastclear = start - - interval = opts.checkpoint_interval - if interval is None: - interval = get_nsamples - - # run sampler until we have the desired number of samples - while nsamples < get_nsamples: - - end = start + interval - - # adjust the interval if we would go past the number of iterations - if opts.n_independent_samples is None and end > get_nsamples: - interval = get_nsamples - start - end = start + interval - - # run sampler and set initial values to None so that sampler - # picks up from where it left off next call - logging.info("Running sampler for {} to {} iterations".format(start, - end)) - sampler.run(interval) - - # write new samples - with InferenceFile(checkpoint_file, "a") as fp: - - logging.info("Writing results to file") - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - logging.info("Updating burn in") - burnidx, is_burned_in = burn_in_eval.update(sampler, fp) - - # compute the acls and write - acls = None - if opts.n_independent_samples is not None or end >= get_nsamples \ - or not opts.checkpoint_fast: - logging.info("Computing acls") - acls = sampler.compute_acls(fp) - sampler.write_acls(fp, acls) - - # write to backup - with InferenceFile(backup_file, "a") as fp: - - logging.info("Writing to backup file") - sampler.write_results(fp, static_params=model.static_params, - ifos=opts.instruments) - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - if acls is not None: - sampler.write_acls(fp, acls) - - # check validity - checkpoint_valid = validate_checkpoint_files(checkpoint_file, - backup_file) - if not checkpoint_valid: - raise IOError("error writing to checkpoint file") - - # update nsamples for next loop - if opts.n_independent_samples is not None: - with InferenceFile(checkpoint_file, 'r') as fp: - nsamples = fp.n_independent_samples - logging.info("Have {} independent samples".format(nsamples)) - else: - nsamples += interval + # try to load an initial distribution from the config file + init_prior = gwin.sampler.initial_dist_from_config(cp) + sampler.set_initial_conditions(initial_distribution=init_prior, + samples_file=samples_file) - # clear the in-memory chain to save memory - logging.info("Clearing chain") - sampler.clear_chain() + # Run the sampler + sampler.run() - start = end + # Finalize the output + sampler.finalize() - # compute evidence, if supported - with InferenceFile(checkpoint_file, 'a') as fp: - try: - lnz, dlnz = sampler.calculate_logevidence(fp) - logging.info("Saving evidence") - sampler.write_logevidence(fp, lnz, dlnz) - except NotImplementedError: - pass + # FIXME: move to emcee_pt's finalize method + #with InferenceFile(checkpoint_file, 'a') as fp: + # try: + # lnz, dlnz = sampler.calculate_logevidence(fp) + # logging.info("Saving evidence") + # sampler.write_logevidence(fp, lnz, dlnz) + # except NotImplementedError: + # pass # rename checkpoint to output and delete backup logging.info("Moving checkpoint to output") -os.rename(checkpoint_file, opts.output_file) +os.rename(sampler.checkpoint_file, opts.output_file) if not opts.save_backup: logging.info("Deleting backup file") - os.remove(backup_file) + os.remove(sampler.backup_file) # exit logging.info("Done") diff --git a/gwin/burn_in.py b/gwin/burn_in.py index bcb4ef6..d87bf69 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -13,380 +13,361 @@ # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 
02110-1301, USA. + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# """ This modules provides classes and functions for determining when Markov Chains have burned in. """ import numpy - from scipy.stats import ks_2samp +from pycbc.filter import autocorrelation +from pycbc.io.record import get_vars_from_arg -def ks_test(sampler, fp, threshold=0.9): - """Burn in based on whether the p-value of the KS test between the samples - at the last iteration and the samples midway along the chain for each - parameter is > ``threshold``. +# The value to use for a burn-in iteration if a chain is not burned in +NOT_BURNED_IN_ITER = -1 - Parameters - ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for determing - burn in. - threshold : float - The thershold to use for the p-value. Default is 0.9. - Returns - ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. - """ - nwalkers = fp.nwalkers - niterations = fp.niterations - # Create a dictionary which would have keys are the variable args and - # values are booleans indicating whether the p-value for the parameters - # satisfies the KS test - is_burned_in_param = {} - # iterate over the parameters - for param in fp.variable_params: - # read samples for the parameter from the last iteration of the chain - samples_last_iter = sampler.read_samples(fp, param, iteration=-1, - flatten=True)[param] - # read samples for the parameter from the iteration midway - # along the chain - samples_chain_midpt = sampler.read_samples( - fp, param, iteration=int(niterations/2), flatten=True)[param] - _, p_value = ks_2samp(samples_last_iter, samples_chain_midpt) - # check if p_value is > than the desired range - is_burned_in_param[param] = p_value > threshold - - # The chains are burned in if the p-value of the KS test lies - # in the range [0.1,0.9] for all the parameters. - # If the KS test is passed, the chains have burned in at their - # mid-way point. - if all(is_burned_in_param.values()): - is_burned_in = numpy.ones(nwalkers, dtype=bool) - burn_in_idx = numpy.repeat(niterations/2, nwalkers).astype(int) - else: - is_burned_in = numpy.zeros(nwalkers, dtype=bool) - burn_in_idx = numpy.repeat(niterations, nwalkers).astype(int) - return burn_in_idx, is_burned_in +# +# ============================================================================= +# +# Convenience functions +# +# ============================================================================= +# -def n_acl(sampler, fp, nacls=10): - """Burn in based on ACL. +def ks_test(samples1, samples2, threshold=0.9): + """Applies a KS test to determine if two sets of samples are the same. - The sampler is considered burned in if the number of itertions is >= - ``nacls`` times the maximum ACL over all parameters, as measured from the - first iteration. + The ks test is applied parameter-by-parameter. If the two-tailed p-value + returned by the test is greater than ``threshold``, the samples are + considered to be the same. Parameters ---------- - sampler : pycbc.inference.sampler - Sampler to determine burn in for. May be either an instance of a - `inference.sampler`, or the class itself. 
-    fp : InferenceFile
-        Open inference hdf file containing the samples to load for determing
-        burn in.
-    nacls : int
-        Number of ACLs to use for burn in. Default is 10.
+    samples1 : dict
+        Dictionary mapping parameters to the first set of samples.
+    samples2 : dict
+        Dictionary mapping parameters to the second set of samples.
+    threshold : float
+        The threshold to use for the p-value. Default is 0.9.
 
     Returns
     -------
-    burn_in_idx : array
-        Array of indices giving the burn-in index for each chain. By definition
-        of this function, all chains reach burn in at the same iteration. Thus
-        the returned array is the burn-in index repeated by the number of
-        chains.
-    is_burned_in : array
-        Array of booleans indicating whether each chain is burned in. Since
-        all chains obtain burn in at the same time, this is either an array
-        of all False or True.
+    dict :
+        Dictionary mapping parameter names to booleans indicating whether the
+        given parameter passes the KS test.
     """
-    acl = numpy.array(sampler.compute_acls(fp, start_index=0).values()).max()
-    burn_idx = nacls * acl
-    is_burned_in = burn_idx < fp.niterations
-    if not is_burned_in:
-        burn_idx = fp.niterations
-    nwalkers = fp.nwalkers
-    return numpy.repeat(burn_idx, nwalkers).astype(int), \
-        numpy.repeat(is_burned_in, nwalkers).astype(bool)
+    is_the_same = {}
+    assert set(samples1.keys()) == set(samples2.keys()), (
+        "samples1 and 2 must have the same parameters")
+    # iterate over the parameters
+    for param in samples1:
+        s1 = samples1[param]
+        s2 = samples2[param]
+        _, p_value = ks_2samp(s1, s2)
+        is_the_same[param] = p_value > threshold
+    return is_the_same
 
 
-def max_posterior(sampler, fp):
+def max_posterior(lnps_per_walker, dim):
     """Burn in based on samples being within dim/2 of maximum posterior.
 
     Parameters
     ----------
-    sampler : gwin.sampler
-        Sampler to determine burn in for. May be either an instance of a
-        `gwin.sampler`, or the class itself.
-    fp : InferenceFile
-        Open inference hdf file containing the samples to load for determing
-        burn in.
+    lnps_per_walker : 2D array
+        Array of values that are proportional to the log posterior values. Must
+        have shape ``nwalkers x niterations``.
+    dim : int
+        The dimension of the parameter space.
 
     Returns
     -------
-    burn_in_idx : array
-        Array of indices giving the burn-in index for each chain.
-    is_burned_in : array
-        Array of booleans indicating whether each chain is burned in.
+    burn_in_idx : array of int
+        The burn in indices of each walker. If a walker is not burned in, its
+        index will be set to ``NOT_BURNED_IN_ITER``.
+    is_burned_in : array of bool
+        Whether or not a walker is burned in.
""" - # get the posteriors - # Note: multi-tempered samplers should just return the coldest chain by - # default - chain_stats = sampler.read_samples( - fp, ['loglr', 'logprior'], samples_group=fp.stats_group, - thin_interval=1, thin_start=0, thin_end=None, flatten=False) - chain_posteriors = chain_stats['loglr'] + chain_stats['logprior'] - dim = float(len(fp.variable_params)) - - # find the posterior to compare against - max_p = chain_posteriors.max() - criteria = max_p - dim/2 - nwalkers = chain_posteriors.shape[-2] - niterations = chain_posteriors.shape[-1] - burn_in_idx = numpy.repeat(niterations, nwalkers).astype(int) - is_burned_in = numpy.zeros(nwalkers, dtype=bool) - - # find the first iteration in each chain where the logplr has exceeded + if len(lnps_per_walker.shape) != 2: + raise ValueError("lnps_per_walker must have shape " + "nwalkers x niterations") + # find the value to compare against + max_p = lnps_per_walker.max() + criteria = max_p - dim/2. + nwalkers, niterations = lnps_per_walker.shape + burn_in_idx = numpy.empty(nwalkers, dtype=int) + is_burned_in = numpy.empty(nwalkers, dtype=bool) + # find the first iteration in each chain where the logpost has exceeded # max_p - dim/2 for ii in range(nwalkers): - chain = chain_posteriors[..., ii, :] - # numpy.where will return a tuple with multiple arrays if the chain is - # more than 1D (which can happen for multi-tempered samplers). Always - # taking the last array ensures we are looking at the indices that - # count out iterations - idx = numpy.where(chain >= criteria)[-1] - if idx.size != 0: - burn_in_idx[ii] = idx[0] - is_burned_in[ii] = True + chain = lnps_per_walker[ii, :] + passedidx = numpy.where(chain >= criteria)[0] + is_burned_in[ii] = passedidx.size > 0 + if is_burned_in[ii]: + burn_in_idx[ii] = passedidx[0] + else: + burn_in_idx[ii] = NOT_BURNED_IN_ITER return burn_in_idx, is_burned_in -def posterior_step(sampler, fp): - """Burn in based on the last time a chain made a jump > dim/2. +def posterior_step(logposts, dim): + """Finds the last time a chain made a jump > dim/2. Parameters ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for determing - burn in. + logposts : array + 1D array of values that are proportional to the log posterior values. + dim : int + The dimension of the parameter space. Returns ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. - By definition of this function, all values are set to True. + int + The index of the last time the logpost made a jump > dim/2. If that + never happened, returns 0. """ - # get the posteriors - # Note: multi-tempered samplers should just return the coldest chain by - # default - chain_stats = sampler.read_samples( - fp, ['loglr', 'logprior'], samples_group=fp.stats_group, - thin_interval=1, thin_start=0, thin_end=None, flatten=False) - chain_posteriors = chain_stats['loglr'] + chain_stats['logprior'] - nwalkers = chain_posteriors.shape[-2] - dim = float(len(fp.variable_params)) - burn_in_idx = numpy.zeros(nwalkers).astype(int) + if logposts.ndim > 1: + raise ValueError("logposts must be a 1D array") criteria = dim/2. 
+ dp = numpy.diff(logposts) + indices = numpy.where(dp >= criteria)[0] + if indices.size > 0: + idx = indices[-1] + 1 + else: + idx = 0 + return idx - # find the last iteration in each chain where the logplr has - # jumped by more than dim/2 - for ii in range(nwalkers): - chain = chain_posteriors[..., ii, :] - dp = abs(numpy.diff(chain)) - idx = numpy.where(dp >= criteria)[-1] - if idx.size != 0: - burn_in_idx[ii] = idx[-1] + 1 - return burn_in_idx, numpy.ones(nwalkers, dtype=bool) - - -def half_chain(sampler, fp): - """Takes the second half of the iterations as post-burn in. - - Parameters - ---------- - sampler : gwin.sampler - This option is not used; it is just here give consistent API as the - other burn in functions. - fp : InferenceFile - Open inference hdf file containing the samples to load for determing - burn in. - - Returns - ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. - By definition of this function, all values are set to True. - """ - nwalkers = fp.nwalkers - niterations = fp.niterations - return ( - numpy.repeat(niterations/2, nwalkers).astype(int), - numpy.ones(nwalkers, dtype=bool), - ) - - -def use_sampler(sampler, fp=None): - """Uses the sampler's burn_in function. - - Parameters - ---------- - sampler : gwin.sampler - Sampler to determine burn in for. Must be an instance of an - `gwin.sampler` that has a `burn_in` function. - fp : InferenceFile, optional - This option is not used; it is just here give consistent API as the - other burn in functions. - - Returns - ------- - burn_in_idx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. - Since the sampler's burn in function will run until all chains - are burned, all values are set to True. - """ - sampler.burn_in() - return ( - sampler.burn_in_iterations, - numpy.ones(len(sampler.burn_in_iterations), dtype=bool), - ) - - -burn_in_functions = { - 'ks_test': ks_test, - 'n_acl': n_acl, - 'max_posterior': max_posterior, - 'posterior_step': posterior_step, - 'half_chain': half_chain, - 'use_sampler': use_sampler, - } - - -class BurnIn(object): - """Class to estimate the number of burn in iterations. - Parameters - ---------- - function_names : list, optional - List of name of burn in functions to use. All names in the provided - list muset be in the `burn_in_functions` dict. If none provided, will - use no burn-in functions. - min_iterations : int, optional - Minimum number of burn in iterations to use. The burn in iterations - returned by evaluate will be the maximum of this value - and the values returned by the burn in functions provided in - `function_names`. Default is 0. 
- - Examples - -------- - Initialize a `BurnIn` instance that will use `max_posterior` and - `posterior_step` as the burn in criteria: - - >>> import gwin - >>> burn_in = gwin.BurnIn(['max_posterior', 'posterior_step']) - - Use this `BurnIn` instance to find the burn-in iteration of each walker - in an inference result file: - - >>> from pycbc.io import InferenceFile - >>> fp = InferenceFile('gwin.hdf', 'r') - >>> burn_in.evaluate(gwin.samplers[fp.sampler_name], fp) - array([11486, 11983, 11894, ..., 11793, 11888, 11981]) +# +# ============================================================================= +# +# Burn in classes +# +# ============================================================================= +# - """ - def __init__(self, function_names, min_iterations=0): - if function_names is None: - function_names = [] - self.min_iterations = min_iterations - self.burn_in_functions = {fname: burn_in_functions[fname] - for fname in function_names} +class MCMCBurnInTests(object): + """Provides methods for estimating burn-in of an ensemble MCMC.""" + + available_tests = ('halfchain', 'min_iterations', 'max_posterior', + 'posterior_step', 'nacl', 'ks_test', + ) + + def __init__(self, sampler, burn_in_test, **kwargs): + self.sampler = sampler + # determine the burn-in tests that are going to be done + self.do_tests = get_vars_from_arg(burn_in_test) + self.burn_in_test = burn_in_test + self.burn_in_data = {t: {} for t in self.do_tests} + self.is_burned_in = False + self.burn_in_iteration = NOT_BURNED_IN_ITER + # Arguments specific to each test... + # for nacl: + self._nacls = int(kwargs.pop('nacls', 5)) + # for kstest: + self._ksthreshold = float(kwargs.pop('ks_threshold', 0.9)) + # for max_posterior and posterior_step + self._ndim = int(kwargs.pop('ndim', len(sampler.variable_params))) + # for min iterations + self._min_iterations = int(kwargs.pop('min_iterations', 0)) + + def _getniters(self, filename): + """Convenience function to get the number of iterations in the file. + + If `niterations` hasn't been written to the file yet, just returns 0. + """ + with self.sampler.io(filename, 'r') as fp: + try: + niters = fp.niterations + except KeyError: + niters = 0 + return niters - def evaluate(self, sampler, fp): - """Evaluates sampler's chains to find burn in. + def _getlogposts(self, filename): + """Convenience function for retrieving log posteriors. Parameters ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for - determing burn in. + filename : str + The file to read. Returns ------- - burnidx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. + array + The log posterior values. They are not flattened, so have dimension + nwalkers x niterations. 
""" - # if the number of iterations is < than the minimium desired, - # just return the number of iterations and all False - if fp.niterations < self.min_iterations: - return numpy.repeat(self.min_iterations, fp.nwalkers), \ - numpy.zeros(fp.nwalkers, dtype=bool) - # if the file already has burn in iterations saved, use those as a - # base - try: - burnidx = fp['burn_in_iterations'][:] - except KeyError: - # just use the minimum - burnidx = numpy.repeat(self.min_iterations, fp.nwalkers) - # start by assuming is burned in; the &= below will make this false - # if any test yields false - is_burned_in = numpy.ones(fp.nwalkers, dtype=bool) - if self.burn_in_functions != {}: - newidx = [] - for func in self.burn_in_functions.values(): - idx, state = func(sampler, fp) - newidx.append(idx) - is_burned_in &= state - newidx = numpy.vstack(newidx).max(axis=0) - # update the burn in idx if any test yields a larger iteration - mask = burnidx < newidx - burnidx[mask] = newidx[mask] - # if any burn-in idx are less than the min iterations, set to the - # min iterations - burnidx[burnidx < self.min_iterations] = self.min_iterations - return burnidx, is_burned_in - - def update(self, sampler, fp): - """Evaluates burn in and saves the updated indices to the given file. - - Parameters - ---------- - sampler : gwin.sampler - Sampler to determine burn in for. May be either an instance of a - `gwin.sampler`, or the class itself. - fp : InferenceFile - Open inference hdf file containing the samples to load for - determing burn in. - - Returns - ------- - burnidx : array - Array of indices giving the burn-in index for each chain. - is_burned_in : array - Array of booleans indicating whether each chain is burned in. + with self.sampler.io(filename, 'r') as fp: + samples = fp.read_raw_samples( + ['loglikelihood', 'logprior'], thin_start=0, thin_interval=1, + flatten=False) + logposts = samples['loglikelihood'] + samples['logprior'] + return logposts + + def halfchain(self, filename): + """Just uses half the chain as the burn-in iteration. + """ + niters = self._getniters(filename) + data = self.burn_in_data['halfchain'] + # this test cannot determine when something will burn in + # only when it was not burned in in the past + data['is_burned_in'] = True + data['burn_in_iteration'] = niters/2 + + def min_iterations(self, filename): + """Just checks that the sampler has been run for the minimum number + of iterations. 
+ """ + niters = self._getniters(filename) + data = self.burn_in_data['min_iterations'] + data['is_burned_in'] = self._min_iterations < niters + if data['is_burned_in']: + data['burn_in_iteration'] = self._min_iterations + else: + data['burn_in_iteration'] = NOT_BURNED_IN_ITER + + def max_posterior(self, filename): + """Applies max posterior test to self.""" + logposts = self._getlogposts(filename) + burn_in_idx, is_burned_in = max_posterior(logposts, self._ndim) + data = self.burn_in_data['max_posterior'] + # required things to store + data['is_burned_in'] = is_burned_in.all() + if data['is_burned_in']: + data['burn_in_iteration'] = burn_in_idx.max() + else: + data['burn_in_iteration'] = NOT_BURNED_IN_ITER + # additional info + data['iteration_per_walker'] = burn_in_idx + data['status_per_walker'] = is_burned_in + + def posterior_step(self, filename): + """Applies the posterior-step test.""" + logposts = self._getlogposts(filename) + burn_in_idx = numpy.array([posterior_step(logps, self._ndim) + for logps in logposts]) + data = self.burn_in_data['posterior_step'] + # this test cannot determine when something will burn in + # only when it was not burned in in the past + data['is_burned_in'] = True + data['burn_in_iteration'] = burn_in_idx.max() + # additional info + data['iteration_per_walker'] = burn_in_idx + + def nacl(self, filename): + """Burn in based on ACL. + + This applies the following test to determine burn in: + + 1. The first half of the chain is ignored. + + 2. An ACL is calculated from the second half. + + 3. If ``nacls`` times the ACL is < the number of iterations / 2, + the chain is considered to be burned in at the half-way point. """ - burnidx, is_burned_in = self.evaluate(sampler, fp) - sampler.burn_in_iterations = burnidx - sampler.write_burn_in_iterations(fp, burnidx, is_burned_in) - return burnidx, is_burned_in + niters = self._getniters(filename) + kstart = int(niters / 2.) + acls = self.sampler.compute_acl(filename, start_index=kstart) + is_burned_in = {param: (self._nacls * acl) < kstart + for (param, acl) in acls.items()} + data = self.burn_in_data['nacl'] + # required things to store + data['is_burned_in'] = all(is_burned_in.values()) + if data['is_burned_in']: + data['burn_in_iteration'] = kstart + else: + data['burn_in_iteration'] = NOT_BURNED_IN_ITER + # additional information + data['status_per_parameter'] = is_burned_in + # since we calculated it, save the acls to the sampler + self.sampler.acls = acls + + def ks_test(self, filename): + """Applies ks burn-in test.""" + with self.sampler.io(filename, 'r') as fp: + niters = fp.niterations + # get the samples from the mid point + samples1 = fp.read_raw_samples( + ['loglikelihood', 'logprior'], iteration=int(niters/2.)) + # get the last samples + samples2 = fp.read_raw_samples( + ['loglikelihood', 'logprior'], iteration=-1) + # do the test + # is_the_same is a dictionary of params --> bool indicating whether or + # not the 1D marginal is the same at the half way point + is_the_same = ks_test(samples1, samples2, threshold=self._ksthreshold) + data = self.burn_in_data['ks_test'] + # required things to store + data['is_burned_in'] = all(is_the_same.values()) + if data['is_burned_in']: + data['burn_in_iteration'] = int(niters/2.) 
+        else:
+            data['burn_in_iteration'] = NOT_BURNED_IN_ITER
+        # additional
+        data['status_per_parameter'] = is_the_same
+
+    def evaluate(self, filename):
+        """Runs all of the burn-in tests."""
+        for tst in self.do_tests:
+            getattr(self, tst)(filename)
+        # The iteration to use for burn-in depends on the logic in the burn-in
+        # test string. For example, if the test was 'max_posterior | nacl' and
+        # max_posterior burned-in at iteration 5000 while nacl burned in at
+        # iteration 6000, we'd want to use 5000 as the burn-in iteration.
+        # However, if the test was 'max_posterior & nacl', we'd want to use
+        # 6000 as the burn-in iteration. The code below handles all cases by
+        # doing the following: first, take the collection of burn in iterations
+        # from all the burn in tests that were applied. Next, cycle over the
+        # iterations in increasing order, checking which tests have burned in
+        # by that point. Then evaluate the burn-in string at that point to see
+        # if it passes, and if so, what the iteration is. The first point that
+        # the test passes is used as the burn-in iteration.
+        data = self.burn_in_data
+        burn_in_iters = numpy.unique([data[t]['burn_in_iteration']
+                                      for t in self.do_tests])
+        burn_in_iters.sort()
+        for ii in burn_in_iters:
+            test_results = {t: (data[t]['is_burned_in'] and
+                                0 <= data[t]['burn_in_iteration'] <= ii)
+                            for t in self.do_tests}
+            is_burned_in = eval(self.burn_in_test, {"__builtins__": None},
+                                test_results)
+            if is_burned_in:
+                break
+        self.is_burned_in = is_burned_in
+        if is_burned_in:
+            self.burn_in_iteration = ii
+        else:
+            self.burn_in_iteration = NOT_BURNED_IN_ITER
+
+    @classmethod
+    def from_config(cls, cp, sampler):
+        """Loads burn in from section [sampler-burn_in]."""
+        section = 'sampler'
+        tag = 'burn_in'
+        burn_in_test = cp.get_opt_tag(section, 'burn-in-test', tag)
+        kwargs = {}
+        if cp.has_option_tag(section, 'nacl', tag):
+            kwargs['nacl'] = int(cp.get_opt_tag(section, 'nacl', tag))
+        if cp.has_option_tag(section, 'ks-threshold', tag):
+            kwargs['ks_threshold'] = float(
+                cp.get_opt_tag(section, 'ks-threshold', tag))
+        if cp.has_option_tag(section, 'ndim', tag):
+            kwargs['ndim'] = int(
+                cp.get_opt_tag(section, 'ndim', tag))
+        if cp.has_option_tag(section, 'min-iterations', tag):
+            kwargs['min_iterations'] = int(
+                cp.get_opt_tag(section, 'min-iterations', tag))
+        return cls(sampler, burn_in_test, **kwargs)
diff --git a/gwin/io/__init__.py b/gwin/io/__init__.py
index 2e19621..c284bf6 100644
--- a/gwin/io/__init__.py
+++ b/gwin/io/__init__.py
@@ -17,5 +17,190 @@
 """I/O utilities for GWIn
 """
-from .hdf import InferenceFile
+from __future__ import absolute_import
+
+import os
+import shutil
+import logging
+import h5py as _h5py
+
+from .emcee import EmceeFile
 from .txt import InferenceTXTFile
+
+filetypes = {
+    EmceeFile.name: EmceeFile,
+}
+
+
+def loadfile(path, mode=None, filetype=None, **kwargs):
+    """Loads the given file using the appropriate InferenceFile class.
+
+    If ``filetype`` is not provided, this will try to retrieve the ``filetype``
+    from the file's ``attrs``. If the file does not exist yet, an IOError will
+    be raised if ``filetype`` is not provided.
+
+    Parameters
+    ----------
+    path : str
+        The filename to load.
+    mode : str, optional
+        What mode to load the file with, e.g., 'w' for write, 'r' for read,
+        'a' for append. Defaults to h5py.File's default mode, which is 'a'.
+    filetype : str, optional
+        Force the file to be loaded with the given class name. This must be
+        provided if creating a new file.
+ + Returns + ------- + filetype instance + An open file handler to the file. The class used for IO with the file + is determined by the ``filetype`` keyword (if provided) or the + ``filetype`` stored in the file (if not provided). + """ + if filetype is None: + # try to read the file to get its filetype + try: + with _h5py.File(path, 'r') as fp: + filetype = fp.attrs['filetype'] + except IOError: + # file doesn't exist, filetype must be provided + raise IOError("The file appears not to exist. In this case, " + "filetype must be provided.") + return filetypes[filetype](path, mode=mode, **kwargs) + +# +# ============================================================================= +# +# HDF Utilities +# +# ============================================================================= +# + + +def check_integrity(filename): + """Checks the integrity of an InferenceFile. + + Checks done are: + + * can the file open? + * do all of the datasets in the samples group have the same shape? + * can the first and last sample in all of the datasets in the samples + group be read? + + If any of these checks fail, an IOError is raised. + + Parameters + ---------- + filename: str + Name of an InferenceFile to check. + + Raises + ------ + ValueError + If the given file does not exist. + KeyError + If the samples group does not exist. + IOError + If any of the checks fail. + """ + # check that the file exists + if not os.path.exists(filename): + raise ValueError("file {} does not exist".format(filename)) + # if the file is corrupted such that it cannot be opened, the next line + # will raise an IOError + with loadfile(filename, 'r') as fp: + # check that all datasets in samples have the same shape + parameters = fp[fp.samples_group].keys() + group = fp.samples_group + '/{}' + # use the first parameter as a reference shape + ref_shape = fp[group.format(parameters[0])].shape + if not all(fp[group.format(param)].shape == ref_shape + for param in parameters): + raise IOError("not all datasets in the samples group have the " + "same shape") + # check that we can read the first/last sample + firstidx = tuple([0]*len(ref_shape)) + lastidx = tuple([-1]*len(ref_shape)) + for param in parameters: + fp[group.format(param)][firstidx] + fp[group.format(param)][lastidx] + + +def validate_checkpoint_files(checkpoint_file, backup_file): + """Checks if the given checkpoint and/or backup files are valid. + + The checkpoint file is considered valid if: + + * it passes all tests run by ``check_integrity``; + * it has at least one sample written to it (indicating at least one + checkpoint has happened). + + The same applies to the backup file. The backup file must also have the + same number of samples as the checkpoint file, otherwise, the backup is + considered invalid. + + If the checkpoint (backup) file is found to be valid, but the backup + (checkpoint) file is not valid, then the checkpoint (backup) is copied to + the backup (checkpoint). Thus, this function ensures that checkpoint and + backup files are either both valid or both invalid. + + Parameters + ---------- + checkpoint_file : string + Name of the checkpoint file. + backup_file : string + Name of the backup file. + + Returns + ------- + checkpoint_valid : bool + Whether or not the checkpoint (and backup) file may be used for loading + samples. 
+ """ + # check if checkpoint file exists and is valid + try: + check_integrity(checkpoint_file) + checkpoint_valid = True + except (ValueError, KeyError, IOError): + checkpoint_valid = False + # backup file + try: + check_integrity(backup_file) + backup_valid = True + except (ValueError, KeyError, IOError): + backup_valid = False + # check if there are any samples in the file; if not, we'll just start from + # scratch + if checkpoint_valid: + with loadfile(checkpoint_file, 'r') as fp: + try: + group = '{}/{}'.format(fp.samples_group, fp.variable_params[0]) + nsamples = fp[group].size + checkpoint_valid = nsamples != 0 + except KeyError: + checkpoint_valid = False + # check if there are any samples in the backup file + if backup_valid: + with loadfile(backup_file, 'r') as fp: + try: + group = '{}/{}'.format(fp.samples_group, fp.variable_params[0]) + backup_nsamples = fp[group].size + backup_valid = backup_nsamples != 0 + except KeyError: + backup_valid = False + # check that the checkpoint and backup have the same number of samples; + # if not, assume the checkpoint has the correct number + if checkpoint_valid and backup_valid: + backup_valid = nsamples == backup_nsamples + # decide what to do based on the files' statuses + if checkpoint_valid and not backup_valid: + # copy the checkpoint to the backup + logging.info("Backup invalid; copying checkpoint file") + shutil.copy(checkpoint_file, backup_file) + backup_valid = True + elif backup_valid and not checkpoint_valid: + logging.info("Checkpoint invalid; copying backup file") + # copy the backup to the checkpoint + shutil.copy(backup_file, checkpoint_file) + checkpoint_valid = True + return checkpoint_valid diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py new file mode 100644 index 0000000..8a1665c --- /dev/null +++ b/gwin/io/base_hdf.py @@ -0,0 +1,659 @@ +# Copyright (C) 2016 Christopher M. Biwer, Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# self.option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +"""This modules defines functions for reading and writing samples that the +inference samplers generate. +""" + +from __future__ import absolute_import + +import os +import sys +import logging +from abc import ABCMeta, abstractmethod, abstractproperty + +import numpy + +import h5py + +from pycbc import DYN_RANGE_FAC +from pycbc.io import FieldArray +from pycbc.types import FrequencySeries +from pycbc.waveform import parameters as wfparams + + +class BaseInferenceFile(h5py.File): + """Base class for all inference hdf files. + + This is a subclass of the h5py.File object. It adds functions for + handling reading and writing the samples from the samplers. + + Parameters + ----------- + path : str + The path to the HDF file. 
+ mode : {None, str} + The mode to open the file, eg. "w" for write and "r" for read. + """ + __metaclass__ = ABCMeta + + name = None + samples_group = 'samples' + sampler_group = 'sampler_info' + data_group = 'data' + injections_group = 'injections' + + def __init__(self, path, mode=None, **kwargs): + super(BaseInferenceFile, self).__init__(path, mode, **kwargs) + # check that file type matches self + try: + filetype = self.attrs['filetype'] + except KeyError: + if mode == 'w': + # first time creating the file, add this class's name + filetype = self.name + self.attrs['filetype'] = filetype + else: + filetype = None + if filetype != self.name: + raise ValueError("This file has filetype {}, whereas this class " + "is named {}. This indicates that the file was " + "not written by this class, and so cannot be " + "read by this class.".format(filetype, self.name)) + + def __getattr__(self, attr): + """Things stored in ``.attrs`` are promoted to instance attributes. + + Note that properties will be called before this, so if there are any + properties that share the same name as something in ``.attrs``, that + property will get returned. + """ + return self.attrs[attr] + + @abstractmethod + def write_samples(self, samples, **kwargs): + """This should write all of the provided samples. + + This function should be used to write both samples and model stats. + + Parameters + ---------- + fp : open hdf file + The file to write to. + samples : dict + Samples should be provided as a dictionary of numpy arrays. + \**kwargs : + Any other keyword args the sampler needs to write data. + """ + pass + + @abstractmethod + def write_sampler_metadata(self, sampler): + """This should write the given sampler's metadata to the file. + + This should also include the model's metadata. + """ + pass + + def parse_parameters(self, parameters, array_class=None): + """Parses a parameters arg to figure out what fields need to be loaded. + + Parameters + ---------- + parameters : (list of) strings + The parameter(s) to retrieve. A parameter can be the name of any + field in ``samples_group``, a virtual field or method of + ``FieldArray`` (as long as the file contains the necessary fields + to derive the virtual field or method), and/or a function of + these. + array_class : array class, optional + The type of array to use to parse the parameters. The class must + have a ``parse_parameters`` method. Default is to use a + ``FieldArray``. + + Returns + ------- + list : + A list of strings giving the fields to load from the file. + """ + # get the type of array class to use + if array_class is None: + array_class = FieldArray + # get the names of fields needed for the given parameters + possible_fields = self[self.samples_group].keys() + return array_class.parse_parameters(parameters, possible_fields) + + def read_samples(self, parameters, array_class=None, **kwargs): + """Reads samples for the given parameter(s). + + The ``parameters`` can be the name of any dataset in ``samples_group``, + a virtual field or method of ``FieldArray`` (as long as the file + contains the necessary fields to derive the virtual field or method), + and/or any numpy function of these. + + The ``parameters`` are parsed to figure out what datasets are needed. + Only those datasets will be loaded, and will be the base-level fields + of the returned ``FieldArray``. + + The ``static_params`` are also added as attributes of the returned + ``FieldArray``. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. 
+ parameters : (list of) strings + The parameter(s) to retrieve. + array_class : FieldArray-like class, optional + The type of array to return. The class must have ``from_kwargs`` + and ``parse_parameters`` methods. If None, will return a + ``FieldArray``. + \**kwargs : + All other keyword arguments are passed to ``read_raw_samples``. + + Returns + ------- + FieldArray : + The samples as a ``FieldArray``. + """ + # get the type of array class to use + if array_class is None: + array_class = FieldArray + # get the names of fields needed for the given parameters + possible_fields = self[self.samples_group].keys() + loadfields = array_class.parse_parameters(parameters, possible_fields) + samples = self.read_raw_samples(loadfields, **kwargs) + # convert to FieldArray + samples = array_class.from_kwargs(**samples) + # add the static params + for (p, val) in self.static_params.items(): + setattr(samples, p, val) + return samples + + @abstractmethod + def read_raw_samples(self, fields, **kwargs): + """Low level function for reading datasets in the samples group. + + This should return a dictionary of numpy arrays. + """ + pass + + @abstractmethod + def write_posterior(self, posterior_file, **kwargs): + """This should write a posterior plus any other metadata to the given + file. + + Parameters + ---------- + posterior_file : str + Name of the file to write to. + \**kwargs : + Any other keyword args the sampler needs to write the posterior. + """ + pass + + @property + def static_params(self): + """Returns a dictionary of the static_params. The keys are the argument + names, values are the value they were set to. + """ + return {arg: self.attrs[arg] for arg in self.attrs["static_params"]} + + @property + def effective_nsamples(self): + """Returns the effective number of samples stored in the file. + """ + try: + return self.attrs['effective_nsamples'] + except KeyError: + return 0 + + def write_effective_nsamples(self, effective_nsamples): + """Writes the effective number of samples stored in the file.""" + self.attrs['effective_nsamples'] = effective_nsamples + + @property + def thin_start(self): + """The default start index to use when reading samples. + + This tries to read from ``thin_start`` in the ``attrs``. If it isn't + there, just returns 0.""" + try: + return self.attrs['thin_start'] + except KeyError: + return 0 + + @property + def thin_interval(self): + """The default interval to use when reading samples. + + This tries to read from ``thin_interval`` in the ``attrs``. If it + isn't there, just returns 1. + """ + try: + return self.attrs['thin_interval'] + except KeyError: + return 1 + + @property + def thin_end(self): + """The defaut end index to use when reading samples. + + This tries to read from ``thin_end`` in the ``attrs``. If it isn't + there, just returns None. + """ + try: + return self.attrs['thin_end'] + except KeyError: + return None + + @property + def cmd(self): + """Returns the (last) saved command line. + + If the file was created from a run that resumed from a checkpoint, only + the last command line used is returned. + + Returns + ------- + cmd : string + The command line that created this InferenceFile. + """ + cmd = self.attrs["cmd"] + if isinstance(cmd, numpy.ndarray): + cmd = cmd[-1] + return cmd + + def write_logevidence(self, lnz, dlnz): + """Writes the given log evidence and its error. + + Results are saved to file's 'log_evidence' and 'dlog_evidence' + attributes. + + Parameters + ---------- + lnz : float + The log of the evidence. 
+ dlnz : float + The error in the estimate of the log evidence. + """ + self.attrs['log_evidence'] = lnz + self.attrs['dlog_evidence'] = dlnz + + @property + def log_evidence(self): + """Returns the log of the evidence and its error, if they exist in the + file. Raises a KeyError otherwise. + """ + return self.attrs["log_evidence"], self.attrs["dlog_evidence"] + + def write_random_state(self, group=None, state=None): + """Writes the state of the random number generator from the file. + + The random state is written to ``sampler_group``/random_state. + + Parameters + ---------- + group : str + Name of group to write random state to. + state : tuple, optional + Specify the random state to write. If None, will use + ``numpy.random.get_state()``. + """ + group = self.sampler_group if group is None else group + dataset_name = "/".join([group, "random_state"]) + if state is None: + state = numpy.random.get_state() + s, arr, pos, has_gauss, cached_gauss = state + if dataset_name in self: + self[dataset_name][:] = arr + else: + self.create_dataset(dataset_name, arr.shape, fletcher32=True, + dtype=arr.dtype) + self[dataset_name][:] = arr + self[dataset_name].attrs["s"] = s + self[dataset_name].attrs["pos"] = pos + self[dataset_name].attrs["has_gauss"] = has_gauss + self[dataset_name].attrs["cached_gauss"] = cached_gauss + + def read_random_state(self, group=None): + """Reads the state of the random number generator from the file. + + Parameters + ---------- + group : str + Name of group to read random state from. + + Returns + ------- + tuple + A tuple with 5 elements that can be passed to numpy.set_state. + """ + group = self.sampler_group if group is None else group + dataset_name = "/".join([group, "random_state"]) + arr = self[dataset_name][:] + s = self[dataset_name].attrs["s"] + pos = self[dataset_name].attrs["pos"] + has_gauss = self[dataset_name].attrs["has_gauss"] + cached_gauss = self[dataset_name].attrs["cached_gauss"] + return s, arr, pos, has_gauss, cached_gauss + + def write_strain(self, strain_dict, group=None): + """Writes strain for each IFO to file. + + Parameters + ----------- + strain : {dict, FrequencySeries} + A dict of FrequencySeries where the key is the IFO. + group : {None, str} + The group to write the strain to. If None, will write to the top + level. + """ + subgroup = self.data_group + "/{ifo}/strain" + if group is None: + group = subgroup + else: + group = '/'.join([group, subgroup]) + for ifo, strain in strain_dict.items(): + self[group.format(ifo=ifo)] = strain + self[group.format(ifo=ifo)].attrs['delta_t'] = strain.delta_t + self[group.format(ifo=ifo)].attrs['start_time'] = \ + float(strain.start_time) + + def write_stilde(self, stilde_dict, group=None): + """Writes stilde for each IFO to file. + + Parameters + ----------- + stilde : {dict, FrequencySeries} + A dict of FrequencySeries where the key is the IFO. + group : {None, str} + The group to write the strain to. If None, will write to the top + level. + """ + subgroup = self.data_group + "/{ifo}/stilde" + if group is None: + group = subgroup + else: + group = '/'.join([group, subgroup]) + for ifo, stilde in stilde_dict.items(): + self[group.format(ifo=ifo)] = stilde + self[group.format(ifo=ifo)].attrs['delta_f'] = stilde.delta_f + self[group.format(ifo=ifo)].attrs['epoch'] = float(stilde.epoch) + + def write_psd(self, psds, group=None): + """Writes PSD for each IFO to file. + + Parameters + ----------- + psds : {dict, FrequencySeries} + A dict of FrequencySeries where the key is the IFO. 
+        group : {None, str}
+            The group to write the psd to. Default is ``data_group``.
+        """
+        subgroup = self.data_group + "/{ifo}/psds/0"
+        if group is None:
+            group = subgroup
+        else:
+            group = '/'.join([group, subgroup])
+        for ifo in psds:
+            self[group.format(ifo=ifo)] = psds[ifo]
+            self[group.format(ifo=ifo)].attrs['delta_f'] = psds[ifo].delta_f
+
+    def write_injections(self, injection_file):
+        """Writes injection parameters from the given injection file.
+
+        Everything in the injection file is copied to ``injections_group``.
+
+        Parameters
+        ----------
+        injection_file : str
+            Path to HDF injection file.
+        """
+        try:
+            with h5py.File(injection_file, "r") as fp:
+                super(BaseInferenceFile, self).copy(fp, self.injections_group)
+        except IOError:
+            logging.warn("Could not read %s as an HDF file", injection_file)
+
+    def write_command_line(self):
+        """Writes command line to attributes.
+
+        The command line is written to the file's ``attrs['cmd']``. If this
+        attribute already exists in the file (this can happen when resuming
+        from a checkpoint), ``attrs['cmd']`` will be a list storing the current
+        command line and all previous command lines.
+        """
+        cmd = [" ".join(sys.argv)]
+        try:
+            previous = self.attrs["cmd"]
+            if isinstance(previous, str):
+                # convert to list
+                previous = [previous]
+            elif isinstance(previous, numpy.ndarray):
+                previous = previous.tolist()
+        except KeyError:
+            previous = []
+        self.attrs["cmd"] = cmd + previous
+
+    @abstractmethod
+    def write_resume_point(self):
+        """Should write the point at which a sampler starts up.
+
+        How the resume point is indexed is up to the sampler. For example,
+        MCMC samplers use the number of iterations that are stored in the
+        checkpoint file.
+        """
+        pass
+
+    def get_slice(self, thin_start=None, thin_interval=None, thin_end=None):
+        """Formats a slice using the given arguments that can be used to
+        retrieve a thinned array from an InferenceFile.
+
+        Parameters
+        ----------
+        thin_start : int, optional
+            The starting index to use. If None, will use the ``thin_start``
+            attribute.
+        thin_interval : int, optional
+            The interval to use. If None, will use the ``thin_interval``
+            attribute.
+        thin_end : int, optional
+            The end index to use. If None, will use the ``thin_end`` attribute.
+
+        Returns
+        -------
+        slice :
+            The slice needed.
+        """
+        if thin_start is None:
+            thin_start = self.thin_start
+        if thin_interval is None:
+            thin_interval = self.thin_interval
+        if thin_end is None:
+            thin_end = self.thin_end
+        return slice(thin_start, thin_end, thin_interval)
+
+    def copy_metadata(self, other):
+        """Copies all metadata from this file to the other file.
+
+        Metadata is defined as everything in the top-level ``.attrs``.
+
+        Parameters
+        ----------
+        other : InferenceFile
+            An open inference file to write the data to.
+        """
+        logging.info("Copying metadata")
+        # copy attributes
+        for key in self.attrs.keys():
+            other.attrs[key] = self.attrs[key]
+
+    def copy_info(self, other, ignore=None):
+        """Copies "info" from this file to the other.
+
+        "Info" is defined as all groups that are not the samples group.
+
+        Parameters
+        ----------
+        other : output file
+            The output file. Must be an hdf file.
+        ignore : (list of) str
+            Don't copy the given groups.
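+
+        Examples
+        --------
+        An illustrative sketch (assuming ``fp`` is an open inference file and
+        ``other`` is an open output file), copying everything except the
+        injections group:
+
+        >>> fp.copy_info(other, ignore='injections')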
+        """
+        logging.info("Copying info")
+        # copy non-samples/stats data
+        if ignore is None:
+            ignore = []
+        if isinstance(ignore, (str, unicode)):
+            ignore = [ignore]
+        ignore = set(ignore + [self.samples_group])
+        copy_groups = set(self.keys()) - ignore
+        for key in copy_groups:
+            super(BaseInferenceFile, self).copy(key, other)
+
+    def copy_samples(self, other, parameters=None, parameter_names=None,
+                     read_args=None, write_args=None):
+        """Copies samples to the other file.
+
+        Parameters
+        ----------
+        other : InferenceFile
+            An open inference file to write to.
+        parameters : list of str, optional
+            List of parameters to copy. If None, will copy all parameters.
+        parameter_names : dict, optional
+            Rename one or more parameters to the given name. The dictionary
+            should map parameter -> parameter name. If None, will just use the
+            original parameter names.
+        read_args : dict, optional
+            Arguments to pass to ``read_samples``.
+        write_args : dict, optional
+            Arguments to pass to ``write_samples``.
+        """
+        if read_args is None:
+            read_args = {}
+        if write_args is None:
+            write_args = {}
+        # select the samples to copy
+        logging.info("Reading samples to copy")
+        if parameters is None:
+            parameters = self.variable_params
+        # if list of desired parameters is different, rename
+        if set(parameters) != set(self.variable_params):
+            other.attrs['variable_params'] = parameters
+        samples = self.read_samples(parameters, **read_args)
+        logging.info("Copying {} samples".format(samples.size))
+        # if different parameter names are desired, get them from the samples
+        if parameter_names:
+            arrs = {pname: samples[p] for p, pname in parameter_names.items()}
+            arrs.update({p: samples[p] for p in parameters if
+                         p not in parameter_names})
+            samples = FieldArray.from_kwargs(**arrs)
+            other.attrs['variable_params'] = samples.fieldnames
+        logging.info("Writing samples")
+        other.write_samples(samples, **write_args)
+
+    def copy(self, other, ignore=None, parameters=None, parameter_names=None,
+             read_args=None, write_args=None):
+        """Copies metadata, info, and samples in this file to another file.
+
+        Parameters
+        ----------
+        other : str or InferenceFile
+            The file to write to. May be either a string giving a filename,
+            or an open hdf file. If the former, the file will be opened with
+            the write attribute (note that if a file already exists with that
+            name, it will be deleted).
+        ignore : (list of) strings
+            Don't copy the given groups. If the samples group is included, no
+            samples will be copied.
+        parameters : list of str, optional
+            List of parameters in the samples group to copy. If None, will copy
+            all parameters.
+        parameter_names : dict, optional
+            Rename one or more parameters to the given name. The dictionary
+            should map parameter -> parameter name. If None, will just use the
+            original parameter names.
+        read_args : dict, optional
+            Arguments to pass to ``read_samples``.
+        write_args : dict, optional
+            Arguments to pass to ``write_samples``.
+
+        Returns
+        -------
+        InferenceFile
+            The open file handler to other.
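+
+        Examples
+        --------
+        A hypothetical sketch (the file name and arguments shown are
+        illustrative), copying a thinned posterior to a new file:
+
+        >>> posterior_file = fp.copy('posterior.hdf', ignore='sampler_info',
+        ...                          read_args={'thin_interval': 10})
+        >>> posterior_file.close()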
+ """ + if not isinstance(other, h5py.File): + # check that we're not trying to overwrite this file + if other == self.name: + raise IOError("destination is the same as this file") + other = self.__class__(other, 'w') + # metadata + self.copy_metadata(other) + # info + if ignore is None: + ignore = [] + if isinstance(ignore, (str, unicode)): + ignore = [ignore] + self.copy_info(other, ignore=ignore) + # samples + if self.samples_group not in ignore: + self.copy_samples(other, parameters=parameters, + parameter_names=parameter_names, + read_args=read_args, + write_args=write_args) + # if any down selection was done, re-set the default + # thin-start/interval/end + p = self[self.samples_group].keys()[0] + my_shape = self[self.samples_group][p].shape + p = other[other.samples_group].keys()[0] + other_shape = other[other.samples_group][p].shape + if my_shape != other_shape: + other.attrs['thin_start'] = 0 + other.attrs['thin_interval'] = 1 + other.attrs['thin_end'] = None + return other + + +def write_kwargs_to_hdf_attrs(attrs, **kwargs): + """Writes the given keywords to the given ``attrs``. + + If any keyword argument points to a dict, the keyword will point to a + list of the dict's keys. Each key is then written to the attrs with its + corresponding value. + + Parameters + ---------- + attrs : an HDF attrs + Can be either the ``attrs`` of the hdf file, or any group in a file. + \**kwargs : + The keywords to write. + """ + for arg, val in kwargs.items(): + if val is None: + val = str(None) + if isinstance(val, dict): + attrs[arg] = val.keys() + # just call self again with the dict as kwargs + write_kwargs_to_hdf_attrs(attrs, **val) + else: + attrs[arg] = val diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py new file mode 100644 index 0000000..f77247f --- /dev/null +++ b/gwin/io/base_mcmc.py @@ -0,0 +1,251 @@ +# Copyright (C) 2016 Christopher M. Biwer, Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# self.option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +"""Provides I/O that is specific to MCMC samplers. +""" + +from __future__ import absolute_import + +from abc import (ABCMeta, abstractmethod) + +import numpy +from .base_hdf import write_kwargs_to_hdf_attrs + + +class MCMCIO(object): + """Abstract base class that provides some IO functions for ensemble MCMCs. + """ + __metaclass__ = ABCMeta + + @abstractmethod + def read_acls(self): + """Should return all of the individual chains' acls. + """ + pass + + def write_samples(self, samples, parameters=None, + start_iteration=None, max_iterations=None): + """Writes samples to the given file. + + Results are written to: + + ``fp[samples_group/{vararg}]``, + + where ``{vararg}`` is the name of a model params. 
The samples are + written as an ``nwalkers x niterations`` array. + + Parameters + ----------- + samples : dict + The samples to write. Each array in the dictionary should have + shape nwalkers x niterations. + parameters : list, optional + Only write the specified parameters to the file. If None, will + write all of the keys in the ``samples`` dict. + start_iteration : int, optional + Write results to the file's datasets starting at the given + iteration. Default is to append after the last iteration in the + file. + max_iterations : int, optional + Set the maximum size that the arrays in the hdf file may be resized + to. Only applies if the samples have not previously been written + to file. The default (None) is to use the maximum size allowed by + h5py. + """ + nwalkers, niterations = samples.values()[0].shape + assert all(p.shape == (nwalkers, niterations) + for p in samples.values()), ( + "all samples must have the same shape") + if max_iterations is not None and max_iterations < niterations: + raise IndexError("The provided max size is less than the " + "number of iterations") + group = self.samples_group + '/{name}' + if parameters is None: + parameters = samples.keys() + # loop over number of dimensions + for param in parameters: + dataset_name = group.format(name=param) + istart = start_iteration + try: + fp_niterations = self[dataset_name].shape[-1] + if istart is None: + istart = fp_niterations + istop = istart + niterations + if istop > fp_niterations: + # resize the dataset + self[dataset_name].resize(istop, axis=1) + except KeyError: + # dataset doesn't exist yet + if istart is not None and istart != 0: + raise ValueError("non-zero start_iteration provided, " + "but dataset doesn't exist yet") + istart = 0 + istop = istart + niterations + self.create_dataset(dataset_name, (nwalkers, istop), + maxshape=(nwalkers, max_iterations), + dtype=samples[param].dtype, + fletcher32=True) + self[dataset_name][:, istart:istop] = samples[param] + + def read_raw_samples(self, fields, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, walkers=None, flatten=True): + """Base function for reading samples. + + Parameters + ----------- + fields : list + The list of field names to retrieve. Must be names of datasets in + the ``samples_group``. + + Returns + ------- + dict + A dictionary of field name -> numpy array pairs. 
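+
+        Examples
+        --------
+        An illustrative sketch (the field name and indices are hypothetical),
+        keeping every 10th iteration from the first two walkers without
+        flattening:
+
+        >>> arrays = fp.read_raw_samples(['x'], walkers=[0, 1],
+        ...                              thin_interval=10, flatten=False)
+        >>> arrays['x'].ndim
+        2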
+        """
+        if isinstance(fields, (str, unicode)):
+            fields = [fields]
+        # walkers to load
+        if walkers is not None:
+            widx = numpy.zeros(self.nwalkers, dtype=bool)
+            widx[walkers] = True
+        else:
+            widx = slice(0, None)
+        # get the slice to use
+        if iteration is not None:
+            get_index = iteration
+        else:
+            get_index = self.get_slice(thin_start=thin_start,
+                                       thin_end=thin_end,
+                                       thin_interval=thin_interval)
+        # load
+        group = self.samples_group + '/{name}'
+        arrays = {}
+        for name in fields:
+            arr = self[group.format(name=name)][widx, get_index]
+            if flatten:
+                arr = arr.flatten()
+            arrays[name] = arr
+        return arrays
+
+    def write_resume_point(self):
+        """Keeps a list of the number of iterations that were in a file when a
+        run was resumed from a checkpoint."""
+        try:
+            resume_pts = self.attrs["resume_points"].tolist()
+        except KeyError:
+            resume_pts = []
+        try:
+            niterations = self.niterations
+        except KeyError:
+            niterations = 0
+        resume_pts.append(niterations)
+        self.attrs["resume_points"] = resume_pts
+
+    def write_niterations(self, niterations):
+        """Writes the given number of iterations to the sampler group."""
+        self[self.sampler_group].attrs['niterations'] = niterations
+
+    @property
+    def niterations(self):
+        """Returns the number of iterations the sampler was run for."""
+        return self[self.sampler_group].attrs['niterations']
+
+    def write_sampler_metadata(self, sampler):
+        """Writes the sampler's metadata."""
+        self.attrs['sampler'] = sampler.name
+        if self.sampler_group not in self.keys():
+            # create the sampler group
+            self.create_group(self.sampler_group)
+        self[self.sampler_group].attrs['nwalkers'] = sampler.nwalkers
+        # write the model's metadata
+        sampler.model.write_metadata(self)
+
+    def write_acls(self, acls):
+        """Writes the given autocorrelation lengths.
+
+        The ACL of each parameter is saved to
+        ``[sampler_group]/acls/{param}``. The maximum over all the
+        parameters is saved to the file's 'acl' attribute.
+
+        Parameters
+        ----------
+        acls : dict
+            A dictionary of ACLs keyed by the parameter.
+
+        Returns
+        -------
+        acl :
+            The maximum of the ACLs over all parameters that was written to
+            the file.
+        """
+        group = self.sampler_group + '/acls/{}'
+        # write the individual acls
+        for param in acls:
+            try:
+                # we need to use the write_direct function because it's
+                # apparently the only way to update scalars in h5py
+                self[group.format(param)].write_direct(
+                    numpy.array(acls[param]))
+            except KeyError:
+                # dataset doesn't exist yet
+                self[group.format(param)] = acls[param]
+        # write the maximum over all params
+        acl = numpy.array(acls.values()).max()
+        self[self.sampler_group].attrs['acl'] = acl
+        # set the default thin interval to be the acl (if it is finite)
+        if numpy.isfinite(acl):
+            self.attrs['thin_interval'] = acl
+        return acl
+
+    def read_acls(self):
+        """Reads the acls of all the parameters.
+
+        Returns
+        -------
+        dict
+            A dictionary of the ACLs, keyed by the parameter name.
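+
+        Examples
+        --------
+        For example (assuming ACLs have already been written with
+        ``write_acls``):
+
+        >>> acls = fp.read_acls()
+        >>> max_acl = max(acls.values())  # the longest ACL over all params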
+        """
+        group = self[self.sampler_group]['acls']
+        return {param: group[param].value for param in group.keys()}
+
+    def write_burn_in(self, burn_in):
+        """Write the given burn-in data to the file."""
+        group = self[self.sampler_group]
+        group.attrs['burn_in_test'] = burn_in.burn_in_test
+        group.attrs['is_burned_in'] = burn_in.is_burned_in
+        group.attrs['burn_in_iteration'] = burn_in.burn_in_iteration
+        # set the default thin_start to be the burn_in_iteration
+        self.attrs['thin_start'] = burn_in.burn_in_iteration
+        # write individual test data
+        for tst in burn_in.burn_in_data:
+            key = 'burn_in_tests/{}'.format(tst)
+            try:
+                attrs = group[key].attrs
+            except KeyError:
+                group.create_group(key)
+                attrs = group[key].attrs
+            write_kwargs_to_hdf_attrs(attrs, **burn_in.burn_in_data[tst])
diff --git a/gwin/io/emcee.py b/gwin/io/emcee.py
new file mode 100644
index 0000000..8331226
--- /dev/null
+++ b/gwin/io/emcee.py
@@ -0,0 +1,75 @@
+# Copyright (C) 2018 Collin Capano
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
+#
+# =============================================================================
+#
+#                                   Preamble
+#
+# =============================================================================
+#
+"""Provides IO for the emcee sampler.
+"""
+
+import numpy
+
+from .base_hdf import BaseInferenceFile
+from .base_mcmc import MCMCIO
+
+
+class EmceeFile(MCMCIO, BaseInferenceFile):
+    """Class to handle file IO for the ``emcee`` sampler."""
+
+    name = 'emcee_file'
+
+    def read_acceptance_fraction(self, walkers=None):
+        """Reads the acceptance fraction.
+
+        Parameters
+        -----------
+        walkers : {None, (list of) int}
+            The walker index (or a list of indices) to retrieve. If None,
+            samples from all walkers will be obtained.
+
+        Returns
+        -------
+        array
+            Array of acceptance fractions with shape (requested walkers,).
+        """
+        group = self.sampler_group + '/acceptance_fraction'
+        if walkers is None:
+            wmask = numpy.ones(self.nwalkers, dtype=bool)
+        else:
+            wmask = numpy.zeros(self.nwalkers, dtype=bool)
+            wmask[walkers] = True
+        return self[group][wmask]
+
+    def write_acceptance_fraction(self, acceptance_fraction):
+        """Write acceptance_fraction data to file. Results are written to
+        the ``[sampler_group]/acceptance_fraction``.
+
+        Parameters
+        -----------
+        acceptance_fraction : numpy.ndarray
+            Array of acceptance fractions to write.
+        """
+        group = self.sampler_group + '/acceptance_fraction'
+        try:
+            self[group][:] = acceptance_fraction
+        except KeyError:
+            # dataset doesn't exist yet, create it
+            self[group] = acceptance_fraction
+
+    def write_posterior(self, filename, **kwargs):
+        pass
diff --git a/gwin/io/hdf.py b/gwin/io/hdf.py
deleted file mode 100644
index 1799694..0000000
--- a/gwin/io/hdf.py
+++ /dev/null
@@ -1,801 +0,0 @@
-# Copyright (C) 2016 Christopher M.
Biwer -# This program is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation; either version 3 of the License, or (at your -# self.option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -# Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - - -# -# ============================================================================= -# -# Preamble -# -# ============================================================================= -# -"""This modules defines functions for reading and writing samples that the -inference samplers generate. -""" - -import os -import sys -import logging - -import numpy - -import h5py - -from pycbc import DYN_RANGE_FAC -from pycbc.io import FieldArray -from pycbc.types import FrequencySeries -from pycbc.waveform import parameters as wfparams - -from .. import sampler as gwin_sampler - - -class _PosteriorOnlyParser(object): - """Provides interface for reading/writing samples from/to an InferenceFile - that contains flattened posterior samples. - """ - @staticmethod - def _read_fields(fp, fields_group, fields, array_class, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None): - """Reads fields from the given file. - """ - if iteration is not None: - get_index = iteration - else: - get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval) - # load - arrays = {} - group = fields_group + '/{}' - arrays = {field: fp[group.format(field)][get_index] - for field in fields} - return array_class.from_kwargs(**arrays) - - @classmethod - def read_samples(cls, fp, parameters, samples_group=None, - thin_start=0, thin_end=None, thin_interval=1, - iteration=None, array_class=None): - """Reads posterior samples from a posterior-only file. - """ - # get the group to load from - if samples_group is None: - samples_group = fp.samples_group - # get the type of array class to use - if array_class is None: - array_class = FieldArray - # get the names of fields needed for the given parameters - possible_fields = fp[samples_group].keys() - loadfields = array_class.parse_parameters(parameters, possible_fields) - return cls._read_fields(fp, samples_group, loadfields, array_class, - thin_start=thin_start, - thin_interval=thin_interval, thin_end=thin_end, - iteration=iteration) - - @staticmethod - def write_samples_group(fp, samples_group, fields, samples): - """Writes the given samples to the given samples group. - """ - for field in samples.fieldnames: - grp = '{}/{}'.format(samples_group, field) - fp[grp] = samples[field] - - @classmethod - def n_independent_samples(cls, fp): - """Returns the number of independent samples stored in the file. - """ - return cls.read_samples(fp, fp.variable_params[0]).size - - -class InferenceFile(h5py.File): - """ A subclass of the h5py.File object that has extra functions for - handling reading and writing the samples from the samplers. - - Parameters - ----------- - path : str - The path to the HDF file. - mode : {None, str} - The mode to open the file, eg. "w" for write and "r" for read. 
- """ - name = "hdf" - samples_group = 'samples' - stats_group = 'model_stats' - sampler_group = 'sampler_states' - - def __init__(self, path, mode=None, **kwargs): - super(InferenceFile, self).__init__(path, mode, **kwargs) - - @property - def posterior_only(self): - """Whether the file only contains flattened posterior samples. - """ - try: - return self.attrs['posterior_only'] - except KeyError: - return False - - @property - def sampler_name(self): - """Returns the name of the sampler that was used.""" - return self.attrs["sampler"] - - @property - def sampler_class(self): - """Returns the sampler class that was used.""" - try: - sampler = self.sampler_name - except KeyError: - return None - return gwin_sampler.samplers[sampler] - - @property - def samples_parser(self): - """Returns the class to use to read/write samples from/to the file.""" - if self.posterior_only: - return _PosteriorOnlyParser - else: - return self.sampler_class - - @property - def model_name(self): - """Returns the name of the model that was used.""" - return self.attrs["model"] - - @property - def variable_params(self): - """Returns list of variable_params. - - Returns - ------- - variable_params : {list, str} - List of str that contain variable_params keys. - """ - return self.attrs["variable_params"] - - @property - def static_params(self): - """Returns a dictionary of the static_params. The keys are the argument - names, values are the value they were set to. - """ - return {arg: self.attrs[arg] for arg in self.attrs["static_params"]} - - @property - def sampling_params(self): - """Returns the parameters that were used to sample. - - Returns - ------- - sampling_params : {list, str} - List of the sampling params. - """ - return self.attrs["sampling_params"] - - @property - def lognl(self): - """Returns the log noise likelihood.""" - return self.attrs["lognl"] - - @property - def niterations(self): - """Returns number of iterations performed. - - Returns - ------- - niterations : int - Number of iterations performed. - """ - return self.attrs["niterations"] - - @property - def n_independent_samples(self): - """Returns the number of independent samples stored in the file. - """ - return self.samples_parser.n_independent_samples(self) - - @property - def burn_in_iterations(self): - """Returns number of iterations in the burn in. - """ - return self.attrs["burn_in_iterations"] - - @property - def is_burned_in(self): - """Returns whether or not the sampler is burned in. - """ - return self.attrs["is_burned_in"] - - @property - def nwalkers(self): - """Returns number of walkers used. - - Returns - ------- - nwalkesr : int - Number of walkers used. - """ - return self.attrs["nwalkers"] - - @property - def ntemps(self): - """Returns number of temperatures used.""" - return self.attrs["ntemps"] - - @property - def acl(self): - """ Returns the saved autocorelation length (ACL). - - Returns - ------- - acl : {int, float} - The ACL. - """ - return self.attrs["acl"] - - @property - def cmd(self): - """Returns the (last) saved command line. - - If the file was created from a run that resumed from a checkpoint, only - the last command line used is returned. - - Returns - ------- - cmd : string - The command line that created this InferenceFile. - """ - cmd = self.attrs["cmd"] - if isinstance(cmd, numpy.ndarray): - cmd = cmd[-1] - return cmd - - @property - def resume_points(self): - """The iterations at which a run was resumed from checkpoint. 
- - Returns - ------- - resume_points : array or None - An array of integers giving the points at which the run resumed. - - Raises - ------ - KeyError - If the run never resumed from a checkpoint. - """ - return self.attrs['resume_points'] - - @property - def log_evidence(self): - """Returns the log of the evidence and its error, if they exist in the - file. Raises a KeyError otherwise. - """ - return self.attrs["log_evidence"], self.attrs["dlog_evidence"] - - def read_samples(self, parameters, samples_group=None, **kwargs): - """Reads samples from the file. - - Parameters - ----------- - parameters : (list of) strings - The parameter(s) to retrieve. A parameter can be the name of any - field in `samples_group`, a virtual field or method of - `FieldArray` (as long as the file contains the necessary fields - to derive the virtual field or method), and/or a function of - these. - samples_group : str - Group in HDF InferenceFile that parameters belong to. - **kwargs : - The rest of the keyword args are passed to the sampler's - `read_samples` method. - - Returns - ------- - FieldArray - Samples for the given parameters, as an instance of a - FieldArray. - """ - # get the appropriate sampler class - samples_group = samples_group if samples_group else self.samples_group - return self.samples_parser.read_samples(self, parameters, - samples_group=samples_group, - **kwargs) - - def read_model_stats(self, **kwargs): - """Reads model stats from self. - - Parameters - ----------- - **kwargs : - The keyword args are passed to the sampler's - ``read_model_stats`` method. - - Returns - ------- - stats : {FieldArray, None} - Likelihood stats in the file, as a FieldArray. The fields of the - array are the names of the stats that are in the ``model_stats`` - group. - """ - parameters = self[self.stats_group].keys() - return self.read_samples(parameters, samples_group=self.stats_group, - **kwargs) - - def read_acceptance_fraction(self, **kwargs): - """Returns the acceptance fraction that was written to the file. - - Parameters - ---------- - **kwargs : - All keyword arguments are passed to the sampler's - `read_acceptance_fraction` function. - Returns - ------- - numpy.array - The acceptance fraction. - """ - return self.sampler_class.read_acceptance_fraction(self, **kwargs) - - def read_acls(self): - """Returns all of the individual chains' acls. See the `read_acls` - function of this file's sampler for more details. - """ - return self.sampler_class.read_acls(self) - - def read_label(self, parameter, error_on_none=False): - """Returns the label for the parameter. - - Parameters - ----------- - parameter : str - Name of parameter to get a label for. Will first try to retrieve - a label from this file's "label" attributes. If the parameter - is not found there, will look for a label from - pycbc.waveform.parameters. - error_on_none : {False, bool} - If True, will raise a ValueError if a label cannot be found, or if - the label is None. Otherwise, the parameter will just be returned - if no label can be found. - - Returns - ------- - label : str - A formatted string for the name of the paramter. 
- """ - # get label - try: - label = self[parameter].attrs["label"] - except KeyError: - # try looking in pycbc.waveform.parameters - try: - label = getattr(wfparams, parameter).label - except AttributeError: - label = None - if label is None: - if error_on_none: - raise ValueError("Cannot find a label for paramter %s" % ( - parameter)) - else: - return parameter - return label - - def read_random_state(self, group=None): - """ Reads the state of the random number generator from the file. - - Parameters - ---------- - group : str - Name of group to read random state from. - - Returns - ------- - tuple - A tuple with 5 elements that can be passed to numpy.set_state. - """ - group = self.sampler_group if group is None else group - dataset_name = "/".join([group, "random_state"]) - arr = self[dataset_name][:] - s = self[dataset_name].attrs["s"] - pos = self[dataset_name].attrs["pos"] - has_gauss = self[dataset_name].attrs["has_gauss"] - cached_gauss = self[dataset_name].attrs["cached_gauss"] - return s, arr, pos, has_gauss, cached_gauss - - def write_strain(self, strain_dict, group=None): - """Writes strain for each IFO to file. - - Parameters - ----------- - strain : {dict, FrequencySeries} - A dict of FrequencySeries where the key is the IFO. - group : {None, str} - The group to write the strain to. If None, will write to the top - level. - """ - subgroup = "{ifo}/strain" - if group is None: - group = subgroup - else: - group = '/'.join([group, subgroup]) - for ifo, strain in strain_dict.items(): - self[group.format(ifo=ifo)] = strain - self[group.format(ifo=ifo)].attrs['delta_t'] = strain.delta_t - self[group.format(ifo=ifo)].attrs['start_time'] = \ - float(strain.start_time) - - def write_stilde(self, stilde_dict, group=None): - """Writes stilde for each IFO to file. - - Parameters - ----------- - stilde : {dict, FrequencySeries} - A dict of FrequencySeries where the key is the IFO. - group : {None, str} - The group to write the strain to. If None, will write to the top - level. - """ - subgroup = "{ifo}/stilde" - if group is None: - group = subgroup - else: - group = '/'.join([group, subgroup]) - for ifo, stilde in stilde_dict.items(): - self[group.format(ifo=ifo)] = stilde - self[group.format(ifo=ifo)].attrs['delta_f'] = stilde.delta_f - self[group.format(ifo=ifo)].attrs['epoch'] = float(stilde.epoch) - - def write_psd(self, psds, low_frequency_cutoff, group=None): - """Writes PSD for each IFO to file. - - Parameters - ----------- - psds : {dict, FrequencySeries} - A dict of FrequencySeries where the key is the IFO. - low_frequency_cutoff : {dict, float} - A dict of the low-frequency cutoff where the key is the IFO. The - minimum value will be stored as an attr in the File. - group : {None, str} - The group to write the strain to. If None, will write to the top - level. - """ - subgroup = "{ifo}/psds/0" - if group is None: - group = subgroup - else: - group = '/'.join([group, subgroup]) - self.attrs["low_frequency_cutoff"] = min(low_frequency_cutoff.values()) - for ifo in psds: - self[group.format(ifo=ifo)] = psds[ifo] - self[group.format(ifo=ifo)].attrs['delta_f'] = psds[ifo].delta_f - - def write_data(self, strain_dict=None, stilde_dict=None, - psd_dict=None, low_frequency_cutoff_dict=None, - group=None): - """Writes the strain/stilde/psd. - - Parameters - ---------- - strain_dict : {None, dict} - A dictionary of strains. If None, no strain will be written. - stilde_dict : {None, dict} - A dictionary of stilde. If None, no stilde will be written. 
- psd_dict : {None, dict} - A dictionary of psds. If None, no psds will be written. - low_freuency_cutoff_dict : {None, dict} - A dictionary of low frequency cutoffs used for each detector in - `psd_dict`; must be provided if `psd_dict` is not None. - group : {None, str} - The group to write the strain to. If None, will write to the top - level. - """ - # save PSD - if psd_dict is not None: - if low_frequency_cutoff_dict is None: - raise ValueError("must provide low_frequency_cutoff_dict if " - "saving psds to output") - # apply dynamic range factor for saving PSDs since - # plotting code expects it - psd_dyn_dict = {} - for key, val in psd_dict.iteritems(): - psd_dyn_dict[key] = FrequencySeries(val*DYN_RANGE_FAC**2, - delta_f=val.delta_f) - self.write_psd(psds=psd_dyn_dict, - low_frequency_cutoff=low_frequency_cutoff_dict, - group=group) - - # save stilde - if stilde_dict is not None: - self.write_stilde(stilde_dict, group=group) - - # save strain if desired - if strain_dict is not None: - self.write_strain(strain_dict, group=group) - - def write_injections(self, injection_file, ifo): - """ Writes injection parameters for an IFO to file. - - Parameters - ---------- - injection_file : str - Path to HDF injection file. - ifo : str - IFO name. - """ - subgroup = "{ifo}/injections" - self.create_group(subgroup.format(ifo=ifo)) - try: - with h5py.File(injection_file, "r") as fp: - for param in fp.keys(): - self[subgroup.format(ifo=ifo)][param] = fp[param][:] - for key in fp.attrs.keys(): - self[subgroup.format(ifo=ifo)].attrs[key] = fp.attrs[key] - except IOError: - logging.warn("Could not read %s as an HDF file", injection_file) - - def write_command_line(self): - """Writes command line to attributes. - - The command line is written to the file's ``attrs['cmd']``. If this - attribute already exists in the file (this can happen when resuming - from a checkpoint), ``attrs['cmd']`` will be a list storing the current - command line and all previous command lines. - """ - cmd = [" ".join(sys.argv)] - try: - previous = self.attrs["cmd"] - if isinstance(previous, str): - # convert to list - previous = [previous] - elif isinstance(previous, numpy.ndarray): - previous = previous.tolist() - except KeyError: - previous = [] - self.attrs["cmd"] = cmd + previous - - def write_resume_point(self): - """Keeps a list of the number of iterations that were in a file when a - run was resumed from a checkpoint.""" - try: - resume_pts = self.attrs["resume_points"].tolist() - except KeyError: - resume_pts = [] - try: - niterations = self.niterations - except KeyError: - niterations = 0 - resume_pts.append(niterations) - self.attrs["resume_points"] = resume_pts - - def write_random_state(self, group=None, state=None): - """ Writes the state of the random number generator from the file. - - Parameters - ---------- - group : str - Name of group to read random state to. - state : tuple, optional - Specify the random state to write. If None, will use - ``numpy.random.get_state()``. 
- """ - group = self.sampler_group if group is None else group - dataset_name = "/".join([group, "random_state"]) - if state is None: - state = numpy.random.get_state() - s, arr, pos, has_gauss, cached_gauss = state - if group in self: - self[dataset_name][:] = arr - else: - self.create_dataset(dataset_name, arr.shape, fletcher32=True, - dtype=arr.dtype) - self[dataset_name][:] = arr - self[dataset_name].attrs["s"] = s - self[dataset_name].attrs["pos"] = pos - self[dataset_name].attrs["has_gauss"] = has_gauss - self[dataset_name].attrs["cached_gauss"] = cached_gauss - - def get_slice(self, thin_start=None, thin_interval=None, thin_end=None): - """Formats a slice using the given arguments that can be used to - retrieve a thinned array from an InferenceFile. - - Parameters - ---------- - thin_start : {None, int} - The starting index to use. If None, will try to retrieve the - `burn_in_iterations` from the given file. If no - `burn_in_iterations` exists, will default to the start of the - array. - thin_interval : {None, int} - The interval to use. If None, will try to retrieve the acl from the - given file. If no acl attribute exists, will default to 1. - thin_end : {None, int} - The end index to use. If None, will retrieve to the end of the - array. - - Returns - ------- - slice : - The slice needed. - """ - - # default is to skip burn in samples - if thin_start is None: - try: - thin_start = self.burn_in_iterations - # if the sampler hasn't burned in, the burn_in_iterations will - # be the same as the number of iterations, which would result - # in 0 samples. In that case, just use the last one - if thin_start == self.niterations: - thin_start = thin_start - 1 - except KeyError: - pass - - # default is to use stored ACL and accept every i-th sample - if thin_interval is None: - try: - thin_interval = int(numpy.ceil(self.acl)) - except KeyError: - pass - return slice(thin_start, thin_end, thin_interval) - - def copy_metadata(self, other): - """Copies all metadata from this file to the other file. - - Metadata is defined as all data that is not in either the samples or - stats group. - - Parameters - ---------- - other : InferenceFile - An open inference file to write the data to. - """ - logging.info("Copying metadata") - # copy non-samples/stats data - for key in self.keys(): - if key not in [self.samples_group, self.stats_group]: - super(InferenceFile, self).copy(key, other) - # copy attributes - for key in self.attrs.keys(): - other.attrs[key] = self.attrs[key] - - def copy(self, other, parameters=None, parameter_names=None, - posterior_only=False, **kwargs): - """Copies data in this file to another file. - - The samples and stats to copy may be down selected using the given - kwargs. All other data (the "metadata") are copied exactly. - - Parameters - ---------- - other : str or InferenceFile - The file to write to. May be either a string giving a filename, - or an open hdf file. If the former, the file will be opened with - the write attribute (note that if a file already exists with that - name, it will be deleted). - parameters : list of str, optional - List of parameters to copy. If None, will copy all parameters. - parameter_names : dict, optional - Rename one or more parameters to the given name. The dictionary - should map parameter -> parameter name. If None, will just use the - original parameter names. - posterior_only : bool, optional - Write the samples and model stats as flattened arrays, and - set other's posterior_only attribute. 
For example, if this file - has a parameter's samples written to - `{samples_group}/{param}/walker{x}`, then other will have all of - the selected samples from all walkers written to - `{samples_group}/{param}/`. - **kwargs : - All other keyword arguments are passed to `read_samples`. - - Returns - ------- - InferenceFile - The open file handler to other. - """ - if not isinstance(other, h5py.File): - # check that we're not trying to overwrite this file - if other == self.name: - raise IOError("destination is the same as this file") - other = InferenceFile(other, 'w') - # copy metadata over - self.copy_metadata(other) - # update other's posterior attribute - if posterior_only: - other.attrs['posterior_only'] = posterior_only - # select the samples to copy - logging.info("Reading samples to copy") - if parameters is None: - parameters = self.variable_params - # if list of desired parameters is different, rename model params - if set(parameters) != set(self.variable_params): - other.attrs['variable_params'] = parameters - # if only the posterior is desired, we'll flatten the results - if not posterior_only and not self.posterior_only: - kwargs['flatten'] = False - samples = self.read_samples(parameters, **kwargs) - logging.info("Copying {} samples".format(samples.size)) - # if different parameter names are desired, get them from the samples - if parameter_names: - arrs = {pname: samples[p] for p, pname in parameter_names.items()} - arrs.update({p: samples[p] for p in parameters if - p not in parameter_names}) - samples = FieldArray.from_kwargs(**arrs) - other.attrs['variable_params'] = samples.fieldnames - logging.info("Writing samples") - other.samples_parser.write_samples_group(other, self.samples_group, - samples.fieldnames, samples) - # do the same for the model stats - logging.info("Reading stats to copy") - stats = self.read_model_stats(**kwargs) - logging.info("Writing stats") - other.samples_parser.write_samples_group(other, self.stats_group, - stats.fieldnames, stats) - # if any down selection was done, re-set the burn in iterations and - # the acl, and the niterations. - # The last dimension of the samples returned by the sampler should - # be the number of iterations. - if samples.shape[-1] != self.niterations: - other.attrs['acl'] = 1 - other.attrs['burn_in_iterations'] = 0 - other.attrs['niterations'] = samples.shape[-1] - return other - - -def check_integrity(filename): - """Checks the integrity of an InferenceFile. - - Checks done are: - - * can the file open? - * do all of the datasets in the samples group have the same shape? - * can the first and last sample in all of the datasets in the samples - group be read? - - If any of these checks fail, an IOError is raised. - - Parameters - ---------- - filename: str - Name of an InferenceFile to check. - - Raises - ------ - ValueError - If the given file does not exist. - KeyError - If the samples group does not exist. - IOError - If any of the checks fail. 
- """ - # check that the file exists - if not os.path.exists(filename): - raise ValueError("file {} does not exist".format(filename)) - # if the file is corrupted such that it cannot be opened, the next line - # will raise an IOError - with InferenceFile(filename, 'r') as fp: - # check that all datasets in samples have the same shape - parameters = fp[fp.samples_group].keys() - group = fp.samples_group + '/{}' - # use the first parameter as a reference shape - ref_shape = fp[group.format(parameters[0])].shape - if not all(fp[group.format(param)].shape == ref_shape - for param in parameters): - raise IOError("not all datasets in the samples group have the " - "same shape") - # check that we can read the first/last sample - firstidx = tuple([0]*len(ref_shape)) - lastidx = tuple([-1]*len(ref_shape)) - for param in parameters: - fp[group.format(param)][firstidx] - fp[group.format(param)][lastidx] diff --git a/gwin/models/base.py b/gwin/models/base.py index f4c4378..e15dc57 100644 --- a/gwin/models/base.py +++ b/gwin/models/base.py @@ -34,6 +34,7 @@ from pycbc.io import FieldArray from pycbc.workflow import ConfigParser +from gwin.io.base_hdf import write_kwargs_to_hdf_attrs # # ============================================================================= @@ -742,3 +743,16 @@ def from_config(cls, cp, **kwargs): args['sampling_transforms'] = sampling_transforms args.update(kwargs) return cls(**args) + + def write_metadata(self, fp): + """Writes metadata to the given file handler. + + Parameters + ---------- + fp : gwin.io.BaseInferenceFile instance + The inference file to write to. + """ + fp.attrs['model'] = self.name + fp.attrs['variable_params'] = list(self.variable_params) + fp.attrs['sampling_params'] = list(self.sampling_params) + write_kwargs_to_hdf_attrs(fp.attrs, static_params=self.static_params) diff --git a/gwin/models/base_data.py b/gwin/models/base_data.py index 0c2095e..ebb5723 100644 --- a/gwin/models/base_data.py +++ b/gwin/models/base_data.py @@ -150,6 +150,11 @@ def data(self): """Returns the data that was set.""" return self._data + @property + def detectors(self): + """Returns the detectors used.""" + return self._data.keys() + def _transform_params(self, **params): """Adds waveform transforms to parent's ``_transform_params``.""" params = super(BaseDataModel, self)._transform_params(**params) @@ -231,3 +236,14 @@ def from_config(cls, cp, data, delta_f=None, delta_t=None, args['waveform_generator'] = waveform_generator return cls(**args) + + def write_metadata(self, fp): + """Adds data to the metadata that's written. + + Parameters + ---------- + fp : gwin.io.BaseInferenceFile instance + The inference file to write to. 
+ """ + super(BaseDataModel, self).write_metadata(fp) + fp.write_stilde(self.data) diff --git a/gwin/models/gaussian_noise.py b/gwin/models/gaussian_noise.py index a2279de..81dfb4e 100644 --- a/gwin/models/gaussian_noise.py +++ b/gwin/models/gaussian_noise.py @@ -244,6 +244,7 @@ def __init__(self, variable_params, data, waveform_generator, d = data.values()[0] N = len(d) # figure out the kmin, kmax to use + self._f_lower = f_lower kmin, kmax = filter.get_cutoff_indices(f_lower, f_upper, d.delta_f, (N-1)*2) self._kmin = kmin @@ -252,9 +253,12 @@ def __init__(self, variable_params, data, waveform_generator, norm = 4*d.delta_f # we'll store the weight to apply to the inner product if psds is None: + self._psds = None w = Array(numpy.sqrt(norm)*numpy.ones(N)) self._weight = {det: w for det in data} else: + # store a copy of the psds + self._psds = {ifo: d.copy() for (ifo, d) in psds.items()} # temporarily suppress numpy divide by 0 warning numpysettings = numpy.seterr(divide='ignore') self._weight = {det: Array(numpy.sqrt(norm/psds[det])) @@ -432,3 +436,27 @@ def det_optimal_snrsq(self, det): self.loglr # now try returning again return getattr(self._current_stats, '{}_optimal_snrsq'.format(det)) + + def write_metadata(self, fp): + """Adds writing the psds and lognl, since it's a constant. + + The lognl is written to the sample group's ``attrs``. + + Parameters + ---------- + fp : gwin.io.BaseInferenceFile instance + The inference file to write to. + """ + super(GaussianNoise, self).write_metadata(fp) + fp.attrs['f_lower'] = self._f_lower + if self._psds is not None: + fp.write_psd(self._psds) + try: + attrs = fp[fp.samples_group].attrs + except KeyError: + # group doesn't exist, create it + fp.create_group(fp.samples_group) + attrs = fp[fp.samples_group].attrs + attrs['lognl'] = self.lognl + for det in self.detectors: + attrs['{}_lognl'.format(det)] = self.det_lognl(det) diff --git a/gwin/option_utils.py b/gwin/option_utils.py index 5fe539e..47ff79c 100644 --- a/gwin/option_utils.py +++ b/gwin/option_utils.py @@ -191,86 +191,6 @@ def sampler_from_cli(opts, model, pool=None): # # ----------------------------------------------------------------------------- -def validate_checkpoint_files(checkpoint_file, backup_file): - """Checks if the given checkpoint and/or backup files are valid. - - The checkpoint file is considered valid if: - - * it passes all tests run by ``InferenceFile.check_integrity``; - * it has at least one sample written to it (indicating at least one - checkpoint has happened). - - The same applies to the backup file. The backup file must also have the - same number of samples as the checkpoint file, otherwise, the backup is - considered invalid. - - If the checkpoint (backup) file is found to be valid, but the backup - (checkpoint) file is not valid, then the checkpoint (backup) is copied to - the backup (checkpoint). Thus, this function ensures that checkpoint and - backup files are either both valid or both invalid. - - Parameters - ---------- - checkpoint_file : string - Name of the checkpoint file. - backup_file : string - Name of the backup file. - - Returns - ------- - checkpoint_valid : bool - Whether or not the checkpoint (and backup) file may be used for loading - samples. 
- """ - # check if checkpoint file exists and is valid - logging.info("Validating checkpoint and backup files") - try: - check_integrity(checkpoint_file) - checkpoint_valid = True - except (ValueError, KeyError, IOError): - checkpoint_valid = False - # backup file - try: - check_integrity(backup_file) - backup_valid = True - except (ValueError, KeyError, IOError): - backup_valid = False - # check if there are any samples in the file; if not, we'll just start from - # scratch - if checkpoint_valid: - with InferenceFile(checkpoint_file, 'r') as fp: - try: - group = '{}/{}'.format(fp.samples_group, fp.variable_params[0]) - nsamples = fp[group].size - checkpoint_valid = nsamples != 0 - except KeyError: - checkpoint_valid = False - # check if there are any samples in the backup file - if backup_valid: - with InferenceFile(backup_file, 'r') as fp: - try: - group = '{}/{}'.format(fp.samples_group, fp.variable_params[0]) - backup_nsamples = fp[group].size - backup_valid = backup_nsamples != 0 - except KeyError: - backup_valid = False - # check that the checkpoint and backup have the same number of samples; - # if not, assume the checkpoint has the correct number - if checkpoint_valid and backup_valid: - backup_valid = nsamples == backup_nsamples - # decide what to do based on the files' statuses - if checkpoint_valid and not backup_valid: - # copy the checkpoint to the backup - logging.info("Backup invalid; copying checkpoint file") - shutil.copy(checkpoint_file, backup_file) - backup_valid = True - elif backup_valid and not checkpoint_valid: - logging.info("Checkpoint invalid; copying backup file") - # copy the backup to the checkpoint - shutil.copy(backup_file, checkpoint_file) - checkpoint_valid = True - return checkpoint_valid - def add_low_frequency_cutoff_opt(parser): """Adds the low-frequency-cutoff option to the given parser.""" @@ -325,7 +245,6 @@ def data_from_cli(opts): precision="double") # apply gates if not waiting to overwhiten if not opts.gate_overwhitened: - logging.info("Applying gates to strain data") strain_dict = apply_gates_to_td(strain_dict, gates) # get strain time series to use for PSD estimation @@ -350,7 +269,6 @@ def data_from_cli(opts): # FFT strain and save each of the length of the FFT, delta_f, and # low frequency cutoff to a dict - logging.info("FFT strain") stilde_dict = {} length_dict = {} delta_f_dict = {} diff --git a/gwin/sampler/__init__.py b/gwin/sampler/__init__.py index 6154aee..aa7cf3a 100644 --- a/gwin/sampler/__init__.py +++ b/gwin/sampler/__init__.py @@ -17,14 +17,43 @@ This modules provides a list of implemented samplers for parameter estimation. """ -from .kombine import KombineSampler -from .emcee import (EmceeEnsembleSampler, EmceePTSampler) -from .mcmc import MCMCSampler +from __future__ import absolute_import + +from .base import (initial_dist_from_config, create_new_output_file) +# from .kombine import KombineSampler +from .emcee import EmceeEnsembleSampler +# from .emcee_pt import EmceePTSampler +# from .mcmc import MCMCSampler # list of available samplers samplers = {cls.name: cls for cls in ( - KombineSampler, + # KombineSampler, EmceeEnsembleSampler, - EmceePTSampler, - MCMCSampler, + # EmceePTSampler, + # MCMCSampler, )} + + +def load_from_config(cp, model, **kwargs): + """Loads a sampler from the given config file. + + This looks for a name in the section ``[sampler]`` to determine which + sampler class to load. That sampler's ``from_config`` is then called. 
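+
+    For example, a hypothetical ``[sampler]`` section might look like the
+    following (the option names and values besides ``name`` are
+    illustrative)::
+
+        [sampler]
+        name = emcee
+        nwalkers = 200
+        niterations = 1000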
+ + Parameters + ---------- + cp : WorkflowConfigParser + Config parser to read from. + model : gwin.model + Which model to pass to the sampler. + \**kwargs : + All other keyword arguments are passed directly to the sampler's + ``from_config`` file. + + Returns + ------- + sampler : + The initialized sampler. + """ + name = cp.get('sampler', 'name') + return samplers[name].from_config(cp, model, **kwargs) diff --git a/gwin/sampler/base.py b/gwin/sampler/base.py index 3601c5b..41bc2b0 100644 --- a/gwin/sampler/base.py +++ b/gwin/sampler/base.py @@ -22,47 +22,50 @@ # ============================================================================= # """ -This modules provides classes and functions for using different sampler -packages for parameter estimation. +Defines the base sampler class to be inherited by all samplers. """ +from abc import ABCMeta, abstractmethod, abstractproperty +import os import numpy +import shutil from pycbc.io import FieldArray from pycbc.filter import autocorrelation import h5py import logging +from ..io import validate_checkpoint_files # # ============================================================================= # -# Samplers +# Base Sampler definition # # ============================================================================= # -class _BaseSampler(object): - """Base container class for running the inference sampler that will - generate the posterior distributions. + +class BaseSampler(object): + """Base container class for inference samplers. Parameters ---------- model : Model An instance of a model from ``gwin.models``. """ + __metaclass__ = ABCMeta name = None def __init__(self, model): self.model = model - self.lastclear = 0 - @classmethod - def from_cli(cls, opts, model, pool=None, - model_call=None): - """This function create an instance of this sampler from the given - command-line options. + # @classmethod <--uncomment when we move to python 3.3 + @abstractmethod + def from_config(cls, cp, model, nprocesses=1, use_mpi=False, + **kwargs): + """This should initialize the sampler given a config file. """ - raise NotImplementedError("from_cli function not set") + pass @property def variable_params(self): @@ -72,841 +75,196 @@ def variable_params(self): @property def sampling_params(self): - """Returns the sampling args used by the model. + """Returns the sampling params used by the model. """ return self.model.sampling_params @property - def chain(self): - """This function should return the past samples as a - [additional dimensions x] niterations x ndim array, where ndim are the - number of model params, niterations the number of iterations, and - additional dimeionions are any additional dimensions used by the - sampler (e.g, walkers, temperatures). + def static_params(self): + """Returns the model's fixed parameters. """ - return NotImplementedError("chain function not set.") + return self.model.static_params - @property + @abstractproperty def samples(self): - """This function should return the past samples as a [additional - dimensions x] niterations field array, where the fields are union - of the sampling args and the model params. - """ - return NotImplementedError("samples function not set.") + """A dict mapping variable_params to arrays of samples currently + in memory. The dictionary may also contain sampling_params. - @property - def clear_chain(self): - """This function should clear the current chain of samples from memory. + The sample arrays may have any shape, and may or may not be thinned. 
""" - return NotImplementedError("clear chain function not set.") + pass - @property - def niterations(self): - """Get the current number of iterations.""" - return self.chain.shape[-2] + self.lastclear - - @property - def acceptance_fraction(self): - """This function should return the fraction of steps accepted by each - walker as an array. - """ - return NotImplementedError("acceptance_fraction function not set.") - - @property - def lnpost(self): - """This function should return the natural logarithm of the likelihood - function used by the sampler as an - [additional dimensions] x niterations array. - """ - return NotImplementedError("lnpost function not set.") - - @property + @abstractproperty def model_stats(self): - """This function should return the prior and likelihood ratio of - samples as an [additional dimensions] x niterations - array. If the model did not return that info to the - sampler, it should return None. - """ - return NotImplementedError("model stats not set") + """A dict mapping model's metadata fields to arrays of values for + each sample in ``raw_samples``. - def burn_in(self, initial_values): - """This function should burn in the sampler. + The arrays may have any shape, and may or may not be thinned. """ - raise NotImplementedError("This sampler has no burn_in function.") + pass - def run(self, niterations): + @abstractmethod + def run(self): """This function should run the sampler. - """ - raise NotImplementedError("run function not set.") - @classmethod - def calculate_logevidence(cls, fp): - """This function should calculate the log evidence and its error using - the results in the given file. If the sampler does not support evidence - calculation, then this will raise a NotImplementedError. + Any checkpointing should be done internally in this function. """ - raise NotImplementedError("this sampler does not support evidence " - "calculation") + pass - # write and read functions - def write_metadata(self, fp, **kwargs): - """Writes metadata about this sampler to the given file. Metadata is - written to the file's `attrs`. + @abstractproperty + def io(self): + """A class that inherits from ``BaseInferenceFile`` to handle IO with + an hdf file. - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - **kwargs : - All keyword arguments are saved as separate arguments in the - file attrs. If any keyword argument is a dictionary, the keyword - will point to the list of keys in the the file's ``attrs``. Each - key is then stored as a separate attr with its corresponding value. - """ - fp.attrs['sampler'] = self.name - fp.attrs['model'] = self.model.name - fp.attrs['variable_params'] = list(self.variable_params) - fp.attrs['sampling_params'] = list(self.sampling_params) - fp.attrs["niterations"] = self.niterations - try: - fp.attrs["lognl"] = self.model.lognl - except AttributeError: - pass - for arg, val in kwargs.items(): - if val is None: - val = str(None) - if isinstance(val, dict): - fp.attrs[arg] = val.keys() - for key, item in val.items(): - if item is None: - item = str(None) - fp.attrs[key] = item - else: - fp.attrs[arg] = val - - @staticmethod - def write_logevidence(fp, lnz, dlnz): - """Writes the given log evidence and its error to the given file. - Results are saved to the file's 'log_evidence' and 'dlog_evidence' - attributes. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - lnz : float - The log of the evidence. 
- dlnz : float - The error in the estimate of the log evidence. + This should be a class, not an instance of class, so that the sampler + can initialize it when needed. """ - fp.attrs['log_evidence'] = lnz - fp.attrs['dlog_evidence'] = dlnz + pass - @staticmethod - def write_burn_in_iterations(fp, burn_in_iterations, is_burned_in=None): - """Writes the burn in iterations to the given file. + @abstractmethod + def set_initial_conditions(self, initial_distribution=None, + samples_file=None): + """Sets up the starting point for the sampler. - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - burn_in_iterations : array - Array of values giving the iteration of the burn in of each walker. - is_burned_in : array - Array of booleans indicating which chains are burned in. + Should also set the sampler's random state. """ - try: - fp['burn_in_iterations'][:] = burn_in_iterations - except KeyError: - fp['burn_in_iterations'] = burn_in_iterations - fp.attrs['burn_in_iterations'] = burn_in_iterations.max() - if is_burned_in is not None: - try: - fp['is_burned_in'][:] = is_burned_in - except KeyError: - fp['is_burned_in'] = is_burned_in - fp.attrs['is_burned_in'] = is_burned_in.all() - - @staticmethod - def write_state(fp): - """Saves the state of the sampler in a file. - """ - fp.write_random_state() + pass - @staticmethod - def set_state_from_file(fp): - """Sets the state of the sampler back to the instance saved in a file. + @abstractmethod + def checkpoint(self): + """The sampler must have a checkpoint method for dumping raw samples + and stats to the file type defined by ``io``. """ - numpy.random.set_state(fp.read_random_state()) + pass + @abstractmethod + def finalize(self): + """Do any finalization to the samples file before exiting.""" + pass -class BaseMCMCSampler(_BaseSampler): - """This class is used to construct the MCMC sampler from the kombine-like - packages. + def setup_output(self, output_file, force=False, injection_file=None): + """Sets up the sampler's checkpoint and output files. - Parameters - ---------- - sampler : sampler instance - An instance of an MCMC sampler similar to kombine or emcee. - model : model class - A model from ``gwin.models``. + The checkpoint file has the same name as the output file, but with + ``.checkpoint`` appended to the name. A backup file will also be + created. - Attributes - ---------- - sampler : - The MCMC sampler instance used. - p0 : nwalkers x ndim array - The initial position of the walkers. Set by using set_p0. If not set - yet, a ValueError is raised when the attribute is accessed. - pos : {None, array} - An array of the current walker positions. - """ - name = None - - def __init__(self, sampler, model): - self._sampler = sampler - self._pos = None - self._p0 = None - self._currentblob = None - self._nwalkers = None - self.lastclear = 0 - self.burn_in_iterations = None - # initialize - super(BaseMCMCSampler, self).__init__(model) - - @property - def sampler(self): - return self._sampler - - @property - def pos(self): - return self._pos - - def set_p0(self, samples_file=None, prior=None): - """Sets the initial position of the walkers. + If the output file already exists, an ``OSError`` will be raised. + This can be overridden by setting ``force`` to ``True``. Parameters ---------- - samples_file : InferenceFile, optional - If provided, use the last iteration in the given file for the - starting positions. 
- prior : JointDistribution, optional - Use the given prior to set the initial positions rather than - ``model``'s prior. - - Returns - ------- - p0 : array - An nwalkers x ndim array of the initial positions that were set. - """ - # create a (nwalker, ndim) array for initial positions - nwalkers = self.nwalkers - ndim = len(self.variable_params) - p0 = numpy.ones((nwalkers, ndim)) - # if samples are given then use those as initial positions - if samples_file is not None: - samples = self.read_samples(samples_file, self.variable_params, - iteration=-1) - # transform to sampling parameter space - samples = self.model.apply_sampling_transforms(samples) - # draw random samples if samples are not provided - else: - samples = self.model.prior_rvs(size=nwalkers, prior=prior) - # convert to 2D array - for i, param in enumerate(self.sampling_params): - p0[:, i] = samples[param] - self._p0 = p0 - return p0 - - @property - def p0(self): - if self._p0 is None: - raise ValueError("initial positions not set; run set_p0") - return self._p0 - - @property - def nwalkers(self): - """Get the number of walkers.""" - return self._nwalkers - - @property - def acceptance_fraction(self): - """Get the fraction of steps accepted by each walker as an array. - """ - return self._sampler.acceptance_fraction - - @property - def samples(self): - """Returns the samples in the chain as a FieldArray. - - If the sampling args are not the same as the model params, the - returned samples will have both the sampling and the model params. - - The returned FieldArray has dimension [additional dimensions x] - nwalkers x niterations. - """ - # chain is a [additional dimensions x] niterations x ndim array - samples = self.chain - sampling_params = self.sampling_params - # convert to dictionary to apply boundary conditions - samples = {param: samples[..., ii] for - ii, param in enumerate(sampling_params)} - samples = self.model.prior_distribution.apply_boundary_conditions( - **samples) - # now convert to field array - samples = FieldArray.from_arrays([samples[param] - for param in sampling_params], - names=sampling_params) - # apply transforms to go to model params space - if self.model.sampling_transforms is not None: - samples = self.model.sampling_transforms.apply(samples, - inverse=True) - return samples - - @property - def model_stats(self): - """Returns the model stats as a FieldArray, with field names - corresponding to the type of data returned by the model. - The returned array has shape nwalkers x niterations. If no additional - stats were returned to the sampler by the model, returns - None. - """ - stats = numpy.array(self._sampler.blobs) - if stats.size == 0: - return None - # we'll force arrays to float; this way, if there are `None`s in the - # blobs, they will be changed to `nan`s - arrays = {field: stats[..., fi].astype(float) - for fi, field in - enumerate(self.model.default_stats)} - return FieldArray.from_kwargs(**arrays).transpose() - - # write and read functions - def write_metadata(self, fp, **kwargs): - """Writes metadata about this sampler to the given file. Metadata is - written to the file's `attrs`. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - **kwargs : - All keyword args are written to the file's ``attrs``. - """ - super(BaseMCMCSampler, self).write_metadata(fp, **kwargs) - # add info about walkers, burn in - fp.attrs["nwalkers"] = self.nwalkers + sampler : sampler instance + Sampler + output_file : str + Name of the output file. 
+ force : bool, optional + If the output file already exists, overwrite it. + injection_file : str, optional + If an injection was added to the data, write its information. + """ + # check for backup file(s) + checkpoint_file = output_file + '.checkpoint' + backup_file = output_file + '.bkup' + # check if we have a good checkpoint and/or backup file + logging.info("Looking for checkpoint file") + checkpoint_valid = validate_checkpoint_files(checkpoint_file, + backup_file) + # Create a new file if the checkpoint doesn't exist, or if it is + # corrupted + self.new_checkpoint = False # keeps track if this is a new file or not + if not checkpoint_valid: + logging.info("Checkpoint not found or not valid") + create_new_output_file(self, checkpoint_file, force=force, + injection_file=injection_file) + # now the checkpoint is valid + self.new_checkpoint = True + # copy to backup + shutil.copy(checkpoint_file, backup_file) + # write the command line, startup + for fn in [checkpoint_file, backup_file]: + with self.io(fn, "a") as fp: + fp.write_command_line() + fp.write_resume_point() + # store + self.checkpoint_file = checkpoint_file + self.backup_file = backup_file + self.checkpoint_valid = checkpoint_valid - @staticmethod - def write_samples_group(fp, samples_group, parameters, samples, - start_iteration=None, max_iterations=None): - """Writes samples to the given file. - Results are written to: - - ``fp[samples_group/{vararg}]``, - - where ``{vararg}`` is the name of a model params. The samples are - written as an ``nwalkers x niterations`` array. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - samples_group : str - Name of samples group to write. - parameters : list - The parameters to write to the file. - samples : FieldArray - The samples to write. Should be a FieldArray with fields containing - the samples to write and shape nwalkers x niterations. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - """ - nwalkers, niterations = samples.shape - if max_iterations is not None and max_iterations < niterations: - raise IndexError("The provided max size is less than the " - "number of iterations") - group = samples_group + '/{name}' - # loop over number of dimensions - for param in parameters: - dataset_name = group.format(name=param) - istart = start_iteration - try: - fp_niterations = fp[dataset_name].shape[-1] - if istart is None: - istart = fp_niterations - istop = istart + niterations - if istop > fp_niterations: - # resize the dataset - fp[dataset_name].resize(istop, axis=1) - except KeyError: - # dataset doesn't exist yet - if istart is not None and istart != 0: - raise ValueError("non-zero start_iteration provided, " - "but dataset doesn't exist yet") - istart = 0 - istop = istart + niterations - fp.create_dataset(dataset_name, (nwalkers, istop), - maxshape=(nwalkers, max_iterations), - dtype=float, fletcher32=True) - fp[dataset_name][:, istart:istop] = samples[param] - - def write_chain(self, fp, start_iteration=None, max_iterations=None): - """Writes the samples from the current chain to the given file. 
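Concretely (output name illustrative), ``setup_output`` arranges the run's files around the requested output path, delegating creation to ``create_new_output_file`` defined below:

    sampler.setup_output("samples.hdf", force=True)
    # after this call:
    #   sampler.checkpoint_file == "samples.hdf.checkpoint"  (written during the run)
    #   sampler.backup_file     == "samples.hdf.bkup"        (kept as a synchronized copy)
    #   sampler.new_checkpoint is True if no valid checkpoint existed and a
    #   new file had to be created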
- - Results are written to: - - `fp[fp.samples_group/{field}/(temp{k}/)walker{i}]`, - - where `{i}` is the index of a walker, `{field}` is the name of each - field returned by ``model_stats``, and, if the sampler is - multitempered, `{k}` is the temperature. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - samples_group : str - Name of samples group to write. - """ - # samples is a nwalkers x niterations field array - samples = self.samples - parameters = self.variable_params - samples_group = fp.samples_group - # write data - self.write_samples_group(fp, samples_group, parameters, samples, - start_iteration=start_iteration, - max_iterations=max_iterations) - - def write_model_stats(self, fp, start_iteration=None, - max_iterations=None): - """Writes the ``model_stats`` to the given file. - - Results are written to: - - `fp[fp.stats_group/{field}/(temp{k}/)walker{i}]`, - - where `{i}` is the index of a walker, `{field}` is the name of each - field returned by ``model_stats``, and, if the sampler is - multitempered, `{k}` is the temperature. If nothing is returned by - ``model_stats``, this does nothing. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - - Returns - ------- - stats : {FieldArray, None} - The stats that were written, as a FieldArray. If there were no - stats, returns None. - """ - samples = self.model_stats - if samples is None: - return None - # ensure the prior is in the model params parameter space - if 'logjacobian' in samples.fieldnames: - samples['logprior'] -= samples['logjacobian'] - parameters = samples.fieldnames - samples_group = fp.stats_group - # write data - self.write_samples_group(fp, samples_group, parameters, samples, - start_iteration=start_iteration, - max_iterations=max_iterations) - return samples - - def write_acceptance_fraction(self, fp): - """Write acceptance_fraction data to file. Results are written to - `fp[acceptance_fraction]`. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - """ - dataset_name = "acceptance_fraction" - try: - fp[dataset_name][:] = self.acceptance_fraction - except KeyError: - # dataset doesn't exist yet, create it - fp[dataset_name] = self.acceptance_fraction - - def write_results(self, fp, start_iteration=None, - max_iterations=None, **metadata): - """Writes metadata, samples, model stats, and acceptance fraction - to the given file. Also computes and writes the autocorrleation lengths - of the chains. See the various write function for details. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. 
- start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the acceptance fraction has not previously been - written to the file. The default (None) is to use the maximum size - allowed by h5py. - \**metadata : - All other keyword arguments are passed to ``write_metadata``. - """ - self.write_metadata(fp, **metadata) - self.write_chain(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_model_stats(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_acceptance_fraction(fp) - self.write_state(fp) - - @staticmethod - def _read_fields(fp, fields_group, fields, array_class, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, walkers=None, flatten=True): - """Base function for reading samples and model stats. See - `read_samples` and `read_model_stats` for details. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - fields_group : str - The name of the group to retrieve the desired fields. - fields : list - The list of field names to retrieve. Must be names of groups in - `fp[fields_group/]`. - array_class : FieldArray or similar - The type of array to return. Must have a `from_kwargs` attribute. - - For other details on keyword arguments, see `read_samples` and - `read_model_stats`. - - Returns - ------- - array_class - An instance of the given array class populated with values - retrieved from the fields. - """ - # walkers to load - if walkers is not None: - widx = numpy.zeros(fp.nwalkers, dtype=bool) - widx[walkers] = True - else: - widx = slice(0, None) - # get the slice to use - if iteration is not None: - get_index = iteration - else: - if thin_end is None: - # use the number of current iterations - thin_end = fp.niterations - get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval) - # load - arrays = {} - group = fields_group + '/{name}' - for name in fields: - arr = fp[group.format(name=name)][widx, get_index] - if flatten: - arr = arr.flatten() - arrays[name] = arr - return array_class.from_kwargs(**arrays) - - @classmethod - def read_samples(cls, fp, parameters, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, walkers=None, flatten=True, - samples_group=None, array_class=None): - """Reads samples for the given parameter(s). - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - parameters : (list of) strings - The parameter(s) to retrieve. A parameter can be the name of any - field in `fp[fp.samples_group]`, a virtual field or method of - `FieldArray` (as long as the file contains the necessary fields - to derive the virtual field or method), and/or a function of - these. - thin_start : int - Index of the sample to begin returning samples. Default is to read - samples after burn in. To start from the beginning set thin_start - to 0. - thin_interval : int - Interval to accept every i-th sample. Default is to use the - `fp.acl`. If `fp.acl` is not set, then use all samples - (set thin_interval to 1). - thin_end : int - Index of the last sample to read. If not given then - `fp.niterations` is used. - iteration : int - Get a single iteration. If provided, will override the - `thin_{start/interval/end}` arguments. 
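The class-method readers removed here are replaced by the file-handler classes in ``gwin.io``; a hedged sketch of the new read pattern (file and parameter names are illustrative):

    from gwin.io import EmceeFile

    with EmceeFile("samples.hdf.checkpoint", "r") as fp:
        # samples of all variable parameters from the last iteration
        last = fp.read_samples(fp.variable_params, iteration=-1)
        # unthinned per-walker chain for a single parameter
        raw = fp.read_raw_samples("mass1", thin_interval=1,
                                  flatten=False)["mass1"]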
- walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - flatten : {True, bool} - The returned array will be one dimensional, with all desired - samples from all desired walkers concatenated together. If False, - the returned array will have dimension requested walkers - x requested iterations. - samples_group : {None, str} - The group in `fp` from which to retrieve the parameter fields. If - None, searches in `fp.samples_group`. - array_class : {None, array class} - The type of array to return. The class must have a `from_kwargs` - class method and a `parse_parameters` method. If None, will return - a FieldArray. - - Returns - ------- - array_class - Samples for the given parameters, as an instance of a the given - `array_class` (`FieldArray` if `array_class` is None). - """ - # get the group to load from - if samples_group is None: - samples_group = fp.samples_group - # get the type of array class to use - if array_class is None: - array_class = FieldArray - # get the names of fields needed for the given parameters - possible_fields = fp[samples_group].keys() - loadfields = array_class.parse_parameters(parameters, possible_fields) - return cls._read_fields(fp, samples_group, loadfields, array_class, - thin_start=thin_start, - thin_interval=thin_interval, thin_end=thin_end, - iteration=iteration, walkers=walkers, - flatten=flatten) - - @classmethod - def n_independent_samples(cls, fp): - """Returns the number of independent samples stored in a file. - - The number of independent samples are counted starting from after - burn-in. If the sampler hasn't burned in yet, then 0 is returned. +# +# ============================================================================= +# +# Convenience functions +# +# ============================================================================= +# - Parameters - ----------- - fp : InferenceFile - An open file handler to read. - - Returns - ------- - int - The number of independent samples. - """ - # check if burned in - if not fp.is_burned_in: - return 0 - # we'll just read a single parameter from the file - samples = cls.read_samples(fp, fp.variable_params[0]) - return samples.size +def create_new_output_file(sampler, filename, force=False, injection_file=None, + **kwargs): + """Creates a new output file. - @staticmethod - def read_acceptance_fraction(fp, walkers=None): - """Reads the acceptance fraction from the given file. + If the output file already exists, an ``OSError`` will be raised. This can + be overridden by setting ``force`` to ``True``. - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - - Returns - ------- - array - Array of acceptance fractions with shape (requested walkers,). - """ - group = 'acceptance_fraction' - if walkers is None: - wmask = numpy.ones(fp.nwalkers, dtype=bool) + Parameters + ---------- + sampler : sampler instance + Sampler + filename : str + Name of the file to create. + force : bool, optional + Create the file even if it already exists. Default is False. + injection_file : str, optional + If an injection was added to the data, write its information. + \**kwargs : + All other keyword arguments are passed through to the file's + ``write_metadata`` function. 
+ """ + if os.path.exists(filename): + if force: + os.remove(filename) else: - wmask = numpy.zeros(fp.nwalkers, dtype=bool) - wmask[walkers] = True - return fp[group][wmask] + raise OSError("output-file already exists; use force if you " + "wish to overwrite it.") + logging.info("Creating file {}".format(filename)) + with sampler.io(filename, "w") as fp: + # save the sampler's metadata + fp.write_sampler_metadata(sampler) + # save injection parameters + if injection_file is not None: + logging.info("Writing injection file to output") + # just use the first one + fp.write_injections(injection_file) - @classmethod - def compute_acfs(cls, fp, start_index=None, end_index=None, - per_walker=False, walkers=None, parameters=None): - """Computes the autocorrleation function of the model params in the - given file. - By default, parameter values are averaged over all walkers at each - iteration. The ACF is then calculated over the averaged chain. An - ACF per-walker will be returned instead if ``per_walker=True``. +def initial_dist_from_config(cp): + """Loads a distribution for the sampler start from the given config file. - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - per_walker : optional, bool - Return the ACF for each walker separately. Default is False. - walkers : optional, int or array - Calculate the ACF using only the given walkers. If None (the - default) all walkers will be used. - parameters : optional, str or array - Calculate the ACF for only the given parameters. If None (the - default) will calculate the ACF for all of the model params. - - Returns - ------- - FieldArray - A ``FieldArray`` of the ACF vs iteration for each parameter. If - `per-walker` is True, the FieldArray will have shape - ``nwalkers x niterations``. - """ - acfs = {} - if parameters is None: - parameters = fp.variable_params - if isinstance(parameters, str) or isinstance(parameters, unicode): - parameters = [parameters] - for param in parameters: - if per_walker: - # just call myself with a single walker - if walkers is None: - walkers = numpy.arange(fp.nwalkers) - arrays = [cls.compute_acfs(fp, start_index=start_index, - end_index=end_index, - per_walker=False, walkers=ii, - parameters=param)[param] - for ii in walkers] - acfs[param] = numpy.vstack(arrays) - else: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, thin_end=end_index, - walkers=walkers, - flatten=False)[param] - samples = samples.mean(axis=0) - acfs[param] = autocorrelation.calculate_acf(samples).numpy() - return FieldArray.from_kwargs(**acfs) - - @classmethod - def compute_acls(cls, fp, start_index=None, end_index=None): - """Computes the autocorrleation length for all model params in the - given file. - - Parameter values are averaged over all walkers at each iteration. - The ACL is then calculated over the averaged chain. If the returned ACL - is `inf`, will default to the number of current iterations. + A distribution will only be loaded if the config file has a [initial-*] + section(s). - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. 
If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - - Returns - ------- - dict - A dictionary giving the ACL for each parameter. - """ - acls = {} - for param in fp.variable_params: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, thin_end=end_index, - flatten=False)[param] - samples = samples.mean(axis=0) - acl = autocorrelation.calculate_acl(samples) - if numpy.isinf(acl): - acl = samples.size - acls[param] = acl - return acls - - @staticmethod - def write_acls(fp, acls): - """Writes the given autocorrelation lengths to the given file. - - The ACL of each parameter is saved to ``fp['acls/{param}']``. - The maximum over all the parameters is saved to the file's 'acl' - attribute. - - Parameters - ---------- - fp : InferenceFile - An open file handler to write the samples to. - acls : dict - A dictionary of ACLs keyed by the parameter. - - Returns - ------- - ACL - The maximum of the acls that was written to the file. - """ - group = 'acls/{}' - # write the individual acls - for param in acls: - try: - # we need to use the write_direct function because it's - # apparently the only way to update scalars in h5py - fp[group.format(param)].write_direct(numpy.array(acls[param])) - except KeyError: - # dataset doesn't exist yet - fp[group.format(param)] = acls[param] - # write the maximum over all params - fp.attrs['acl'] = numpy.array(acls.values()).max() - return fp.attrs['acl'] - - @staticmethod - def read_acls(fp): - """Reads the acls of all the parameters in the given file. - - Parameters - ---------- - fp : InferenceFile - An open file handler to read the acls from. - - Returns - ------- - dict - A dictionary of the ACLs, keyed by the parameter name. - """ - group = fp['acls'] - return {param: group[param].value for param in group.keys()} + Parameters + ---------- + cp : Config parser + The config parser to try to load from. + + Returns + ------- + JointDistribution or None : + The initial distribution. If no [initial-*] section found in the + config file, will just return None. + """ + if len(cp.get_subsections("initial")): + logging.info("Using a different distribution for the starting points " + "than the prior.") + initial_dists = distributions.read_distributions_from_config( + cp, section="initial") + constraints = distributions.read_constraints_from_config( + cp, constraint_section="initial_constraint") + init_dist = distributions.JointDistribution( + sampler.variable_params, *initial_dists, + **{"constraints": constraints}) + else: + init_dist = None + return init_dist diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py new file mode 100644 index 0000000..d5afbc0 --- /dev/null +++ b/gwin/sampler/base_mcmc.py @@ -0,0 +1,564 @@ +# Copyright (C) 2016 Christopher M. Biwer, Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. 
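For example, ``initial_dist_from_config`` above would pick up a section such as the following (parameter name and bounds illustrative, following pycbc's distribution-from-config option syntax), and the walkers' starting positions would then be drawn from it instead of from the prior. Note that, as written, the function refers to ``sampler.variable_params``, which is not defined in its scope, so the variable parameters still have to be supplied from elsewhere.

    [initial-mass1]
    name = uniform
    min-mass1 = 10
    max-mass1 = 80

    init_dist = initial_dist_from_config(cp)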
+# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +"""Provides constructor classes and convenience functions for MCMC samplers.""" + +from __future__ import absolute_import + +from abc import (ABCMeta, abstractmethod, abstractproperty) +import logging +import numpy +from pycbc.filter import autocorrelation + +from ..io import validate_checkpoint_files + +# +# ============================================================================= +# +# Convenience functions +# +# ============================================================================= +# + + +def raw_samples_to_dict(sampler, raw_samples): + """Convenience function for converting ND array to a dict of samples. + + The samples are assumed to have dimension + ``[sampler.base_shape x] niterations x len(sampler.sampling_params)``. + + Parameters + ---------- + sampler : sampler instance + An instance of an MCMC sampler. + raw_samples : array + The array of samples to convert. + + Returns + ------- + dict : + A dictionary mapping the raw samples to the variable params. If the + sampling params are not the same as the variable params, they will + also be included. Each array will have shape + ``[sampler.base_shape x] niterations``. + """ + sampling_params = sampler.sampling_params + # convert to dictionary + samples = {param: raw_samples[..., ii] for + ii, param in enumerate(sampling_params)} + # apply boundary conditions + samples = sampler.model.prior_distribution.apply_boundary_conditions( + **samples) + # apply transforms to go to model's variable params space + if sampler.model.sampling_transforms is not None: + samples = sampler.model.sampling_transforms.apply( + samples, inverse=True) + return samples + + +def raw_stats_to_dict(sampler, raw_stats): + """Converts an ND array of model stats to a dict. + + The ``raw_stats`` may either be a numpy array or a list. If the + former, the stats are assumed to have shape + ``[sampler.base_shape x] niterations x nstats, where nstats are the number + of stats returned by ``sampler.model.default_stats``. If the latter, the + list is cast to an array that is assumed to be the same shape as if an + array was given. + + Parameters + ---------- + sampler : sampler instance + An instance of an MCMC sampler. + raw_stats : array or list + The stats to convert. + + Returns + ------- + dict : + A dictionary mapping the model's ``default_stats`` to arrays of values. + Each array will have shape ``[sampler.base_shape x] niterations``. + """ + if not isinstance(raw_stats, numpy.ndarray): + # Assume list. Since the model returns a tuple of values, this should + # be a [sampler.base_shape x] x niterations list of tuples. We can + # therefore immediately convert this to a ND array. + raw_stats = numpy.array(raw_stats) + return {stat: raw_stats[..., ii] + for (ii, stat) in enumerate(sampler.model.default_stats)} + +# +# ============================================================================= +# +# BaseMCMC definition +# +# ============================================================================= +# + + +class BaseMCMC(object): + """This class provides methods common to MCMCs. + + It is not a sampler class itself. 
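To make the two helper functions above concrete (shapes, parameter names, and stat names are illustrative): for an ensemble sampler with 200 walkers, no temperatures, and sampling parameters ``mass1``, ``mass2``:

    # raw_samples from the underlying sampler has shape (200, niterations, 2)
    samples = raw_samples_to_dict(sampler, raw_samples)
    samples['mass1'].shape        # -> (200, niterations)

    # raw_stats may be a (200, niterations, nstats) array, or the equivalent
    # nested list of tuples returned by the model
    stats = raw_stats_to_dict(sampler, raw_stats)
    stats['loglikelihood'].shape  # -> (200, niterations); keys are the
                                  # model's default_stats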
Sampler classes can inherit from this + along with ``BaseSampler``. + + Attributes + ---------- + p0 : dict + A dictionary of the initial position of the walkers. Set by using + ``set_p0``. If not set yet, a ``ValueError`` is raised when the + attribute is accessed. + pos : dict + A dictionary of the current walker positions. If the sampler hasn't + been run yet, returns p0. + """ + __metaclass__ = ABCMeta + + _lastclear = None # the iteration when samples were cleared from memory + _itercounter = None # the number of iterations since the last clear + _pos = None + _p0 = None + _nwalkers = None + _burn_in = None + _checkpoint_interval = None + _target_niterations = None + _target_eff_nsamples = None + + @abstractproperty + def base_shape(self): + """What shape the sampler's samples arrays are in, excluding + the iterations dimension. + + For example, if a sampler uses 20 walkers and 3 temperatures, this + would be ``(3, 20)``. If a sampler only uses a single walker and no + temperatures this would be ``()``. + """ + pass + + @property + def nwalkers(self): + """Get the number of walkers.""" + if self._nwalkers is None: + raise ValueError("number of walkers not set") + return self._nwalkers + + @property + def niterations(self): + """Get the current number of iterations.""" + itercounter = self._itercounter + if itercounter is None: + itercounter = 0 + lastclear = self._lastclear + if lastclear is None: + lastclear = 0 + return itercounter + lastclear + + @property + def checkpoint_interval(self): + """The number of iterations to do between checkpoints.""" + return self._checkpoint_interval + + @property + def target_niterations(self): + """The number of iterations the sampler should run for.""" + return self._target_niterations + + @property + def target_eff_nsamples(self): + """The target number of effective samples the sampler should get.""" + return self._target_eff_nsamples + + def set_target(self, niterations=None, eff_nsamples=None): + """Sets the target niterations/nsamples for the sampler. + + One or the other must be provided, not both. + """ + if niterations is None and eff_nsamples is None: + raise ValueError("Must provide a target niterations or " + "eff_nsamples") + if niterations is not None and eff_nsamples is not None: + raise ValueError("Must provide a target niterations or " + "eff_nsamples, not both") + self._target_niterations = int(niterations) \ + if niterations is not None else None + self._target_eff_nsamples = int(eff_nsamples) \ + if eff_nsamples is not None else None + + @abstractmethod + def clear_samples(self): + """A method to clear samples from memory.""" + pass + + @property + def pos(self): + pos = self._pos + if pos is None: + return self.p0 + # convert to dict + pos = {param: self._pos[..., k] + for (k, param) in enumerate(self.sampling_params)} + return pos + + @property + def p0(self): + """The starting position of the walkers in the sampling param space. + + The returned object is a dict mapping the sampling parameters to the + values. + """ + if self._p0 is None: + raise ValueError("initial positions not set; run set_p0") + # convert to dict + p0 = {param: self._p0[..., k] + for (k, param) in enumerate(self.sampling_params)} + return p0 + + def set_p0(self, samples_file=None, prior=None): + """Sets the initial position of the walkers. + + Parameters + ---------- + samples_file : InferenceFile, optional + If provided, use the last iteration in the given file for the + starting positions. 
+ prior : JointDistribution, optional + Use the given prior to set the initial positions rather than + ``model``'s prior. + + Returns + ------- + p0 : dict + A dictionary maping sampling params to the starting positions. + """ + # if samples are given then use those as initial positions + if samples_file is not None: + with self.io(samples_file, 'r') as fp: + samples = fp.read_samples(self.variable_params, + iteration=-1) + # make sure we have the same shape + assert samples.shape == self.base_shape, ( + "samples in file {} have shape {}, but I have shape {}". + format(samples_file, samples.shape, self.base_shape)) + # transform to sampling parameter space + if self.model.sampling_transforms is not None: + samples = self.model.sampling_transforms.apply(samples) + # draw random samples if samples are not provided + else: + nsamples = numpy.prod(self.base_shape) + samples = self.model.prior_rvs(size=nsamples, prior=prior).reshape( + self.base_shape) + # store as ND array with shape [base_shape] x nparams + ndim = len(self.variable_params) + p0 = numpy.ones(list(self.base_shape)+[ndim]) + for i, param in enumerate(self.sampling_params): + p0[..., i] = samples[param] + self._p0 = p0 + return self.p0 + + def set_initial_conditions(self, initial_distribution=None, + samples_file=None): + """Sets the initial starting point for the MCMC. + + If a starting samples file is provided, will also load the random + state from it. + """ + self.set_p0(samples_file=samples_file, prior=initial_distribution) + # if a samples file was provided, use it to set the state of the + # sampler + if samples_file is not None: + self.set_state_from_file(samples_file) + + @abstractmethod + def set_state_from_file(self, filename): + """Sets the state of the sampler to the instance saved in a file. + """ + pass + + def run(self): + """Runs the sampler.""" + if self.target_eff_nsamples and self.checkpoint_interval is None: + raise ValueError("A checkpoint interval must be set if " + "targetting an effective number of samples") + # get the starting number of samples: + # "nsamples" keeps track of the number of samples we've obtained (if + # target_eff_nsamples is not None, this is the effective number of + # samples; otherwise, this is the total number of samples). 
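Putting the ``BaseMCMC`` pieces together, a driver script would use these methods roughly as follows (a hedged sketch, not the gwin executable's actual code; ``burn_in_tests`` and ``init_dist`` are hypothetical objects):

    sampler.set_target(eff_nsamples=5000)  # requires a checkpoint interval, since
                                           # the effective count is re-evaluated
                                           # at each checkpoint
    sampler.set_burn_in(burn_in_tests)     # optional, e.g. an MCMCBurnInTests instance
    sampler.setup_output("samples.hdf", force=True)
    sampler.set_initial_conditions(initial_distribution=init_dist)
    sampler.run()      # loops run_mcmc(interval) -> checkpoint() until the target is met
    sampler.finalize()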
+ # _lastclear is the number of iterations that the file already + # contains (either due to sampler burn-in, or a previous checkpoint) + if self.new_checkpoint: + self._lastclear = 0 + else: + with self.io(self.checkpoint_file, "r") as fp: + self._lastclear = fp.niterations + if self.target_eff_nsamples is not None: + target_nsamples = self.target_eff_nsamples + with self.io(self.checkpoint_file, "r") as fp: + nsamples = fp.effective_nsamples + elif self.target_niterations is not None: + # the number of samples is the number of iterations times the + # number of walkers + target_nsamples = self.nwalkers * self.target_niterations + nsamples = self._lastclear * self.nwalkers + else: + raise ValueError("must set either target_eff_nsamples or " + "target_niterations; see set_target") + self._itercounter = 0 + # figure out the interval to use + iterinterval = self.checkpoint_interval + if iterinterval is None: + iterinterval = self.target_niterations + # run sampler until we have the desired number of samples + while nsamples < target_nsamples: + # adjust the interval if we would go past the number of iterations + if self.target_niterations is not None and ( + self.niterations + iterinterval > self.target_niterations): + iterinterval = self.target_niterations - self.niterations + # run sampler and set initial values to None so that sampler + # picks up from where it left off next call + logging.info("Running sampler for {} to {} iterations".format( + self.niterations, self.niterations + iterinterval)) + # run the underlying sampler for the desired interval + self.run_mcmc(iterinterval) + # update the itercounter + self._itercounter = self._itercounter + iterinterval + # dump the current results + self.checkpoint() + # update nsamples for next loop + if self.target_eff_nsamples is not None: + nsamples = self.effective_nsamples + logging.info("Have {} effective samples post burn in".format( + nsamples)) + else: + nsamples += iterinterval * self.nwalkers + + @property + def burn_in(self): + """The class for doing burn-in tests (if specified).""" + return self._burn_in + + def set_burn_in(self, burn_in): + """Sets the object to use for doing burn-in tests.""" + self._burn_in = burn_in + + @property + def effective_nsamples(self): + """The effective number of samples post burn-in that the sampler has + acquired so far.""" + try: + acl = numpy.array(self.acls.values()).max() + except (AttributeError, TypeError): + acl = numpy.inf + if self.burn_in is None: + nperwalker = max(int(self.niterations // acl), 1) + elif self.burn_in.is_burned_in: + nperwalker = int( + (self.niterations - self.burn_in.burn_in_iteration) // acl) + # after burn in, we always have atleast 1 sample per walker + nperwalker = max(nperwalker, 1) + else: + nperwalker = 0 + return self.nwalkers * nperwalker + + @abstractmethod + def run_mcmc(self, niterations): + """Run the MCMC for the given number of iterations.""" + pass + + @abstractmethod + def write_results(self, filename): + """Should write all samples currently in memory to the given file.""" + pass + + def checkpoint(self): + """Dumps current samples to the checkpoint file.""" + # write new samples + logging.info("Writing samples to files") + for fn in [self.checkpoint_file, self.backup_file]: + self.write_results(fn) + with self.io(fn, "a") as fp: + # write the current number of iterations + fp.write_niterations(self.niterations) + # check for burn in, compute the acls + self.acls = None + if self.burn_in is not None: + logging.info("Updating burn in") + 
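For a concrete sense of the effective-sample bookkeeping (numbers purely illustrative): with 200 walkers, 12000 iterations, burn-in at iteration 2000, and a maximum ACL of 25,

    nperwalker = (12000 - 2000) // 25      # = 400 independent samples per walker
    effective_nsamples = 200 * nperwalker  # = 80000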
self.burn_in.evaluate(self.checkpoint_file) + burn_in_iter = self.burn_in.burn_in_iteration + logging.info("Is burned in: {}".format(self.burn_in.is_burned_in)) + if self.burn_in.is_burned_in: + logging.info("Burn-in iteration: {}".format( + self.burn_in.burn_in_iteration)) + else: + burn_in_iter = 0 + # Compute acls; the burn_in test may have calculated an acl and saved + # it, in which case we don't need to do it again. + if self.acls is None: + logging.info("Computing acls") + self.acls = self.compute_acl(self.checkpoint_file, + start_index=burn_in_iter) + logging.info("ACL: {}".format(numpy.array(self.acls.values()).max())) + # write + for fn in [self.checkpoint_file, self.backup_file]: + with self.io(fn, "a") as fp: + if self.burn_in is not None: + fp.write_burn_in(self.burn_in) + if self.acls is not None: + fp.write_acls(self.acls) + # write effective number of samples + fp.write_effective_nsamples(self.effective_nsamples) + # check validity + logging.info("Validating checkpoint and backup files") + checkpoint_valid = validate_checkpoint_files( + self.checkpoint_file, self.backup_file) + if not checkpoint_valid: + raise IOError("error writing to checkpoint file") + # clear the in-memory chain to save memory + logging.info("Clearing samples from memory") + self.clear_samples() + + @abstractmethod + def compute_acf(cls, filename, **kwargs): + """A method to compute the autocorrelation function of samples in the + given file.""" + pass + + @abstractmethod + def compute_acl(cls, filename, **kwargs): + """A method to compute the autocorrelation length of samples in the + given file.""" + pass + + +class MCMCAutocorrSupport(object): + """Provides class methods for calculating ensemble ACFs/ACLs. + """ + + @classmethod + def compute_acf(cls, filename, start_index=None, end_index=None, + per_walker=False, walkers=None, parameters=None): + """Computes the autocorrleation function of the model params in the + given file. + + By default, parameter values are averaged over all walkers at each + iteration. The ACF is then calculated over the averaged chain. An + ACF per-walker will be returned instead if ``per_walker=True``. + + Parameters + ----------- + filename : str + Name of a samples file to compute ACFs for. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + per_walker : optional, bool + Return the ACF for each walker separately. Default is False. + walkers : optional, int or array + Calculate the ACF using only the given walkers. If None (the + default) all walkers will be used. + parameters : optional, str or array + Calculate the ACF for only the given parameters. If None (the + default) will calculate the ACF for all of the model params. + + Returns + ------- + dict : + Dictionary of arrays giving the ACFs for each parameter. If + ``per-walker`` is True, the arrays will have shape + ``nwalkers x niterations``. 
+ """ + acfs = {} + with cls._io(filename, 'r') as fp: + if parameters is None: + parameters = fp.variable_params + if isinstance(parameters, str) or isinstance(parameters, unicode): + parameters = [parameters] + for param in parameters: + if per_walker: + # just call myself with a single walker + if walkers is None: + walkers = numpy.arange(fp.nwalkers) + arrays = [ + cls.compute_acf(filename, start_index=start_index, + end_index=end_index, + per_walker=False, walkers=ii, + parameters=param)[param] + for ii in walkers] + acfs[param] = numpy.vstack(arrays) + else: + samples = fp.read_raw_samples( + param, thin_start=start_index, thin_interval=1, + thin_end=end_index, walkers=walkers, + flatten=False)[param] + samples = samples.mean(axis=0) + acfs[param] = autocorrelation.calculate_acf( + samples).numpy() + return acfs + + @classmethod + def compute_acl(cls, filename, start_index=None, end_index=None): + """Computes the autocorrleation length for all model params in the + given file. + + Parameter values are averaged over all walkers at each iteration. + The ACL is then calculated over the averaged chain. If the returned ACL + is `inf`, will default to the number of current iterations. + + Parameters + ----------- + filename : str + Name of a samples file to compute ACLs for. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + + Returns + ------- + dict + A dictionary giving the ACL for each parameter. + """ + acls = {} + with cls._io(filename, 'r') as fp: + for param in fp.variable_params: + samples = fp.read_raw_samples( + param, thin_start=start_index, thin_interval=1, + thin_end=end_index, flatten=False)[param] + samples = samples.mean(axis=0) + # if < 10 samples, just set to inf + # Note: this should be done inside of pycbc's autocorrelation + # function + if samples.size < 10: + acl = numpy.inf + else: + acl = autocorrelation.calculate_acl(samples) + if acl <= 0: + acl = numpy.inf + acls[param] = acl + return acls diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 97786b5..443f89d 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -29,10 +29,16 @@ from __future__ import absolute_import import numpy -from pycbc.io import FieldArray -from pycbc.filter import autocorrelation +import emcee +from pycbc.pool import choose_pool +from pycbc.workflow import ConfigParser -from .base import BaseMCMCSampler +from .base import BaseSampler +from .base_mcmc import (BaseMCMC, MCMCAutocorrSupport, raw_samples_to_dict, + raw_stats_to_dict) +from ..burn_in import MCMCBurnInTests +from ..io import EmceeFile +from .. import models # @@ -43,7 +49,7 @@ # ============================================================================= # -class EmceeEnsembleSampler(BaseMCMCSampler): +class EmceeEnsembleSampler(MCMCAutocorrSupport, BaseMCMC, BaseSampler): """This class is used to construct an MCMC sampler from the emcee package's EnsembleSampler. @@ -59,896 +65,168 @@ class EmceeEnsembleSampler(BaseMCMCSampler): cores/nodes/etc. 
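Because these are class methods that read from a samples file, they can be used offline, without a live sampler instance; a sketch using the emcee sampler defined next (file and parameter names are illustrative):

    from gwin.sampler import EmceeEnsembleSampler

    # per-parameter autocorrelation lengths, ignoring the first 2000 iterations
    acls = EmceeEnsembleSampler.compute_acl("samples.hdf.checkpoint",
                                            start_index=2000)
    # per-walker autocorrelation functions for selected parameters
    acfs = EmceeEnsembleSampler.compute_acf("samples.hdf.checkpoint",
                                            parameters=["mass1", "mass2"],
                                            per_walker=True)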
""" name = "emcee" - - def __init__(self, model, nwalkers, pool=None, - model_call=None): - try: - import emcee - except ImportError: - raise ImportError("emcee is not installed.") - - if model_call is None: - model_call = model - + _io = EmceeFile + burn_in_class = MCMCBurnInTests + + def __init__(self, model, nwalkers, checkpoint_interval=None, + logpost_function=None, nprocesses=1, use_mpi=False): + + self.model = model + # create a wrapper for calling the model + if logpost_function is None: + logpost_function = 'logposterior' + model_call = models.CallModel(model, logpost_function) + + # Set up the pool + if nprocesses > 1: + # these are used to help paralleize over multiple cores / MPI + models._global_instance = model_call + model_call = models._call_global_model + pool = choose_pool(mpi=use_mpi, processes=nprocesses) + if pool is not None: + pool.count = nprocesses + + # set up emcee + self._nwalkers = nwalkers ndim = len(model.variable_params) - sampler = emcee.EnsembleSampler(nwalkers, ndim, - model_call, - pool=pool) + self._sampler = emcee.EnsembleSampler(nwalkers, ndim, model_call, + pool=pool) # emcee uses it's own internal random number generator; we'll set it # to have the same state as the numpy generator rstate = numpy.random.get_state() - sampler.random_state = rstate - # initialize - super(EmceeEnsembleSampler, self).__init__( - sampler, model) - self._nwalkers = nwalkers - - @classmethod - def from_cli(cls, opts, model, pool=None, - model_call=None): - """Create an instance of this sampler from the given command-line - options. + self._sampler.random_state = rstate + self._checkpoint_interval = checkpoint_interval - Parameters - ---------- - opts : ArgumentParser options - The options to parse. - model : LikelihoodEvaluator - The model to use with the sampler. + @property + def io(self): + return self._io - Returns - ------- - EmceeEnsembleSampler - An emcee sampler initialized based on the given arguments. - """ - return cls(model, opts.nwalkers, - pool=pool, model_call=model_call) + @property + def base_shape(self): + return (self.nwalkers,) @property - def lnpost(self): - """Get the natural logarithm of the likelihood as an - nwalkers x niterations array. + def samples(self): + """A dict mapping ``variable_params`` to arrays of samples currently + in memory. + + The arrays have shape ``nwalkers x niterations``. """ - # emcee returns nwalkers x niterations - return self._sampler.lnprobability + # emcee stores samples to it's chain attribute as a + # nwalker x niterations x ndim array + raw_samples = self._sampler.chain + return raw_samples_to_dict(self, raw_samples) @property - def chain(self): - """Get all past samples as an nwalker x niterations x ndim array.""" - # emcee returns the chain as nwalker x niterations x ndim - return self._sampler.chain + def model_stats(self): + """A dict mapping the model's ``default_stats`` to arrays of values. - def clear_chain(self): - """Clears the chain and blobs from memory. + The returned array has shape ``nwalkers x niterations``. + """ + raw_stats = numpy.array(self._sampler.blobs) + # raw_stats has shape niterations x nwalkers x nstats; transpose + # so that it has shape nwalkers x niterations x nstats + raw_stats = raw_stats.transpose((1, 0, 2)) + return raw_stats_to_dict(self, raw_stats) + + def clear_samples(self): + """Clears the samples and stats from memory. 
""" # store the iteration that the clear is occuring on - self.lastclear = self.niterations + self._lastclear = self.niterations + self._itercounter = 0 # now clear the chain self._sampler.reset() self._sampler.clear_blobs() - def set_p0(self, samples_file=None, prior=None): - """Sets the initial position of the walkers. - - Parameters - ---------- - samples_file : InferenceFile, optional - If provided, use the last iteration in the given file for the - starting positions. - prior : JointDistribution, optional - Use the given prior to set the initial positions rather than - ``model``'s prior. - - Returns - ------- - p0 : array - An nwalkers x ndim array of the initial positions that were set. - """ - # we define set_p0 here to ensure that emcee's internal random number - # generator is set to numpy's after the distributions' rvs functions - # are called - super(EmceeEnsembleSampler, self).set_p0(samples_file=samples_file, - prior=prior) - # update the random state - self._sampler.random_state = numpy.random.get_state() - - def write_state(self, fp): - """Saves the state of the sampler in a file. - """ - fp.write_random_state(state=self._sampler.random_state) - - def set_state_from_file(self, fp): + def set_state_from_file(self, filename): """Sets the state of the sampler back to the instance saved in a file. """ - rstate = fp.read_random_state() + with self.io(filename, 'r') as fp: + rstate = fp.read_random_state() # set the numpy random state numpy.random.set_state(rstate) # set emcee's generator to the same state self._sampler.random_state = rstate - def run(self, niterations, **kwargs): + def run_mcmc(self, niterations, **kwargs): """Advance the ensemble for a number of samples. Parameters ---------- niterations : int - Number of samples to get from sampler. - - Returns - ------- - p : numpy.array - An array of current walker positions with shape (nwalkers, ndim). - lnpost : numpy.array - The list of log posterior probabilities for the walkers at - positions p, with shape (nwalkers, ndim). - rstate : - The current state of the random number generator. + Number of iterations to run the sampler for. + \**kwargs : + All other keyword arguments are passed to the emcee sampler. """ pos = self._pos if pos is None: - pos = self.p0 + pos = self._p0 res = self._sampler.run_mcmc(pos, niterations, **kwargs) - p, lnpost, rstate = res[0], res[1], res[2] + p, _, _ = res[0], res[1], res[2] # update the positions self._pos = p - return p, lnpost, rstate - def write_results(self, fp, start_iteration=None, - max_iterations=None, **metadata): - """Writes metadata, samples, model stats, and acceptance fraction - to the given file. See the write function for each of those for - details. + def write_results(self, filename): + """Writes samples, model stats, acceptance fraction, and random state + to the given file. Parameters ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - \**metadata : - All other keyword arguments are passed to ``write_metadata``. 
- """ - self.write_metadata(fp, **metadata) - self.write_chain(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_model_stats(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_acceptance_fraction(fp) - self.write_state(fp) - - -# This is needed for two reason -# 1) pools freeze state when created and so classes *cannot be updated* -# 2) methods cannot be pickled. -class _callprior(object): - """Calls the model's prior function, and ensures that no - metadata is returned.""" - def __init__(self, model_call): - self.callable = model_call - - def __call__(self, args): - prior = self.callable(args, callstat='logprior', - return_all_stats=False) - return prior - - -class _callloglikelihood(object): - """Calls the model's loglikelihood function. - """ - def __init__(self, model_call): - self.callable = model_call - - def __call__(self, args): - return self.callable(args, callstat='loglikelihood', - return_all_stats=False) - - -class EmceePTSampler(BaseMCMCSampler): - """This class is used to construct a parallel-tempered MCMC sampler from - the emcee package's PTSampler. - - Parameters - ---------- - model : model - A model from ``gwin.models``. - ntemps : int - Number of temeratures to use in the sampler. - nwalkers : int - Number of walkers to use in sampler. - pool : function with map, Optional - A provider of a map function that allows a function call to be run - over multiple sets of arguments and possibly maps them to - cores/nodes/etc. - """ - name = "emcee_pt" - - def __init__(self, model, ntemps, nwalkers, pool=None, - model_call=None): - - try: - import emcee - except ImportError: - raise ImportError("emcee is not installed.") - - if model_call is None: - model_call = model - - # construct the sampler: PTSampler needs the likelihood and prior - # functions separately - ndim = len(model.variable_params) - sampler = emcee.PTSampler(ntemps, nwalkers, ndim, - _callloglikelihood(model_call), - _callprior(model_call), - pool=pool) - # initialize - super(EmceePTSampler, self).__init__( - sampler, model) - self._nwalkers = nwalkers - self._ntemps = ntemps + filename : str + The file to write to. The file is opened using the ``io`` class + in an an append state. + """ + with self.io(filename, 'a') as fp: + # write samples + fp.write_samples(self.samples, self.model.variable_params) + # write stats + fp.write_samples(self.model_stats) + # write accpetance + fp.write_acceptance_fraction(self._sampler.acceptance_fraction) + # write random state + fp.write_random_state(state=self._sampler.random_state) + + def finalize(self): + """All data is written by the last checkpoint in the run method, so + this just passes.""" + pass @classmethod - def from_cli(cls, opts, model, pool=None, - model_call=None): - """Create an instance of this sampler from the given command-line - options. - - Parameters - ---------- - opts : ArgumentParser options - The options to parse. - model : LikelihoodEvaluator - The model to use with the sampler. - - Returns - ------- - EmceePTSampler - An emcee sampler initialized based on the given arguments. - """ - return cls(model, opts.ntemps, opts.nwalkers, - pool=pool, model_call=model_call) - - @property - def ntemps(self): - return self._ntemps - - @property - def chain(self): - """Get all past samples as an ntemps x nwalker x niterations x ndim - array. 
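
Taken together, ``run_mcmc``, ``write_results``, ``clear_samples``, and
``finalize`` support a checkpointed run. The loop below is only a sketch of that
flow, not the actual driver (which lives in the MCMC base class's ``run``
method); the file name and iteration counts are illustrative::

    # assume `sampler` is an initialized EmceeEnsembleSampler whose initial
    # walker positions have already been set
    checkpoint_file = 'run.hdf.checkpoint'   # illustrative name
    total_iterations = 2000
    interval = 500                           # e.g. the checkpoint-interval
    completed = 0
    while completed < total_iterations:
        niter = min(interval, total_iterations - completed)
        sampler.run_mcmc(niter)
        # dump samples, stats, acceptance fraction, and rng state to disk
        sampler.write_results(checkpoint_file)
        # free memory before the next chunk
        sampler.clear_samples()
        completed += niter
    sampler.finalize()
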
- """ - # emcee returns the chain as ntemps x nwalker x niterations x ndim - return self._sampler.chain - - def clear_chain(self): - """Clears the chain and blobs from memory. - """ - # store the iteration that the clear is occuring on - self.lastclear = self.niterations - # now clear the chain - self._sampler.reset() - - @property - def model_stats(self): - """Returns the log likelihood ratio and log prior as a FieldArray. - The returned array has shape ntemps x nwalkers x niterations. - """ - # likelihood has shape ntemps x nwalkers x niterations - logl = self._sampler.lnlikelihood - # get prior from posterior - logp = self._sampler.lnprobability - logl - # compute the likelihood ratio - loglr = logl - self.model.lognl - kwargs = {'loglr': loglr, 'logprior': logp} - # if different coordinates were used for sampling, get the jacobian - if self.model.sampling_transforms is not None: - samples = self.samples - # convert to dict - d = {param: samples[param] for param in samples.fieldnames} - logj = self.model.logjacobian(**d) - kwargs['logjacobian'] = logj - return FieldArray.from_kwargs(**kwargs) - - @property - def lnpost(self): - """Get the natural logarithm of the likelihood + the prior as an - ntemps x nwalkers x niterations array. - """ - # emcee returns ntemps x nwalkers x niterations - return self._sampler.lnprobability - - def set_p0(self, samples_file=None, prior=None): - """Sets the initial position of the walkers. - - Parameters - ---------- - samples_file : InferenceFile, optional - If provided, use the last iteration in the given file for the - starting positions. - prior : JointDistribution, optional - Use the given prior to set the initial positions rather than - ``model``'s prior. - - Returns - ------- - p0 : array - An ntemps x nwalkers x ndim array of the initial positions that - were set. - """ - # create a (nwalker, ndim) array for initial positions - ntemps = self.ntemps - nwalkers = self.nwalkers - ndim = len(self.variable_params) - p0 = numpy.ones((ntemps, nwalkers, ndim)) - # if samples are given then use those as initial positions - if samples_file is not None: - samples = self.read_samples(samples_file, self.variable_params, - iteration=-1, temps='all', - flatten=False)[..., 0] - # transform to sampling parameter space - samples = self.model.apply_sampling_transforms( - samples) - # draw random samples if samples are not provided + def from_config(cls, cp, model, nprocesses=1, use_mpi=False): + """Loads the sampler from the given config file.""" + section = "sampler" + # check name + assert cp.get(section, "name") == cls.name, ( + "name in section [sampler] must match mine") + # get the number of walkers to use + nwalkers = int(cp.get(section, "nwalkers")) + # get the checkpoint interval, if it's specified + if cp.has_option(section, "checkpoint-interval"): + checkpoint_interval = int(cp.get(section, "checkpoint-interval")) else: - samples = self.model.prior_rvs( - size=nwalkers*ntemps, prior=prior).reshape((ntemps, nwalkers)) - # convert to array - for i, param in enumerate(self.sampling_params): - p0[..., i] = samples[param] - self._p0 = p0 - return p0 - - def run(self, niterations, **kwargs): - """Advance the ensemble for a number of samples. - - Parameters - ---------- - niterations : int - Number of samples to get from sampler. - - Returns - ------- - p : numpy.array - An array of current walker positions with shape (nwalkers, ndim). - lnpost : numpy.array - The list of log posterior probabilities for the walkers at - positions p, with shape (nwalkers, ndim). 
- rstate : - The current state of the random number generator. - """ - pos = self._pos - if pos is None: - pos = self.p0 - res = self._sampler.run_mcmc(pos, niterations, **kwargs) - p, lnpost, rstate = res[0], res[1], res[2] - # update the positions - self._pos = p - return p, lnpost, rstate - - # read/write functions - - # add ntemps and betas to metadata - def write_metadata(self, fp, **kwargs): - """Writes metadata about this sampler to the given file. Metadata is - written to the file's `attrs`. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - **kwargs : - All keyword arguments are saved as separate arguments in the - file attrs. If any keyword argument is a dictionary, the keyword - will point to the list of keys in the the file's ``attrs``. Each - key is then stored as a separate attr with its corresponding value. - """ - super(EmceePTSampler, self).write_metadata(fp, **kwargs) - fp.attrs["ntemps"] = self.ntemps - fp.attrs["betas"] = self._sampler.betas - - def write_acceptance_fraction(self, fp): - """Write acceptance_fraction data to file. Results are written to - `fp[acceptance_fraction/temp{k}]` where k is the temperature. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - """ - group = "acceptance_fraction/temp{tk}" - # acf has shape ntemps x nwalkers - acf = self.acceptance_fraction - for tk in range(fp.ntemps): - try: - fp[group.format(tk=tk)][:] = acf[tk, :] - except KeyError: - # dataset doesn't exist yet, create it - fp[group.format(tk=tk)] = acf[tk, :] - - @staticmethod - def read_acceptance_fraction(fp, temps=None, walkers=None): - """Reads the acceptance fraction from the given file. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - temps : {None, (list of) int} - The temperature index (or a list of indices) to retrieve. If None, - acfs from all temperatures and all walkers will be retrieved. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - - Returns - ------- - array - Array of acceptance fractions with shape (requested temps, - requested walkers). - """ - group = 'acceptance_fraction/temp{tk}' - if temps is None: - temps = numpy.arange(fp.ntemps) - if walkers is None: - wmask = numpy.ones(fp.nwalkers, dtype=bool) - else: - wmask = numpy.zeros(fp.nwalkers, dtype=bool) - wmask[walkers] = True - arrays = [] - for tk in temps: - arrays.extend(fp[group.format(tk=tk)][wmask]) - return arrays - - @staticmethod - def write_samples_group(fp, samples_group, parameters, samples, - start_iteration=None, max_iterations=None): - """Writes samples to the given file. - - Results are written to: - - ``fp[samples_group/{vararg}]``, - - where ``{vararg}`` is the name of a variable arg. The samples are - written as an ``ntemps x nwalkers x niterations`` array. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - samples_group : str - Name of samples group to write. - parameters : list - The parameters to write to the file. - samples : FieldArray - The samples to write. Should be a FieldArray with fields containing - the samples to write and shape nwalkers x niterations. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. 
- max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - """ - ntemps, nwalkers, niterations = samples.shape - if max_iterations is not None and max_iterations < niterations: - raise IndexError("The provided max size is less than the " - "number of iterations") - group = samples_group + '/{name}' - # loop over number of dimensions - for param in parameters: - dataset_name = group.format(name=param) - istart = start_iteration - try: - fp_niterations = fp[dataset_name].shape[-1] - if istart is None: - istart = fp_niterations - istop = istart + niterations - if istop > fp_niterations: - # resize the dataset - fp[dataset_name].resize(istop, axis=2) - except KeyError: - # dataset doesn't exist yet - if istart is not None and istart != 0: - raise ValueError("non-zero start_iteration provided, but " - "dataset doesn't exist yet") - istart = 0 - istop = istart + niterations - fp.create_dataset(dataset_name, (ntemps, nwalkers, istop), - maxshape=(ntemps, nwalkers, max_iterations), - dtype=float, fletcher32=True) - fp[dataset_name][:, :, istart:istop] = samples[param] - - def write_results(self, fp, start_iteration=None, max_iterations=None, - **metadata): - """Writes metadata, samples, model stats, and acceptance fraction - to the given file. See the write function for each of those for - details. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - \**metadata : - All other keyword arguments are passed to ``write_metadata``. - """ - self.write_metadata(fp, **metadata) - self.write_chain(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_model_stats(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_acceptance_fraction(fp) - self.write_state(fp) - - @staticmethod - def _read_fields(fp, fields_group, fields, array_class, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, temps=None, walkers=None, flatten=True): - """Base function for reading samples and model stats. See - `read_samples` and `read_model_stats` for details. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - fields_group : str - The name of the group to retrieve the desired fields. - fields : list - The list of field names to retrieve. Must be names of groups in - `fp[fields_group/]`. - array_class : FieldArray or similar - The type of array to return. Must have a `from_kwargs` attribute. - - For other details on keyword arguments, see `read_samples` and - `read_model_stats`. - - Returns - ------- - array_class - An instance of the given array class populated with values - retrieved from the fields. 
- """ - # walkers to load - if walkers is not None: - widx = numpy.zeros(fp.nwalkers, dtype=bool) - widx[walkers] = True - nwalkers = widx.sum() + checkpoint_interval = None + if cp.has_option(section, "logpost-function"): + lnpost = cp.get(section, "logpost-function") else: - widx = slice(None, None) - nwalkers = fp.nwalkers - # temperatures to load - selecttemps = False - if temps is None: - tidx = 0 - ntemps = 1 - elif isinstance(temps, int): - tidx = temps - ntemps = 1 + lnpost = None + obj = cls(model, nwalkers, checkpoint_interval=checkpoint_interval, + logpost_function=lnpost, nprocesses=nprocesses, + use_mpi=use_mpi) + # get target + if cp.has_option(section, "niterations"): + niterations = int(cp.get(section, "niterations")) else: - # temps is either 'all' or a list of temperatures; - # in either case, we'll get all of the temperatures from the file; - # if not 'all', then we'll pull out the ones we want - tidx = slice(None, None) - selecttemps = temps != 'all' - if selecttemps: - ntemps = len(temps) - else: - ntemps = fp.ntemps - # get the slice to use - if iteration is not None: - get_index = iteration - niterations = 1 - else: - if thin_end is None: - # use the number of current iterations - thin_end = fp.niterations - get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval) - # we'll just get the number of iterations from the returned shape niterations = None - # load - arrays = {} - group = fields_group + '/{name}' - for name in fields: - arr = fp[group.format(name=name)][tidx, widx, get_index] - if niterations is None: - niterations = arr.shape[-1] - # pull out the temperatures we need - if selecttemps: - arr = arr[temps, ...] - if flatten: - arr = arr.flatten() - else: - # ensure that the returned array is 3D - arr = arr.reshape((ntemps, nwalkers, niterations)) - arrays[name] = arr - return array_class.from_kwargs(**arrays) - - @classmethod - def read_samples(cls, fp, parameters, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, temps=0, walkers=None, flatten=True, - samples_group=None, array_class=None): - """Reads samples for the given parameter(s). - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - parameters : (list of) strings - The parameter(s) to retrieve. A parameter can be the name of any - field in `fp[fp.samples_group]`, a virtual field or method of - `FieldArray` (as long as the file contains the necessary fields - to derive the virtual field or method), and/or a function of - these. - thin_start : int - Index of the sample to begin returning samples. Default is to read - samples after burn in. To start from the beginning set thin_start - to 0. - thin_interval : int - Interval to accept every i-th sample. Default is to use the - `fp.acl`. If `fp.acl` is not set, then use all samples - (set thin_interval to 1). - thin_end : int - Index of the last sample to read. If not given then - `fp.niterations` is used. - iteration : int - Get a single iteration. If provided, will override the - `thin_{start/interval/end}` arguments. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - temps : {None, (list of) int, 'all'} - The temperature index (or list of indices) to retrieve. If None, - only samples from the coldest (= 0) temperature chain will be - retrieved. To retrieve all temperates pass 'all', or a list of - all of the temperatures. 
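
For reference, the options parsed by ``from_config`` (including the
``effective-nsamples`` and burn-in handling a little further below) correspond
to a ``[sampler]`` section along these lines; the values shown are only
examples::

    [sampler]
    name = emcee
    nwalkers = 200
    niterations = 5000
    checkpoint-interval = 1000
    ; optional:
    ; logpost-function = logposterior
    ; effective-nsamples = 2000

Given a config parser ``cp`` built from such a file and an instantiated
``model``, the sampler would then be created via
``EmceeEnsembleSampler.from_config(cp, model, nprocesses=..., use_mpi=...)``.
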
- flatten : {True, bool} - The returned array will be one dimensional, with all desired - samples from all desired walkers concatenated together. If False, - the returned array will have dimension requested temps x requested - walkers x requested iterations. - samples_group : {None, str} - The group in `fp` from which to retrieve the parameter fields. If - None, searches in `fp.samples_group`. - array_class : {None, array class} - The type of array to return. The class must have a `from_kwargs` - class method and a `parse_parameters` method. If None, will return - a FieldArray. - - Returns - ------- - array_class - Samples for the given parameters, as an instance of a the given - `array_class` (`FieldArray` if `array_class` is None). - """ - # get the group to load from - if samples_group is None: - samples_group = fp.samples_group - # get the type of array class to use - if array_class is None: - array_class = FieldArray - # get the names of fields needed for the given parameters - possible_fields = fp[samples_group].keys() - loadfields = array_class.parse_parameters(parameters, possible_fields) - return cls._read_fields( - fp, samples_group, loadfields, array_class, - thin_start=thin_start, thin_interval=thin_interval, - thin_end=thin_end, iteration=iteration, temps=temps, - walkers=walkers, flatten=flatten) - - @classmethod - def compute_acfs(cls, fp, start_index=None, end_index=None, - per_walker=False, walkers=None, parameters=None, - temps=None): - """Computes the autocorrleation function of the model params in the - given file. - - By default, parameter values are averaged over all walkers at each - iteration. The ACF is then calculated over the averaged chain for each - temperature. An ACF per-walker will be returned instead if - ``per_walker=True``. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - per_walker : optional, bool - Return the ACF for each walker separately. Default is False. - walkers : optional, int or array - Calculate the ACF using only the given walkers. If None (the - default) all walkers will be used. - parameters : optional, str or array - Calculate the ACF for only the given parameters. If None (the - default) will calculate the ACF for all of the model params. - temps : optional, (list of) int or 'all' - The temperature index (or list of indices) to retrieve. If None - (the default), the ACF will only be computed for the coldest (= 0) - temperature chain. To compute an ACF for all temperates pass 'all', - or a list of all of the temperatures. - - Returns - ------- - FieldArray - A ``FieldArray`` of the ACF vs iteration for each parameter. If - `per-walker` is True, the FieldArray will have shape - ``ntemps x nwalkers x niterations``. Otherwise, the returned - array will have shape ``ntemps x niterations``. 
- """ - acfs = {} - if parameters is None: - parameters = fp.variable_params - if isinstance(parameters, str) or isinstance(parameters, unicode): - parameters = [parameters] - if isinstance(temps, int): - temps = [temps] - elif temps == 'all': - temps = numpy.arange(fp.ntemps) - elif temps is None: - temps = [0] - for param in parameters: - subacfs = [] - for tk in temps: - if per_walker: - # just call myself with a single walker - if walkers is None: - walkers = numpy.arange(fp.nwalkers) - arrays = [cls.compute_acfs(fp, start_index=start_index, - end_index=end_index, - per_walker=False, walkers=ii, - parameters=param, - temps=tk)[param][0, :] - for ii in walkers] - # we'll stack all of the walker arrays to make a single - # nwalkers x niterations array; when these are stacked - # below, we'll get a ntemps x nwalkers x niterations array - subacfs.append(numpy.vstack(arrays)) - else: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, - thin_end=end_index, - walkers=walkers, temps=tk, - flatten=False)[param] - # contract the walker dimension using the mean, and flatten - # the (length 1) temp dimension - samples = samples.mean(axis=1)[0, :] - thisacf = autocorrelation.calculate_acf(samples).numpy() - subacfs.append(thisacf) - # stack the temperatures - # FIXME: the following if/else can be condensed to a single line - # using numpy.stack, once the version requirements are bumped to - # numpy >= 1.10 - if per_walker: - nw, ni = subacfs[0].shape - acfs[param] = numpy.zeros((len(temps), nw, ni), dtype=float) - for tk in range(len(temps)): - acfs[param][tk, ...] = subacfs[tk] - else: - acfs[param] = numpy.vstack(subacfs) - return FieldArray.from_kwargs(**acfs) - - @classmethod - def compute_acls(cls, fp, start_index=None, end_index=None): - """Computes the autocorrleation length for all model params and - temperatures in the given file. - - Parameter values are averaged over all walkers at each iteration and - temperature. The ACL is then calculated over the averaged chain. If - the returned ACL is `inf`, will default to the number of current - iterations. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - - Returns - ------- - dict - A dictionary of ntemps-long arrays of the ACLs of each parameter. - """ - acls = {} - if end_index is None: - end_index = fp.niterations - tidx = numpy.arange(fp.ntemps) - for param in fp.variable_params: - these_acls = numpy.zeros(fp.ntemps, dtype=int) - for tk in tidx: - samples = cls.read_samples(fp, param, thin_start=start_index, - thin_interval=1, thin_end=end_index, - temps=tk, flatten=False)[param] - # contract the walker dimension using the mean, and flatten - # the (length 1) temp dimension - samples = samples.mean(axis=1)[0, :] - acl = autocorrelation.calculate_acl(samples) - if numpy.isinf(acl): - acl = samples.size - these_acls[tk] = acl - acls[param] = these_acls - return acls - - @classmethod - def calculate_logevidence(cls, fp, thin_start=None, thin_end=None, - thin_interval=None): - """Calculates the log evidence from the given file using emcee's - thermodynamic integration. 
- - Parameters - ---------- - fp : InferenceFile - An open file handler to read the stats from. - thin_start : int - Index of the sample to begin returning stats. Default is to read - stats after burn in. To start from the beginning set thin_start - to 0. - thin_interval : int - Interval to accept every i-th sample. Default is to use the - `fp.acl`. If `fp.acl` is not set, then use all stats - (set thin_interval to 1). - thin_end : int - Index of the last sample to read. If not given then - `fp.niterations` is used. - - Returns - ------- - lnZ : float - The estimate of log of the evidence. - dlnZ : float - The error on the estimate. - """ + if cp.has_option(section, "effective-nsamples"): + nsamples = int(cp.get(section, "effective-nsamples")) + else: + nsamples = None + obj.set_target(niterations=niterations, eff_nsamples=nsamples) + # add burn-in if it's specified try: - import emcee - except ImportError: - raise ImportError("emcee is not installed.") - - stats_group = fp.stats_group - parameters = fp[stats_group].keys() - logstats = cls.read_samples(fp, parameters, samples_group=stats_group, - thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval, - temps='all', flatten=False) - # get the likelihoods - logls = logstats['loglr'] + fp.lognl - # we need the betas that were used - betas = fp.attrs['betas'] - # annoyingly, theromdynaimc integration in PTSampler is an instance - # method, so we'll implement a dummy one - ntemps = fp.ntemps - nwalkers = fp.nwalkers - ndim = len(fp.variable_params) - dummy_sampler = emcee.PTSampler(ntemps, nwalkers, ndim, None, - None, betas=betas) - return dummy_sampler.thermodynamic_integration_log_evidence( - logls=logls, fburnin=0.) + bit = obj.burn_in_class.from_config(cp, obj) + except ConfigParser.Error: + bit = None + obj.set_burn_in(bit) + return obj diff --git a/gwin/sampler/emcee_pt.py b/gwin/sampler/emcee_pt.py new file mode 100644 index 0000000..cef83fd --- /dev/null +++ b/gwin/sampler/emcee_pt.py @@ -0,0 +1,756 @@ +# Copyright (C) 2016 Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +""" +This modules provides classes and functions for using the emcee sampler +packages for parameter estimation. +""" + +# The following two classes are needed for two reason +# 1) pools freeze state when created and so classes *cannot be updated* +# 2) methods cannot be pickled. 
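
The constraint described in the two comments above is the usual multiprocessing
one: bound methods cannot be pickled under Python 2, and pool workers only ever
see the state an object had when the pool was created. Module-level callable
classes, as used below, sidestep both problems; a self-contained sketch of the
pattern (independent of gwin)::

    import multiprocessing
    import numpy

    class CallQuadratic(object):
        """A picklable wrapper: instances can be shipped to pool workers."""
        def __init__(self, scale):
            self.scale = scale

        def __call__(self, x):
            return -0.5 * self.scale * numpy.sum(x ** 2)

    if __name__ == '__main__':
        func = CallQuadratic(scale=2.)
        pool = multiprocessing.Pool(2)
        points = [numpy.array([1., 2.]), numpy.array([0., 3.])]
        # each point is evaluated in a worker process
        print(pool.map(func, points))
        pool.close()
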
+ + +class _callprior(object): + """Calls the model's prior function, and ensures that no + metadata is returned.""" + def __init__(self, model_call): + self.callable = model_call + + def __call__(self, args): + prior = self.callable(args, callfunc='prior') + return prior if isinstance(prior, numpy.float64) else prior[0] + + +class _callloglikelihood(object): + """Calls the model's loglikelihood function. + """ + def __init__(self, model_call): + self.callable = model_call + + def __call__(self, args): + return self.callable(args, callfunc='loglikelihood') + + +class EmceePTSampler(BaseMCMCSampler): + """This class is used to construct a parallel-tempered MCMC sampler from + the emcee package's PTSampler. + + Parameters + ---------- + model : model + A model from ``gwin.models``. + ntemps : int + Number of temeratures to use in the sampler. + nwalkers : int + Number of walkers to use in sampler. + pool : function with map, Optional + A provider of a map function that allows a function call to be run + over multiple sets of arguments and possibly maps them to + cores/nodes/etc. + """ + name = "emcee_pt" + + def __init__(self, model, ntemps, nwalkers, pool=None, + model_call=None): + + try: + import emcee + except ImportError: + raise ImportError("emcee is not installed.") + + if model_call is None: + model_call = model + + # construct the sampler: PTSampler needs the likelihood and prior + # functions separately + ndim = len(model.variable_params) + sampler = emcee.PTSampler(ntemps, nwalkers, ndim, + _callloglikelihood(model_call), + _callprior(model_call), + pool=pool) + # initialize + super(EmceePTSampler, self).__init__( + sampler, model) + self._nwalkers = nwalkers + self._ntemps = ntemps + + @classmethod + def from_cli(cls, opts, model, pool=None, + model_call=None): + """Create an instance of this sampler from the given command-line + options. + + Parameters + ---------- + opts : ArgumentParser options + The options to parse. + model : LikelihoodEvaluator + The model to use with the sampler. + + Returns + ------- + EmceePTSampler + An emcee sampler initialized based on the given arguments. + """ + return cls(model, opts.ntemps, opts.nwalkers, + pool=pool, model_call=model_call) + + @property + def ntemps(self): + return self._ntemps + + @property + def chain(self): + """Get all past samples as an ntemps x nwalker x niterations x ndim + array. + """ + # emcee returns the chain as ntemps x nwalker x niterations x ndim + return self._sampler.chain + + def clear_chain(self): + """Clears the chain and blobs from memory. + """ + # store the iteration that the clear is occuring on + self.lastclear = self.niterations + # now clear the chain + self._sampler.reset() + + @property + def model_stats(self): + """Returns the log likelihood ratio and log prior as a FieldArray. + The returned array has shape ntemps x nwalkers x niterations. 
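
A stripped-down version of what this constructor sets up, using plain functions
in place of the model wrappers, looks roughly like the following (emcee 2.x,
which provides ``PTSampler``; the standard-normal likelihood and flat prior are
purely illustrative)::

    import numpy
    import emcee

    def loglikelihood(x):
        return -0.5 * numpy.sum(x ** 2)

    def logprior(x):
        return 0.  # flat prior

    ntemps, nwalkers, ndim = 4, 50, 2
    sampler = emcee.PTSampler(ntemps, nwalkers, ndim,
                              loglikelihood, logprior)
    p0 = numpy.random.uniform(-1, 1, size=(ntemps, nwalkers, ndim))
    sampler.run_mcmc(p0, 100)
    # the chain has shape ntemps x nwalkers x niterations x ndim
    print(sampler.chain.shape)
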
+ """ + # likelihood has shape ntemps x nwalkers x niterations + logl = self._sampler.lnlikelihood + # get prior from posterior + logp = self._sampler.lnprobability - logl + # compute the likelihood ratio + loglr = logl - self.model.lognl + kwargs = {'loglr': loglr, 'prior': logp} + # if different coordinates were used for sampling, get the jacobian + if self.model.sampling_transforms is not None: + samples = self.samples + # convert to dict + d = {param: samples[param] for param in samples.fieldnames} + logj = self.model.logjacobian(**d) + kwargs['logjacobian'] = logj + return FieldArray.from_kwargs(**kwargs) + + @property + def lnpost(self): + """Get the natural logarithm of the likelihood + the prior as an + ntemps x nwalkers x niterations array. + """ + # emcee returns ntemps x nwalkers x niterations + return self._sampler.lnprobability + + def set_p0(self, samples_file=None, prior=None): + """Sets the initial position of the walkers. + + Parameters + ---------- + samples_file : InferenceFile, optional + If provided, use the last iteration in the given file for the + starting positions. + prior : JointDistribution, optional + Use the given prior to set the initial positions rather than + ``model``'s prior. + + Returns + ------- + p0 : array + An ntemps x nwalkers x ndim array of the initial positions that + were set. + """ + # create a (nwalker, ndim) array for initial positions + ntemps = self.ntemps + nwalkers = self.nwalkers + ndim = len(self.variable_params) + p0 = numpy.ones((ntemps, nwalkers, ndim)) + # if samples are given then use those as initial positions + if samples_file is not None: + samples = self.read_samples(samples_file, self.variable_params, + iteration=-1, temps='all', + flatten=False)[..., 0] + # transform to sampling parameter space + samples = self.model.apply_sampling_transforms( + samples) + # draw random samples if samples are not provided + else: + samples = self.model.prior_rvs( + size=nwalkers*ntemps, prior=prior).reshape((ntemps, nwalkers)) + # convert to array + for i, param in enumerate(self.sampling_params): + p0[..., i] = samples[param] + self._p0 = p0 + return p0 + + def run(self, niterations, **kwargs): + """Advance the ensemble for a number of samples. + + Parameters + ---------- + niterations : int + Number of samples to get from sampler. + + Returns + ------- + p : numpy.array + An array of current walker positions with shape (nwalkers, ndim). + lnpost : numpy.array + The list of log posterior probabilities for the walkers at + positions p, with shape (nwalkers, ndim). + rstate : + The current state of the random number generator. + """ + pos = self._pos + if pos is None: + pos = self.p0 + res = self._sampler.run_mcmc(pos, niterations, **kwargs) + p, lnpost, rstate = res[0], res[1], res[2] + # update the positions + self._pos = p + return p, lnpost, rstate + + # read/write functions + + # add ntemps and betas to metadata + def write_metadata(self, fp, **kwargs): + """Writes metadata about this sampler to the given file. Metadata is + written to the file's `attrs`. + + Parameters + ---------- + fp : InferenceFile + A file handler to an open inference file. + **kwargs : + All keyword arguments are saved as separate arguments in the + file attrs. If any keyword argument is a dictionary, the keyword + will point to the list of keys in the the file's ``attrs``. Each + key is then stored as a separate attr with its corresponding value. 
+ """ + super(EmceePTSampler, self).write_metadata(fp, **kwargs) + fp.attrs["ntemps"] = self.ntemps + fp.attrs["betas"] = self._sampler.betas + + def write_acceptance_fraction(self, fp): + """Write acceptance_fraction data to file. Results are written to + `fp[acceptance_fraction/temp{k}]` where k is the temperature. + + Parameters + ----------- + fp : InferenceFile + A file handler to an open inference file. + """ + group = "acceptance_fraction/temp{tk}" + # acf has shape ntemps x nwalkers + acf = self.acceptance_fraction + for tk in range(fp.ntemps): + try: + fp[group.format(tk=tk)][:] = acf[tk, :] + except KeyError: + # dataset doesn't exist yet, create it + fp[group.format(tk=tk)] = acf[tk, :] + + @staticmethod + def read_acceptance_fraction(fp, temps=None, walkers=None): + """Reads the acceptance fraction from the given file. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + temps : {None, (list of) int} + The temperature index (or a list of indices) to retrieve. If None, + acfs from all temperatures and all walkers will be retrieved. + walkers : {None, (list of) int} + The walker index (or a list of indices) to retrieve. If None, + samples from all walkers will be obtained. + + Returns + ------- + array + Array of acceptance fractions with shape (requested temps, + requested walkers). + """ + group = 'acceptance_fraction/temp{tk}' + if temps is None: + temps = numpy.arange(fp.ntemps) + if walkers is None: + wmask = numpy.ones(fp.nwalkers, dtype=bool) + else: + wmask = numpy.zeros(fp.nwalkers, dtype=bool) + wmask[walkers] = True + arrays = [] + for tk in temps: + arrays.extend(fp[group.format(tk=tk)][wmask]) + return arrays + + @staticmethod + def write_samples_group(fp, samples_group, parameters, samples, + start_iteration=None, max_iterations=None): + """Writes samples to the given file. + + Results are written to: + + ``fp[samples_group/{vararg}]``, + + where ``{vararg}`` is the name of a variable arg. The samples are + written as an ``ntemps x nwalkers x niterations`` array. + + Parameters + ----------- + fp : InferenceFile + A file handler to an open inference file. + samples_group : str + Name of samples group to write. + parameters : list + The parameters to write to the file. + samples : FieldArray + The samples to write. Should be a FieldArray with fields containing + the samples to write and shape nwalkers x niterations. + start_iteration : int, optional + Write results to the file's datasets starting at the given + iteration. Default is to append after the last iteration in the + file. + max_iterations : int, optional + Set the maximum size that the arrays in the hdf file may be resized + to. Only applies if the samples have not previously been written + to file. The default (None) is to use the maximum size allowed by + h5py. 
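
Since these quantities end up as plain HDF5 attributes and datasets, they can
also be inspected directly with h5py when debugging; a minimal sketch (the file
name is illustrative)::

    import h5py

    with h5py.File('pt_run.hdf', 'r') as fp:
        ntemps = fp.attrs['ntemps']
        betas = fp.attrs['betas']                  # the temperature ladder
        # per-temperature acceptance fractions, one nwalkers-long array each
        acf0 = fp['acceptance_fraction/temp0'][:]  # coldest chain
        print(ntemps, betas.shape, acf0.mean())
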
+ """ + ntemps, nwalkers, niterations = samples.shape + if max_iterations is not None and max_iterations < niterations: + raise IndexError("The provided max size is less than the " + "number of iterations") + group = samples_group + '/{name}' + # loop over number of dimensions + for param in parameters: + dataset_name = group.format(name=param) + istart = start_iteration + try: + fp_niterations = fp[dataset_name].shape[-1] + if istart is None: + istart = fp_niterations + istop = istart + niterations + if istop > fp_niterations: + # resize the dataset + fp[dataset_name].resize(istop, axis=2) + except KeyError: + # dataset doesn't exist yet + if istart is not None and istart != 0: + raise ValueError("non-zero start_iteration provided, but " + "dataset doesn't exist yet") + istart = 0 + istop = istart + niterations + fp.create_dataset(dataset_name, (ntemps, nwalkers, istop), + maxshape=(ntemps, nwalkers, max_iterations), + dtype=float, fletcher32=True) + fp[dataset_name][:, :, istart:istop] = samples[param] + + def write_results(self, fp, start_iteration=None, max_iterations=None, + **metadata): + """Writes metadata, samples, model stats, and acceptance fraction + to the given file. See the write function for each of those for + details. + + Parameters + ----------- + fp : InferenceFile + A file handler to an open inference file. + start_iteration : int, optional + Write results to the file's datasets starting at the given + iteration. Default is to append after the last iteration in the + file. + max_iterations : int, optional + Set the maximum size that the arrays in the hdf file may be resized + to. Only applies if the samples have not previously been written + to file. The default (None) is to use the maximum size allowed by + h5py. + \**metadata : + All other keyword arguments are passed to ``write_metadata``. + """ + self.write_metadata(fp, **metadata) + self.write_chain(fp, start_iteration=start_iteration, + max_iterations=max_iterations) + self.write_model_stats(fp, start_iteration=start_iteration, + max_iterations=max_iterations) + self.write_acceptance_fraction(fp) + self.write_state(fp) + + @staticmethod + def _read_fields(fp, fields_group, fields, array_class, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, temps=None, walkers=None, flatten=True): + """Base function for reading samples and model stats. See + `read_samples` and `read_model_stats` for details. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + fields_group : str + The name of the group to retrieve the desired fields. + fields : list + The list of field names to retrieve. Must be names of groups in + `fp[fields_group/]`. + array_class : FieldArray or similar + The type of array to return. Must have a `from_kwargs` attribute. + + For other details on keyword arguments, see `read_samples` and + `read_model_stats`. + + Returns + ------- + array_class + An instance of the given array class populated with values + retrieved from the fields. 
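
The append logic in ``write_samples_group`` relies on h5py's resizable
datasets. The pattern in isolation, with toy shapes and a throwaway file, looks
like this::

    import numpy
    import h5py

    ntemps, nwalkers, chunk = 2, 8, 10
    with h5py.File('resize_demo.hdf', 'w') as fp:
        # leave the iterations axis unlimited so it can grow later
        dset = fp.create_dataset('samples/x', (ntemps, nwalkers, chunk),
                                 maxshape=(ntemps, nwalkers, None),
                                 dtype=float)
        dset[:, :, :] = numpy.zeros((ntemps, nwalkers, chunk))
        # append another chunk of iterations by resizing axis 2
        dset.resize(2 * chunk, axis=2)
        dset[:, :, chunk:] = numpy.ones((ntemps, nwalkers, chunk))
        print(dset.shape)  # (2, 8, 20)
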
+ """ + # walkers to load + if walkers is not None: + widx = numpy.zeros(fp.nwalkers, dtype=bool) + widx[walkers] = True + nwalkers = widx.sum() + else: + widx = slice(None, None) + nwalkers = fp.nwalkers + # temperatures to load + selecttemps = False + if temps is None: + tidx = 0 + ntemps = 1 + elif isinstance(temps, int): + tidx = temps + ntemps = 1 + else: + # temps is either 'all' or a list of temperatures; + # in either case, we'll get all of the temperatures from the file; + # if not 'all', then we'll pull out the ones we want + tidx = slice(None, None) + selecttemps = temps != 'all' + if selecttemps: + ntemps = len(temps) + else: + ntemps = fp.ntemps + # get the slice to use + if iteration is not None: + get_index = iteration + niterations = 1 + else: + if thin_end is None: + # use the number of current iterations + thin_end = fp.niterations + get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, + thin_interval=thin_interval) + # we'll just get the number of iterations from the returned shape + niterations = None + # load + arrays = {} + group = fields_group + '/{name}' + for name in fields: + arr = fp[group.format(name=name)][tidx, widx, get_index] + if niterations is None: + niterations = arr.shape[-1] + # pull out the temperatures we need + if selecttemps: + arr = arr[temps, ...] + if flatten: + arr = arr.flatten() + else: + # ensure that the returned array is 3D + arr = arr.reshape((ntemps, nwalkers, niterations)) + arrays[name] = arr + return array_class.from_kwargs(**arrays) + + @classmethod + def read_samples(cls, fp, parameters, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, temps=0, walkers=None, flatten=True, + samples_group=None, array_class=None): + """Reads samples for the given parameter(s). + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + parameters : (list of) strings + The parameter(s) to retrieve. A parameter can be the name of any + field in `fp[fp.samples_group]`, a virtual field or method of + `FieldArray` (as long as the file contains the necessary fields + to derive the virtual field or method), and/or a function of + these. + thin_start : int + Index of the sample to begin returning samples. Default is to read + samples after burn in. To start from the beginning set thin_start + to 0. + thin_interval : int + Interval to accept every i-th sample. Default is to use the + `fp.acl`. If `fp.acl` is not set, then use all samples + (set thin_interval to 1). + thin_end : int + Index of the last sample to read. If not given then + `fp.niterations` is used. + iteration : int + Get a single iteration. If provided, will override the + `thin_{start/interval/end}` arguments. + walkers : {None, (list of) int} + The walker index (or a list of indices) to retrieve. If None, + samples from all walkers will be obtained. + temps : {None, (list of) int, 'all'} + The temperature index (or list of indices) to retrieve. If None, + only samples from the coldest (= 0) temperature chain will be + retrieved. To retrieve all temperates pass 'all', or a list of + all of the temperatures. + flatten : {True, bool} + The returned array will be one dimensional, with all desired + samples from all desired walkers concatenated together. If False, + the returned array will have dimension requested temps x requested + walkers x requested iterations. + samples_group : {None, str} + The group in `fp` from which to retrieve the parameter fields. If + None, searches in `fp.samples_group`. 
+ array_class : {None, array class} + The type of array to return. The class must have a `from_kwargs` + class method and a `parse_parameters` method. If None, will return + a FieldArray. + + Returns + ------- + array_class + Samples for the given parameters, as an instance of a the given + `array_class` (`FieldArray` if `array_class` is None). + """ + # get the group to load from + if samples_group is None: + samples_group = fp.samples_group + # get the type of array class to use + if array_class is None: + array_class = FieldArray + # get the names of fields needed for the given parameters + possible_fields = fp[samples_group].keys() + loadfields = array_class.parse_parameters(parameters, possible_fields) + return cls._read_fields( + fp, samples_group, loadfields, array_class, + thin_start=thin_start, thin_interval=thin_interval, + thin_end=thin_end, iteration=iteration, temps=temps, + walkers=walkers, flatten=flatten) + + @classmethod + def compute_acfs(cls, fp, start_index=None, end_index=None, + per_walker=False, walkers=None, parameters=None, + temps=None): + """Computes the autocorrleation function of the model params in the + given file. + + By default, parameter values are averaged over all walkers at each + iteration. The ACF is then calculated over the averaged chain for each + temperature. An ACF per-walker will be returned instead if + ``per_walker=True``. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + per_walker : optional, bool + Return the ACF for each walker separately. Default is False. + walkers : optional, int or array + Calculate the ACF using only the given walkers. If None (the + default) all walkers will be used. + parameters : optional, str or array + Calculate the ACF for only the given parameters. If None (the + default) will calculate the ACF for all of the model params. + temps : optional, (list of) int or 'all' + The temperature index (or list of indices) to retrieve. If None + (the default), the ACF will only be computed for the coldest (= 0) + temperature chain. To compute an ACF for all temperates pass 'all', + or a list of all of the temperatures. + + Returns + ------- + FieldArray + A ``FieldArray`` of the ACF vs iteration for each parameter. If + `per-walker` is True, the FieldArray will have shape + ``ntemps x nwalkers x niterations``. Otherwise, the returned + array will have shape ``ntemps x niterations``. 
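
In practice this reader is called on an open inference file; a usage sketch
(the parameter names and thinning values are illustrative)::

    # `fp` is assumed to be an open InferenceFile from a completed run
    samples = EmceePTSampler.read_samples(fp, ['mchirp', 'q'],
                                          thin_start=1000, thin_interval=10,
                                          temps=0, flatten=True)
    print(samples['mchirp'].size)

    # keep the temperature/walker structure instead of flattening
    samples3d = EmceePTSampler.read_samples(fp, 'mchirp', temps='all',
                                            flatten=False)
    print(samples3d['mchirp'].shape)  # ntemps x nwalkers x niterations
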
+ """ + acfs = {} + if parameters is None: + parameters = fp.variable_params + if isinstance(parameters, str) or isinstance(parameters, unicode): + parameters = [parameters] + if isinstance(temps, int): + temps = [temps] + elif temps == 'all': + temps = numpy.arange(fp.ntemps) + elif temps is None: + temps = [0] + for param in parameters: + subacfs = [] + for tk in temps: + if per_walker: + # just call myself with a single walker + if walkers is None: + walkers = numpy.arange(fp.nwalkers) + arrays = [cls.compute_acfs(fp, start_index=start_index, + end_index=end_index, + per_walker=False, walkers=ii, + parameters=param, + temps=tk)[param][0, :] + for ii in walkers] + # we'll stack all of the walker arrays to make a single + # nwalkers x niterations array; when these are stacked + # below, we'll get a ntemps x nwalkers x niterations array + subacfs.append(numpy.vstack(arrays)) + else: + samples = cls.read_samples(fp, param, + thin_start=start_index, + thin_interval=1, + thin_end=end_index, + walkers=walkers, temps=tk, + flatten=False)[param] + # contract the walker dimension using the mean, and flatten + # the (length 1) temp dimension + samples = samples.mean(axis=1)[0, :] + thisacf = autocorrelation.calculate_acf(samples).numpy() + subacfs.append(thisacf) + # stack the temperatures + # FIXME: the following if/else can be condensed to a single line + # using numpy.stack, once the version requirements are bumped to + # numpy >= 1.10 + if per_walker: + nw, ni = subacfs[0].shape + acfs[param] = numpy.zeros((len(temps), nw, ni), dtype=float) + for tk in range(len(temps)): + acfs[param][tk, ...] = subacfs[tk] + else: + acfs[param] = numpy.vstack(subacfs) + return FieldArray.from_kwargs(**acfs) + + @classmethod + def compute_acls(cls, fp, start_index=None, end_index=None): + """Computes the autocorrleation length for all model params and + temperatures in the given file. + + Parameter values are averaged over all walkers at each iteration and + temperature. The ACL is then calculated over the averaged chain. If + the returned ACL is `inf`, will default to the number of current + iterations. + + Parameters + ----------- + fp : InferenceFile + An open file handler to read the samples from. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + + Returns + ------- + dict + A dictionary of ntemps-long arrays of the ACLs of each parameter. + """ + acls = {} + if end_index is None: + end_index = fp.niterations + tidx = numpy.arange(fp.ntemps) + for param in fp.variable_params: + these_acls = numpy.zeros(fp.ntemps, dtype=int) + for tk in tidx: + samples = cls.read_samples(fp, param, thin_start=start_index, + thin_interval=1, thin_end=end_index, + temps=tk, flatten=False)[param] + # contract the walker dimension using the mean, and flatten + # the (length 1) temp dimension + samples = samples.mean(axis=1)[0, :] + acl = autocorrelation.calculate_acl(samples) + if numpy.isinf(acl): + acl = samples.size + these_acls[tk] = acl + acls[param] = these_acls + return acls + + @classmethod + def calculate_logevidence(cls, fp, thin_start=None, thin_end=None, + thin_interval=None): + """Calculates the log evidence from the given file using emcee's + thermodynamic integration. 
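
Stripped of the file handling, the core of the ACL estimate is just the walker
average followed by pycbc's autocorrelation length calculation; a sketch with a
synthetic AR(1) chain (finite, known correlation length)::

    import numpy
    from pycbc.filter import autocorrelation

    # toy chain for one parameter at one temperature: nwalkers x niterations
    nwalkers, niterations = 16, 4000
    chain = numpy.zeros((nwalkers, niterations))
    noise = numpy.random.normal(size=(nwalkers, niterations))
    for i in range(1, niterations):
        chain[:, i] = 0.9 * chain[:, i - 1] + noise[:, i]
    # average over walkers at each iteration, as compute_acls does
    mean_chain = chain.mean(axis=0)
    acl = autocorrelation.calculate_acl(mean_chain)
    if numpy.isinf(acl):
        # fall back to the total number of iterations
        acl = mean_chain.size
    print(acl)
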
+ + Parameters + ---------- + fp : InferenceFile + An open file handler to read the stats from. + thin_start : int + Index of the sample to begin returning stats. Default is to read + stats after burn in. To start from the beginning set thin_start + to 0. + thin_interval : int + Interval to accept every i-th sample. Default is to use the + `fp.acl`. If `fp.acl` is not set, then use all stats + (set thin_interval to 1). + thin_end : int + Index of the last sample to read. If not given then + `fp.niterations` is used. + + Returns + ------- + lnZ : float + The estimate of log of the evidence. + dlnZ : float + The error on the estimate. + """ + try: + import emcee + except ImportError: + raise ImportError("emcee is not installed.") + + stats_group = fp.stats_group + parameters = fp[stats_group].keys() + logstats = cls.read_samples(fp, parameters, samples_group=stats_group, + thin_start=thin_start, thin_end=thin_end, + thin_interval=thin_interval, + temps='all', flatten=False) + # get the likelihoods + logls = logstats['loglr'] + fp.lognl + # we need the betas that were used + betas = fp.attrs['betas'] + # annoyingly, theromdynaimc integration in PTSampler is an instance + # method, so we'll implement a dummy one + ntemps = fp.ntemps + nwalkers = fp.nwalkers + ndim = len(fp.variable_params) + dummy_sampler = emcee.PTSampler(ntemps, nwalkers, ndim, None, + None, betas=betas) + return dummy_sampler.thermodynamic_integration_log_evidence( + logls=logls, fburnin=0.) From 3f603715fa7e4688c58b4ab036cbd66b2d40b793 Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 25 Sep 2018 11:13:14 +0200 Subject: [PATCH 2/3] Update plotting (#71) * move results from cli functions to io module; add ability to specify different options for different input files * make write_kwargs_to_attrs a classmethod to cyclic imports * use argparse action to parse parameters option * move injections_from_cli to __init__; add a read_injections method to base_hdf * use parameters opt to load parameters in base_mcmc * move get_file_type to io; fix bugs * fix import error * create a custom ArgumentParser * allow input-file option to accept labels * move param parsing to option_utils and remove unused functions * update create_multidim plot * update plot_posterior * switch from normed to density to please matplotlib * update plot_posterior * fix plotting injection parameters * move thin options to base_mcmc; add ability to skip arguments * update plot_movie * fix pep8 issues --- bin/gwin_plot_movie | 61 ++-- bin/gwin_plot_posterior | 87 ++---- gwin/io/__init__.py | 439 +++++++++++++++++++++++++- gwin/io/base_hdf.py | 169 ++++++++-- gwin/io/base_mcmc.py | 75 ++++- gwin/models/base.py | 3 +- gwin/option_utils.py | 478 ++++++----------------------- gwin/results/scatter_histograms.py | 12 +- 8 files changed, 824 insertions(+), 500 deletions(-) diff --git a/bin/gwin_plot_movie b/bin/gwin_plot_movie index 7c79530..0face6a 100644 --- a/bin/gwin_plot_movie +++ b/bin/gwin_plot_movie @@ -41,7 +41,7 @@ from matplotlib import pyplot import pycbc.results from pycbc import transforms -from gwin import (__version__, option_utils) +from gwin import (__version__, option_utils, io) from gwin.results.scatter_histograms import (create_multidim_plot, get_scale_fac) @@ -91,12 +91,12 @@ def integer_logspace(start, end, num): out[start_idx:len(x)+start_idx] = x return out -parser = argparse.ArgumentParser() - +# we won't add thinning arguments nor iteration, since this is determined by +# the frame number/step options +skip_args = ['thin-start', 
'thin-interval', 'thin-end', 'iteration'] +parser = io.ResultsArgumentParser(skip_args=skip_args) parser.add_argument("--version", action="version", version=__version__, help="show version number and exit") -parser.add_argument("--input-file", type=str, required=True, - help="Results file path.") parser.add_argument("--start-sample", type=int, default=1, help="Start sample for the first frame. Note: sample " "counting starts from 1. Default is 1.") @@ -117,10 +117,6 @@ parser.add_argument("--log-steps", action="store_true", default=False, parser.add_argument("--output-prefix", type=str, required=True, help="Output path and prefix for the frame files " "(without extension).") -parser.add_argument("--parameters", type=str, nargs="+", - metavar="PARAM[:LABEL]", - help="Name of parameters to plot in same format " - "as for pycbc_inference_plot_posterior.") parser.add_argument('--verbose', action='store_true') parser.add_argument('--dpi', type=int, default=200, help='Set the dpi for each frame; default is 200') @@ -144,10 +140,12 @@ option_utils.add_density_option_group(parser) opts = parser.parse_args() pycbc.init_logging(opts.verbose) +if len(opts.input_file) > 1: + raise ValueError("this program can only plot one file at a time") + # Get data logging.info('Loading parameters') -fp, parameters, labels, _ = option_utils.results_from_cli(opts, - load_samples=False) +fp, parameters, labels, _ = io.results_from_cli(opts, load_samples=False) if opts.end_sample is None: opts.end_sample = fp.niterations @@ -190,31 +188,29 @@ else: raise ValueError("At least one of frame-number or frame-step must be " "provided.") -# get samples from InferenceFile +# get the samples file_parameters, trans = transforms.get_common_cbc_transforms( parameters, fp.variable_params) -samples = fp.read_samples(file_parameters, thin_start=thin_start, - thin_interval=thinint, thin_end=thin_end, - iteration=itermask, flatten=False) +samples = fp.samples_from_cli(opts, file_parameters, thin_start=thin_start, + thin_interval=thinint, thin_end=thin_end, + iteration=itermask, flatten=False) samples = transforms.apply_transforms(samples, trans) if samples.ndim > 2: - # multi-tempered samplers will return a 3 dims, so flatten + # multi-tempered samplers will return 3 dims, so flatten _, ii, jj = samples.shape samples = samples.reshape((ii, jj)) # Get z-values if opts.z_arg is not None: - logging.info("Getting model stats") - z_arg, zlbl = option_utils.parse_parameters_opt([opts.z_arg]) - z_arg = z_arg[0] - zlbl = zlbl[z_arg] - model_stats = fp.read_model_stats(thin_start=thin_start, - thin_end=thin_end, thin_interval=thinint, iteration=itermask, - flatten=False) - if model_stats.ndim > 2: - _, ii, jj = model_stats.shape - model_stats = model_stats.reshape((ii, jj)) - zvals = model_stats[z_arg] + logging.info("Getting samples for colorbar") + zsamples = fp.samples_from_cli(opts, opts.z_arg, thin_start=thin_start, + thin_interval=thinint, thin_end=thin_end, + iteration=itermask, flatten=False) + zlbl = opts.z_arg_labels[opts.z_arg] + if zsamples.ndim > 2: + _, ii, jj = zsamples.shape + zsamples = zsamples.reshape((ii, jj)) + zvals = zsamples[opts.z_arg] show_colorbar = True # Set common min and max for colorbar in all plots if opts.vmin is None: @@ -236,10 +232,16 @@ fp.close() # get injection values if desired expected_parameters = {} if opts.plot_injection_parameters: - injections = option_utils.injections_from_cli(opts) + injections = io.injections_from_cli(opts) for p in parameters: # check that all of the injections are the same - 
unique_vals = numpy.unique(injections[p]) + try: + vals = injections[p] + except NameError: + # injection doesn't have this parameter, skip + logging.warn("Could not find injection parameter {}".format(p)) + continue + unique_vals = numpy.unique(vals) if unique_vals.size != 1: raise ValueError("More than one injection found! To use " "plot-injection-parameters, there must be a single unique " @@ -247,6 +249,7 @@ if opts.plot_injection_parameters: "option to specify an expected parameter instead.") # passed: use the value for the expected expected_parameters[p] = unique_vals[0] + # get expected parameter values from command line expected_parameters.update(option_utils.expected_parameters_from_cli(opts)) expected_parameters_color = opts.expected_parameters_color diff --git a/bin/gwin_plot_posterior b/bin/gwin_plot_posterior index ce2026f..95598ab 100644 --- a/bin/gwin_plot_posterior +++ b/bin/gwin_plot_posterior @@ -37,88 +37,62 @@ from matplotlib import (patches, use) import pycbc import pycbc.version from pycbc.results import metadata -from pycbc.results.scatter_histograms import create_multidim_plot -from gwin import (__version__, option_utils) +from gwin import (__version__, option_utils, io) +from gwin.results.scatter_histograms import create_multidim_plot use('agg') # add options to command line -parser = argparse.ArgumentParser() +parser = io.ResultsArgumentParser() +# program-specific parser.add_argument("--version", action="version", version=__version__, help="Prints version information.") parser.add_argument("--output-file", type=str, required=True, help="Output plot path.") parser.add_argument("--verbose", action="store_true", default=False, help="Be verbose") -parser.add_argument("--input-file-labels", nargs="+", default=None, - help="Labels to add to plot if using more than one" - "input file.") - # add options for what plots to create option_utils.add_plot_posterior_option_group(parser) - # scatter configuration option_utils.add_scatter_option_group(parser) - # density configuration option_utils.add_density_option_group(parser) -# add standard option utils -option_utils.add_inference_results_option_group(parser) - # parse command line opts = parser.parse_args() # set logging pycbc.init_logging(opts.verbose) -# get parameters -logging.info("Loading parameters") -fp, parameters, labels, samples = option_utils.results_from_cli(opts) +# load the samples +fps, parameters, labels, samples = io.results_from_cli(opts) # typecast to list so the input files can be iterated over -fp = fp if isinstance(fp, list) else [fp] -parameters = parameters if isinstance(parameters[0], list) else [parameters] -labels = labels if isinstance(labels[0], list) else [labels] +fps = fps if isinstance(fps, list) else [fps] samples = samples if isinstance(samples, list) else [samples] -# get likelihood statistic values +# if a z-arg is specified, load samples for it if opts.z_arg is not None: - logging.info("Getting model stats") - - z_arg, f_zlbl = option_utils.parse_parameters_opt([opts.z_arg]) - z_arg = z_arg[0] - f_zlbl = f_zlbl[z_arg] - - # lists to hold z-axis values and labels for each input file + logging.info("Getting samples for colorbar") + zlbl = opts.z_arg_labels[opts.z_arg] zvals = [] - zlbl = [] - - # loop over each input file and append z-axis values and labels to lists - for f in fp: - model_stats = f.read_model_stats( - thin_start=opts.thin_start, thin_end=opts.thin_end, - thin_interval=opts.thin_interval, iteration=opts.iteration) - zvals.append(model_stats[z_arg]) - zlbl.append(f_zlbl) - 
f.close() - -# else there are no z-axis values + for fp in fps: + zsamples = fp.samples_from_cli(opts, parameters=opts.z_arg) + zvals.append(zsamples[opts.z_arg]) else: zvals = None zlbl = None - for f in fp: - f.close() -# determine if colorbar should be shown -show_colorbar = True if opts.z_arg else False +# closet the files, we don't need them anymore +for fp in fps: + fp.close() # if no plotting options selected, then the default options are based # on the number of parameters plot_options = [opts.plot_marginal, opts.plot_scatter, opts.plot_density] if not numpy.any(plot_options): - if len(parameters[0]) == 1: + if len(parameters) == 1: opts.plot_marginal = True else: opts.plot_scatter = True @@ -132,7 +106,7 @@ if not numpy.any(plot_options): mins, maxs = option_utils.plot_ranges_from_cli(opts) # add any missing parameters -for p in parameters[0]: +for p in parameters: if p not in mins: mins[p] = numpy.array([s[p].min() for s in samples]).min() if p not in maxs: @@ -141,10 +115,16 @@ for p in parameters[0]: # get injection values if desired expected_parameters = {} if opts.plot_injection_parameters: - injections = option_utils.injections_from_cli(opts) - for p in parameters[0]: + injections = io.injections_from_cli(opts) + for p in parameters: # check that all of the injections are the same - unique_vals = numpy.unique(injections[p]) + try: + vals = injections[p] + except NameError: + # injection doesn't have this parameter, skip + logging.warn("Could not find injection parameter {}".format(p)) + continue + unique_vals = numpy.unique(vals) if unique_vals.size != 1: raise ValueError("More than one injection found! To use " "plot-injection-parameters, there must be a single unique " @@ -162,7 +142,7 @@ colors = itertools.cycle(["black"] + ["C{}".format(i) for i in range(10)]) # plot each input file logging.info("Plotting") hist_colors = [] -for i, (p, l, s) in enumerate(zip(parameters, labels, samples)): +for (i, s) in enumerate(samples): # on first iteration create figure otherwise update old figure if i == 0: @@ -185,13 +165,13 @@ for i, (p, l, s) in enumerate(zip(parameters, labels, samples)): # plot fig, axis_dict = create_multidim_plot( - p, s, labels=l, fig=fig, axis_dict=axis_dict, + parameters, s, labels=labels, fig=fig, axis_dict=axis_dict, plot_marginal=opts.plot_marginal, marginal_percentiles=opts.marginal_percentiles, plot_scatter=opts.plot_scatter, zvals=zvals[i] if zvals is not None else None, - show_colorbar=show_colorbar, - cbar_label=zlbl[i] if zlbl is not None else None, + show_colorbar=opts.z_arg is not None, + cbar_label=zlbl, vmin=opts.vmin, vmax=opts.vmax, scatter_cmap=opts.scatter_cmap, plot_density=opts.plot_density, @@ -208,7 +188,7 @@ for i, (p, l, s) in enumerate(zip(parameters, labels, samples)): expected_parameters_color=opts.expected_parameters_color) # add legend to upper right for input files -if opts.input_file_labels: +if len(opts.input_file) > 1: handles = [] for color, label in zip(hist_colors, opts.input_file_labels): handles.append(patches.Patch(color=color, label=label)) @@ -218,9 +198,6 @@ if opts.input_file_labels: # set DPI fig.set_dpi(200) -# set tight layout -fig.set_tight_layout(True) - # save metadata.save_fig_with_metadata( fig, opts.output_file, {}, diff --git a/gwin/io/__init__.py b/gwin/io/__init__.py index c284bf6..7c15d3b 100644 --- a/gwin/io/__init__.py +++ b/gwin/io/__init__.py @@ -18,12 +18,21 @@ """ from __future__ import absolute_import +from __future__ import print_function import os +import sys +import argparse import shutil 
+import textwrap +import numpy import logging import h5py as _h5py +from pycbc.io.record import FieldArray, _numpy_function_lib +from pycbc import transforms as _transforms +from pycbc import waveform as _waveform +from ..option_utils import (ParseLabelArg, ParseParametersArg) from .emcee import EmceeFile from .txt import InferenceTXTFile @@ -32,6 +41,32 @@ } +def get_file_type(filename): + """ Returns I/O object to use for file. + + Parameters + ---------- + filename : str + Name of file. + + Returns + ------- + file_type : {InferenceFile, InferenceTXTFile} + The type of inference file object to use. + """ + txt_extensions = [".txt", ".dat", ".csv"] + hdf_extensions = [".hdf", ".h5", ".bkup", ".checkpoint"] + for ext in hdf_extensions: + if filename.endswith(ext): + with _h5py.File(filename, 'r') as fp: + filetype = fp.attrs['filetype'] + return filetypes[filetype] + for ext in txt_extensions: + if filename.endswith(ext): + return InferenceTXTFile + raise TypeError("Extension is not supported.") + + def loadfile(path, mode=None, filetype=None, **kwargs): """Loads the given file using the appropriate InferenceFile class. @@ -60,13 +95,15 @@ def loadfile(path, mode=None, filetype=None, **kwargs): if filetype is None: # try to read the file to get its filetype try: - with _h5py.File(path, 'r') as fp: - filetype = fp.attrs['filetype'] + fileclass = get_file_type(path) except IOError: # file doesn't exist, filetype must be provided raise IOError("The file appears not to exist. In this case, " "filetype must be provided.") - return filetypes[filetype](path, mode=mode, **kwargs) + else: + fileclass = filetypes[filetype] + return fileclass(path, mode=mode, **kwargs) + # # ============================================================================= @@ -204,3 +241,399 @@ def validate_checkpoint_files(checkpoint_file, backup_file): shutil.copy(backup_file, checkpoint_file) checkpoint_valid = True return checkpoint_valid + + +# +# ============================================================================= +# +# Command-line Utilities +# +# ============================================================================= +# +def get_common_parameters(input_files, collection=None): + """Gets a list of variable params that are common across all input files. + + If no common parameters are found, a ``ValueError`` is raised. + + Parameters + ---------- + input_files : list of str + List of input files to load. + collection : str, optional + What group of parameters to load. Can be the name of a list of + parameters stored in the files' attrs (e.g., "variable_params"), or + "all". If "all", will load all of the parameters in the files' + samples group. Default is to load all. + + Returns + ------- + list : + List of the parameter names. + """ + if collection is None: + collection = "all" + parameters = [] + for fn in input_files: + fp = loadfile(fn, 'r') + if collection == 'all': + ps = fp[fp.samples_group].keys() + else: + ps = fp.attrs[collection] + parameters.append(set(ps)) + fp.close() + parameters = list(set.intersection(*parameters)) + if parameters == []: + raise ValueError("no common parameters found for collection {} in " + "files {}".format(collection, ', '.join(input_files))) + return parameters + + +class NoInputFileError(Exception): + """Raised in custom argparse Actions by arguments needing input-files when + no file(s) were provided.""" + pass + + +class PrintFileParams(argparse.Action): + """Argparse action that will load input files and print possible parameters + to screen. 
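The new I/O entry points above give scripts a sampler-agnostic way to open results files: ``get_file_type`` and ``loadfile`` dispatch on the ``filetype`` attribute that every results file now carries, so calling code does not need to know which sampler produced a file. A minimal usage sketch (``results.hdf`` is a hypothetical existing emcee results file; the write-mode call is illustrative only):

from gwin import io
from gwin.io import EmceeFile

# existing file: the class is picked from the file's 'filetype' attr
fp = io.loadfile('results.hdf', 'r')
print(fp.variable_params)
fp.close()

# new file: there is nothing on disk to inspect, so the filetype must be given
fp = io.loadfile('new.hdf', 'w', filetype=EmceeFile.name)
fp.close()

# helper used by the CLI machinery below to find parameters common to all files
params = io.get_common_parameters(['results.hdf'], collection='variable_params')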
Once this is done, the program is forced to exit immediately. + + The behvior is similar to --help, except that the input-file is read. + + .. note:: + The ``input_file`` attribute must be set in the parser namespace before + this action is called. Otherwise, a ``NoInputFileError`` is raised. + """ + def __init__(self, skip_args=None, nargs=0, **kwargs): + if nargs != 0: + raise ValueError("nargs for this action must be 0") + super(PrintFileParams, self).__init__(nargs=nargs, **kwargs) + self.skip_args = skip_args + + def __call__(self, parser, namespace, values, option_string=None): + # get the input file(s) + input_files = namespace.input_file + if input_files is None: + # see if we should raise an error + try: + raise_err = not parser.no_input_file_err + except AttributeError: + raise_err = True + if raise_err: + raise NoInputFileError("must provide at least one input file") + else: + # just return to stop further processing + return + filesbytype = {} + fileparsers = {} + for fn in input_files: + fp = loadfile(fn, 'r') + try: + filesbytype[fp.name].append(fn) + except KeyError: + filesbytype[fp.name] = [fn] + # get any extra options + fileparsers[fp.name], _ = fp.extra_args_parser( + skip_args=self.skip_args, add_help=False) + fp.close() + # now print information about the intersection of all parameters + parameters = get_common_parameters(input_files, collection='all') + print("\n"+textwrap.fill("Parameters available with this (these) " + "input file(s):"), end="\n\n") + print(textwrap.fill(' '.join(sorted(parameters))), + end="\n\n") + # information about the pycbc functions + pfuncs = sorted(FieldArray.functionlib.fget(FieldArray).keys()) + print(textwrap.fill("Available pycbc functions (see " + "http://pycbc.org/pycbc/latest/html for " + "more details):"), end="\n\n") + print(textwrap.fill(', '.join(pfuncs)), end="\n\n") + # numpy funcs + npfuncs = sorted([name for (name, obj) in _numpy_function_lib.items() + if isinstance(obj, numpy.ufunc)]) + print(textwrap.fill("Available numpy functions:"), + end="\n\n") + print(textwrap.fill(', '.join(npfuncs)), end="\n\n") + # misc + consts = "e euler_gamma inf nan pi" + print(textwrap.fill("Recognized constants:"), + end="\n\n") + print(consts, end="\n\n") + print(textwrap.fill("Python arthimetic (+ - * / // ** %), " + "binary (&, |, etc.), and comparison (>, <, >=, " + "etc.) operators may also be used."), end="\n\n") + # print out the extra arguments that may be used + outstr = textwrap.fill("The following are additional command-line " + "options that may be provided, along with the " + "input files that understand them:") + print("\n"+outstr, end="\n\n") + for ftype, fparser in fileparsers.items(): + fnames = ', '.join(filesbytype[ftype]) + if fparser is None: + outstr = textwrap.fill( + "File(s) {} use no additional options.".format(fnames)) + print(outstr, end="\n\n") + else: + fparser.usage = fnames + fparser.print_help() + parser.exit(0) + + +class ResultsArgumentParser(argparse.ArgumentParser): + """Wraps argument parser, and preloads arguments needed for loading samples + from a file. + + This parser class should be used by any program that wishes to use the + standard arguments for loading samples. It provides functionality to parse + file specific options. These file-specific arguments are not included in + the standard ``--help`` (since they depend on what input files are given), + but can be seen by running ``--file-help/-H``. 
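The listings this action prints are built by introspection rather than a hard-coded table, so the same lookups can be run interactively when writing or debugging a new file-type class. A short sketch of the two lookups the action relies on:

import numpy
from pycbc.io.record import FieldArray, _numpy_function_lib

# parameter functions FieldArray understands (usable in --parameters expressions)
pfuncs = sorted(FieldArray.functionlib.fget(FieldArray).keys())

# numpy ufuncs that may also appear in --parameters expressions
npfuncs = sorted(name for (name, obj) in _numpy_function_lib.items()
                 if isinstance(obj, numpy.ufunc))

print(', '.join(pfuncs))
print(', '.join(npfuncs))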
The ``--file-help`` will + also print off information about what parameters may be used given the + input files. + + As with the standard ``ArgumentParser``, running this class's + ``parse_args`` will result in an error if arguments are provided that are + not recognized by the parser, nor by any of the file-specific arguments. + For example, ``parse_args`` would work on the command + ``--input-file results.hdf --walker 0`` if + ``results.hdf`` was created by a sampler that recognizes a ``--walker`` + argument, but would raise an error if ``results.hdf`` was created by a + sampler that does not recognize a ``--walker`` argument. The extra + arguments that are recognized are determined by the sampler IO class's + ``extra_args_parser``. + + Some arguments may be excluded from the parser using the ``skip_args`` + optional parameter. + + Parameters + ---------- + skip_args : list of str, optional + Do not add the given arguments to the parser. Arguments should be + specified as the option string minus the leading '--'; e.g., + ``skip_args=['thin-start']`` would cause the ``thin-start`` argument + to not be included. May also specify sampler-specific arguments. Note + that ``input-file``, ``file-help``, and ``parameters`` are always + added. + \**kwargs : + All other keyword arguments are passed to ``argparse.ArgumentParser``. + """ + def __init__(self, skip_args=None, **kwargs): + super(ResultsArgumentParser, self).__init__(**kwargs) + # add attribute to communicate to arguments what to do when there is + # no input files + self.no_input_file_err = False + if skip_args is None: + skip_args = [] + self.skip_args = skip_args + # add the results option grup + self.add_results_option_group() + + @property + def actions(self): + """Exposes the actions this parser can do as a dictionary. + + The dictionary maps the ``dest`` to actions. + """ + return {act.dest: act for act in self._actions} + + def _unset_required(self): + """Convenience function to turn off required arguments for first parse. + """ + self._required_args = [act for act in self._actions if act.required] + for act in self._required_args: + act.required = False + + def _reset_required(self): + """Convenience function to turn required arguments back on. + """ + for act in self._required_args: + act.required = True + + def parse_known_args(self, args=None, namespace=None): + """Parse args method to handle input-file dependent arguments.""" + # run parse args once to make sure the name space is populated with + # the input files. 
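The double call to the parent ``parse_known_args`` that follows is the key trick: a first, permissive pass (requirements relaxed, no input-file errors raised) collects ``--input-file`` so the files can be opened, then a strict second pass re-parses with the requirements restored and the file-derived options available. A stripped-down sketch of the same two-pass idea, independent of the gwin class:

import argparse

class TwoPassParser(argparse.ArgumentParser):
    """Sketch only: permissive first pass, strict second pass."""
    def parse_known_args(self, args=None, namespace=None):
        required = [act for act in self._actions if act.required]
        for act in required:   # relax requirements for the first pass
            act.required = False
        opts, extra = super(TwoPassParser, self).parse_known_args(args, namespace)
        for act in required:   # restore them for the second pass
            act.required = True
        # ...here opts.input_file could be inspected to add file-specific args...
        return super(TwoPassParser, self).parse_known_args(args, opts)

parser = TwoPassParser()
parser.add_argument('--input-file', nargs='+', required=True)
opts, extra = parser.parse_known_args(['--input-file', 'results.hdf', '--walker', '0'])
# opts.input_file == ['results.hdf']; extra == ['--walker', '0']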
We'll turn off raising NoInputFileErrors on this + # pass + self.no_input_file_err = True + self._unset_required() + opts, extra_opts = super(ResultsArgumentParser, self).parse_known_args( + args, namespace) + # now do it again + self.no_input_file_err = False + self._reset_required() + opts, extra_opts = super(ResultsArgumentParser, self).parse_known_args( + args, opts) + # populate the parameters option if it wasn't specified + if opts.parameters is None: + parameters = get_common_parameters(opts.input_file, + collection='variable_params') + # now call parse parameters action to populate the namespace + self.actions['parameters'](self, opts, parameters) + # parse the sampler-specific options and check for any unknowns + unknown = [] + for fn in opts.input_file: + fp = loadfile(fn, 'r') + sampler_parser, _ = fp.extra_args_parser(skip_args=self.skip_args) + if sampler_parser is not None: + opts, still_unknown = sampler_parser.parse_known_args( + extra_opts, namespace=opts) + unknown.append(set(still_unknown)) + # the intersection of the unknowns are options not understood by + # any of the files + unknown = set.intersection(*unknown) + return opts, list(unknown) + + def add_results_option_group(self): + """Adds the options used to call gwin.io.results_from_cli function + to the parser. + + These are options releated to loading the results from a run of + gwin, for purposes of plotting and/or creating tables. + + Any argument strings included in the ``skip_args`` attribute will not + be added. + """ + results_reading_group = self.add_argument_group( + title="Arguments for loading results", + description="Additional, file-specific arguments may also be " + "provided, depending on what input-files are given. See " + "--file-help for details.") + results_reading_group.add_argument( + "--input-file", type=str, required=True, nargs="+", + action=ParseLabelArg, metavar='FILE[:LABEL]', + help="Path to input HDF file(s). A label may be specified for " + "each input file to use for plots when multiple files are " + "specified.") + # advanced help + results_reading_group.add_argument( + "-H", "--file-help", + action=PrintFileParams, skip_args=self.skip_args, + help="Based on the provided input-file(s), print all available " + "parameters that may be retrieved and all possible functions " + "on those parameters. Also print available additional " + "arguments that may be passed. This option is like an " + "advanced --help: if run, the program will just print the " + "information to screen, then exit.") + results_reading_group.add_argument( + "--parameters", type=str, nargs="+", metavar="PARAM[:LABEL]", + action=ParseParametersArg, + help="Name of parameters to load. If none provided will load all " + "of the model params in the input-file. If provided, the " + "parameters can be any of the model params or posterior " + "stats (loglikelihood, logprior, etc.) in the input file(s), " + "derived parameters from them, or any function of them. If " + "multiple files are provided, any parameter common to all " + "files may be used. Syntax for functions is python; any math " + "functions in the numpy libary may be used. Can optionally " + "also specify a LABEL for each parameter. If no LABEL is " + "provided, PARAM will used as the LABEL. If LABEL is the " + "same as a parameter in pycbc.waveform.parameters, the label " + "property of that parameter will be used (e.g., if LABEL " + "were 'mchirp' then {} would be used). 
To see all possible " + "parameters that may be used with the given input file(s), " + "as well as all avaiable functions, run --file-help, along " + "with one or more input files.".format( + _waveform.parameters.mchirp.label)) + return results_reading_group + + +def results_from_cli(opts, load_samples=True): + """Loads an inference result file along with any labels associated with it + from the command line options. + + Parameters + ---------- + opts : ArgumentParser options + The options from the command line. + load_samples : bool, optional + Load the samples from the file. + + Returns + ------- + fp_all : (list of) BaseInferenceFile type + The result file as an hdf file. If more than one input file, + then it returns a list. + parameters : list of str + List of the parameters to use, parsed from the parameters option. + labels : dict + Dictionary of labels to associate with the parameters. + samples_all : (list of) FieldArray(s) or None + If load_samples, the samples as a FieldArray; otherwise, None. + If more than one input file, then it returns a list. + """ + + # lists for files and samples from all input files + fp_all = [] + samples_all = [] + + input_files = opts.input_file + if isinstance(input_files, str): + input_files = [input_files] + + # loop over all input files + for input_file in input_files: + logging.info("Reading input file %s", input_file) + + # read input file + fp = loadfile(input_file, "r") + + # load the samples + if load_samples: + logging.info("Loading samples") + + # check if need extra parameters for a non-sampling parameter + file_parameters, ts = _transforms.get_common_cbc_transforms( + opts.parameters, fp.variable_params) + + # read samples from file + samples = fp.samples_from_cli(opts, parameters=file_parameters) + + logging.info("Using {} samples".format(samples.size)) + + # add parameters not included in file + samples = _transforms.apply_transforms(samples, ts) + + # else do not read samples + else: + samples = None + + # add results to lists from all input files + if len(input_files) > 1: + fp_all.append(fp) + samples_all.append(samples) + + # else only one input file then do not return lists + else: + fp_all = fp + samples_all = samples + + return fp_all, opts.parameters, opts.parameters_labels, samples_all + + +def injections_from_cli(opts): + """Gets injection parameters from the inference file(s). + + Parameters + ---------- + opts : argparser + Argparser object that has the command-line objects to parse. + + Returns + ------- + FieldArray + Array of the injection parameters from all of the input files given + by ``opts.input_file``. + """ + input_files = opts.input_file + if isinstance(input_files, str): + input_files = [input_files] + injections = None + # loop over all input files getting the injection files + for input_file in input_files: + fp = loadfile(input_file, 'r') + these_injs = fp.read_injections() + if injections is None: + injections = these_injs + else: + injections = injections.append(these_injs) + return injections diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index 8a1665c..cd66929 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -40,7 +40,7 @@ from pycbc.io import FieldArray from pycbc.types import FrequencySeries from pycbc.waveform import parameters as wfparams - +from pycbc.inject import InjectionSet class BaseInferenceFile(h5py.File): """Base class for all inference hdf files. 
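Taken together, ``ResultsArgumentParser``, ``results_from_cli``, and ``injections_from_cli`` replace the old option_utils helpers for post-processing executables. A minimal skeleton of how a results-reading program can use them (a sketch, not one of the shipped executables; the ``--verbose`` flag and the ``skip_args`` choice are illustrative):

#!/usr/bin/env python
"""Sketch of a results-reading executable."""
import pycbc
from gwin import io

parser = io.ResultsArgumentParser(skip_args=['walkers'])
parser.add_argument("--verbose", action="store_true", default=False)
opts = parser.parse_args()
pycbc.init_logging(opts.verbose)

# fps and samples are lists when more than one --input-file is given
fps, parameters, labels, samples = io.results_from_cli(opts)
if not isinstance(fps, list):
    fps, samples = [fps], [samples]
for fp, s in zip(fps, samples):
    print("{}: {} samples".format(fp.filename, s.size))
    fp.close()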
@@ -212,6 +212,114 @@ def write_posterior(self, posterior_file, **kwargs): """ pass + @abstractmethod + def samples_from_cli(self, opts, extra_opts=None, parameters=None, + **kwargs): + """This should load samples using the given command-line options. + """ + pass + + @staticmethod + def extra_args_parser(parser=None, skip_args=None, **kwargs): + """Provides a parser that can be used to parse sampler-specific command + line options for loading samples. + + This is optional. Inheriting classes may override this if they want to + implement their own options. + + Parameters + ---------- + parser : argparse.ArgumentParser, optional + Instead of creating a parser, add arguments to the given one. If + none provided, will create one. + skip_args : list, optional + Don't include the given options. Options should be given as the + option string, minus the '--'. For example, + ``skip_args=['iteration']`` would cause the ``--iteration`` + argument not to be included. + \**kwargs : + All other keyword arguments are passed to the parser that is + created. + + Returns + ------- + parser : argparse.ArgumentParser or None + If this class adds extra arguments, an argument parser with the + extra arguments. Otherwise, will just return whatever was passed + for the ``parser`` argument (default is None). + actions : list of argparse.Action + List of the actions that were added. + """ + return parser, [] + + @staticmethod + def _get_optional_args(args, opts, err_on_missing=False, **kwargs): + """Convenience function to retrieve arguments from an argparse + namespace. + + Parameters + ---------- + args : list of str + List of arguments to retreive. + opts : argparse.namespace + Namespace to retreive arguments for. + err_on_missing : bool, optional + If an argument is not found in the namespace, raise an + AttributeError. Otherwise, just pass. Default is False. + \**kwargs : + All other keyword arguments are added to the return dictionary. + Any keyword argument that is the same as an argument in ``args`` + will override what was retrieved from ``opts``. + + Returns + ------- + dict : + Dictionary mapping arguments to values retrieved from ``opts``. If + keyword arguments were provided, these will also be included in the + dictionary. + """ + parsed = {} + for arg in args: + try: + parsed[arg] = getattr(opts, arg) + except AttributeError as e: + if err_on_missing: + raise AttributeError(e) + else: + continue + parsed.update(kwargs) + return parsed + + def samples_from_cli(self, opts, parameters=None, **kwargs): + """Reads samples from the given command-line options. + + Parameters + ---------- + opts : argparse Namespace + The options with the settings to use for loading samples (the sort + of thing returned by ``ArgumentParser().parse_args``). + parameters : (list of) str, optional + A list of the parameters to load. If none provided, will try to + get the parameters to load from ``opts.parameters``. + \**kwargs : + All other keyword arguments are passed to ``read_samples``. These + will override any options with the same name. + + Returns + ------- + FieldArray : + Array of the loaded samples. 
+ """ + if parameters is None and opts.parameters is None: + parameters = self.variable_args + elif parameters is None: + parameters = opts.parameters + # parse optional arguments + _, extra_actions = self.extra_args_parser() + extra_args = [act.dest for act in extra_actions] + kwargs = self._get_optional_args(extra_args, opts, **kwargs) + return self.read_samples(parameters, **kwargs) + @property def static_params(self): """Returns a dictionary of the static_params. The keys are the argument @@ -415,7 +523,6 @@ def write_psd(self, psds, group=None): if group is None: group = subgroup else: - print group, subgroup group = '/'.join([group, subgroup]) for ifo in psds: self[group.format(ifo=ifo)] = psds[ifo] @@ -437,6 +544,20 @@ def write_injections(self, injection_file): except IOError: logging.warn("Could not read %s as an HDF file", injection_file) + def read_injections(self): + """Gets injection parameters. + + Returns + ------- + FieldArray + Array of the injection parameters. + """ + injset = InjectionSet(self.filename, hdf_group=self.injections_group) + injections = injset.table.view(FieldArray) + # close the new open filehandler to self + injset._injhandler.filehandler.close() + return injections + def write_command_line(self): """Writes command line to attributes. @@ -633,27 +754,27 @@ def copy(self, other, ignore=None, parameters=None, parameter_names=None, other.attrs['thin_end'] = None return other + @classmethod + def write_kwargs_to_attrs(cls, attrs, **kwargs): + """Writes the given keywords to the given ``attrs``. -def write_kwargs_to_hdf_attrs(attrs, **kwargs): - """Writes the given keywords to the given ``attrs``. - - If any keyword argument points to a dict, the keyword will point to a - list of the dict's keys. Each key is then written to the attrs with its - corresponding value. + If any keyword argument points to a dict, the keyword will point to a + list of the dict's keys. Each key is then written to the attrs with its + corresponding value. - Parameters - ---------- - attrs : an HDF attrs - Can be either the ``attrs`` of the hdf file, or any group in a file. - \**kwargs : - The keywords to write. - """ - for arg, val in kwargs.items(): - if val is None: - val = str(None) - if isinstance(val, dict): - attrs[arg] = val.keys() - # just call self again with the dict as kwargs - write_kwargs_to_hdf_attrs(attrs, **val) - else: - attrs[arg] = val + Parameters + ---------- + attrs : an HDF attrs + The ``attrs`` of an hdf file or a group in an hdf file. + \**kwargs : + The keywords to write. + """ + for arg, val in kwargs.items(): + if val is None: + val = str(None) + if isinstance(val, dict): + attrs[arg] = val.keys() + # just call self again with the dict as kwargs + cls.write_kwargs_to_attrs(attrs, **val) + else: + attrs[arg] = val diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index f77247f..0d60935 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -29,7 +29,7 @@ from abc import (ABCMeta, abstractmethod) import numpy -from .base_hdf import write_kwargs_to_hdf_attrs +import argparse class MCMCIO(object): @@ -148,6 +148,77 @@ def read_raw_samples(self, fields, arrays[name] = arr return arrays + @staticmethod + def extra_args_parser(parser=None, skip_args=None, **kwargs): + """Create a parser to parse sampler-specific arguments for loading + samples. + + Parameters + ---------- + parser : argparse.ArgumentParser, optional + Instead of creating a parser, add arguments to the given one. If + none provided, will create one. 
+ skip_args : list, optional + Don't parse the given options. Options should be given as the + option string, minus the '--'. For example, + ``skip_args=['iteration']`` would cause the ``--iteration`` + argument not to be included. + \**kwargs : + All other keyword arguments are passed to the parser that is + created. + + Returns + ------- + parser : argparse.ArgumentParser + An argument parser with th extra arguments added. + actions : list of argparse.Action + A list of the actions that were added. + """ + if parser is None: + parser = argparse.ArgumentParser(**kwargs) + elif kwargs: + raise ValueError("No other keyword arguments should be provded if " + "a parser is provided.") + if skip_args is None: + skip_args = [] + actions = [] + if 'thin-start' not in skip_args: + act = parser.add_argument( + "--thin-start", type=int, default=None, + help="Sample number to start collecting samples to plot. If " + "none provided, will use the input file's `thin_start` " + "attribute.") + actions.append(act) + if 'thin-interval' not in skip_args: + act = parser.add_argument( + "--thin-interval", type=int, default=None, + help="Interval to use for thinning samples. If none provided, " + "will use the input file's `thin_interval` attribute.") + actions.append(act) + if 'thin-end' not in skip_args: + act = parser.add_argument( + "--thin-end", type=int, default=None, + help="Sample number to stop collecting samples to plot. If " + "none provided, will use the input file's `thin_end` " + "attribute.") + actions.append(act) + if 'iteration' not in skip_args: + act = parser.add_argument( + "--iteration", type=int, default=None, + help="Only retrieve the given iteration. To load " + "the last n-th sampe use -n, e.g., -1 will " + "load the last iteration. This overrides " + "the thin-start/interval/end options.") + actions.append(act) + if 'walkers' not in skip_args: + act = parser.add_argument( + "--walkers", type=int, nargs="+", default=None, + help="Only retrieve samples from the listed " + "walkers. 
Default is to retrieve from all " + "walkers.") + actions.append(act) + return parser, actions + def write_resume_point(self): """Keeps a list of the number of iterations that were in a file when a run was resumed from a checkpoint.""" @@ -248,4 +319,4 @@ def write_burn_in(self, burn_in): except KeyError: group.create_group(key) attrs = group[key].attrs - write_kwargs_to_hdf_attrs(attrs, **burn_in.burn_in_data[tst]) + self.write_kwargs_to_attrs(attrs, **burn_in.burn_in_data[tst]) diff --git a/gwin/models/base.py b/gwin/models/base.py index e15dc57..a734ee5 100644 --- a/gwin/models/base.py +++ b/gwin/models/base.py @@ -34,7 +34,6 @@ from pycbc.io import FieldArray from pycbc.workflow import ConfigParser -from gwin.io.base_hdf import write_kwargs_to_hdf_attrs # # ============================================================================= @@ -755,4 +754,4 @@ def write_metadata(self, fp): fp.attrs['model'] = self.name fp.attrs['variable_params'] = list(self.variable_params) fp.attrs['sampling_params'] = list(self.sampling_params) - write_kwargs_to_hdf_attrs(fp.attrs, static_params=self.static_params) + fp.write_kwargs_to_attrs(fp.attrs, static_params=self.static_params) diff --git a/gwin/option_utils.py b/gwin/option_utils.py index 47ff79c..813cb37 100644 --- a/gwin/option_utils.py +++ b/gwin/option_utils.py @@ -19,6 +19,7 @@ import logging import shutil +import argparse from pycbc import (conversions, inject, transforms) from pycbc.distributions import (bounded, constraints) @@ -29,10 +30,7 @@ from pycbc.strain import from_cli_multi_ifos as strain_from_cli_multi_ifos from pycbc.strain import (gates_from_cli, psd_gates_from_cli, apply_gates_to_td, apply_gates_to_fd) - -from gwin import (burn_in, models, sampler) -from gwin.io.hdf import InferenceFile, check_integrity -from gwin.io.txt import InferenceTXTFile +from pycbc import waveform # ----------------------------------------------------------------------------- @@ -67,124 +65,6 @@ def config_parser_from_cli(opts): return WorkflowConfigParser(opts.config_files, overrides) -# ----------------------------------------------------------------------------- -# -# Utilities for setting up a sampler -# -# ----------------------------------------------------------------------------- - -def add_sampler_option_group(parser): - """Adds the options needed to set up an inference sampler. - - Parameters - ---------- - parser : object - ArgumentParser instance. - """ - sampler_group = parser.add_argument_group( - "Arguments for setting up a sampler") - - # required options - sampler_group.add_argument( - "--sampler", required=True, choices=sampler.samplers.keys(), - help="Sampler class to use for finding posterior.") - sampler_group.add_argument( - "--niterations", type=int, - help="Number of iterations to perform. If 'use_sampler' is given to " - "burn-in-function, this will be counted after the sampler's burn " - "function has run. Otherwise, this is the total number of " - "iterations, including any burn in.") - sampler_group.add_argument( - "--n-independent-samples", type=int, - help="Run the sampler until the specified number of " - "independent samples is obtained, at minimum. Requires " - "checkpoint-interval. At each checkpoint the burn-in iteration " - "and ACL is updated. The number of independent samples is the " - "number of samples across all walkers starting at the " - "burn-in-iteration and skipping every `ACL`th iteration. 
" - "Either this or niteration should be specified (but not both).") - # sampler-specific options - sampler_group.add_argument( - "--nwalkers", type=int, default=None, - help="Number of walkers to use in sampler. Required for MCMC " - "samplers.") - sampler_group.add_argument( - "--ntemps", type=int, default=None, - help="Number of temperatures to use in sampler. Required for parallel " - "tempered MCMC samplers.") - sampler_group.add_argument( - "--burn-in-function", default=None, nargs='+', - choices=burn_in.burn_in_functions.keys(), - help="Use the given function to determine when chains are burned in. " - "If none provided, no burn in will be estimated. " - "If multiple functions are provided, will use the maximum " - "iteration from all functions.") - sampler_group.add_argument( - "--min-burn-in", type=int, default=0, - help="Force the burn-in to be at least the given number of " - "iterations.") - sampler_group.add_argument( - "--update-interval", type=int, default=None, - help="If using kombine, specify the number of steps to take between " - "proposal updates. Note: for purposes of updating, kombine " - "counts iterations since the last checkpoint. This interval " - "should therefore be less than the checkpoint interval, else " - "no updates will occur. To ensure that updates happen at equal " - "intervals, make checkpoint-interval a multiple of " - "update-interval.") - sampler_group.add_argument( - "--nprocesses", type=int, default=None, - help="Number of processes to use. If not given then use maximum.") - sampler_group.add_argument( - "--use-mpi", action='store_true', default=False, - help="Use MPI to parallelize the sampler") - sampler_group.add_argument( - "--logpost-function", default="logposterior", - help="Which attribute of the model to use for the logposterior. " - "The default is logposterior. For example, if using the " - "gaussian_noise model, you may wish to set this to logplr, since " - "the logposterior includes a large constant contribution from " - "log noise likelihood.") - - return sampler_group - - -def sampler_from_cli(opts, model, pool=None): - """Parses the given command-line options to set up a sampler. - - Parameters - ---------- - opts : object - ArgumentParser options. - model : model - The model to use with the sampler. - - Returns - ------- - gwin.sampler - A sampler initialized based on the given arguments. - """ - # create a wrapper for the model - model = models.CallModel(model, opts.logpost_function) - - # Used to help paralleize over multiple cores / MPI - if opts.nprocesses > 1: - models._global_instance = model - model_call = models._call_global_model - else: - model_call = None - - sclass = sampler.samplers[opts.sampler] - - pool = choose_pool(mpi=opts.use_mpi, processes=opts.nprocesses) - - if pool is not None: - pool.count = opts.nprocesses - - return sclass.from_cli(opts, model, - pool=pool, model_call=model_call) - - # ----------------------------------------------------------------------------- # # Utilities for loading data @@ -299,226 +179,112 @@ def data_from_cli(opts): # ----------------------------------------------------------------------------- # -# Utilities for loading and plotting results +# Utilities for plotting results # # ----------------------------------------------------------------------------- -def add_inference_results_option_group(parser, include_parameters_group=True): - """Adds the options used to call gwin.results_from_cli function - to an argument parser. 
These are options releated to loading the results - from a run of pycbc_inference, for purposes of plotting and/or creating - tables. - Parameters - ---------- - parser : object - ArgumentParser instance. - include_parameters_group : bool - If true then include `--parameters-group` option. - """ +class ParseLabelArg(argparse.Action): + """Argparse action that will parse arguments that can accept labels. - results_reading_group = parser.add_argument_group( - "Arguments for loading inference results") - - # required options - results_reading_group.add_argument( - "--input-file", type=str, required=True, nargs="+", - help="Path to input HDF files.") - results_reading_group.add_argument( - "--parameters", type=str, nargs="+", metavar="PARAM[:LABEL]", - help="Name of parameters to load. If none provided will load all of " - "the model params in the input-file. If provided, the " - "parameters can be any of the model params or posteriors in " - "the input file, derived parameters from them, or any function " - "of them. Syntax for functions is python; any math functions in " - "the numpy libary may be used. Can optionally also specify a " - "label for each parameter. If no label is provided, will try to " - "retrieve a label from the input-file. If no label can be found " - "in the input-file, will try to get a label from " - "pycbc.waveform.parameters. If no label can be found in either " - "place, will just use the parameter.") - - # optionals - results_reading_group.add_argument( - "--thin-start", type=int, default=None, - help="Sample number to start collecting samples to plot. If none " - "provided, will start at the end of the burn-in.") - results_reading_group.add_argument( - "--thin-interval", type=int, default=None, - help="Interval to use for thinning samples. If none provided, will " - "use the auto-correlation length found in the file.") - results_reading_group.add_argument( - "--thin-end", type=int, default=None, - help="Sample number to stop collecting samples to plot. If none " - "provided, will stop at the last sample from the sampler.") - results_reading_group.add_argument( - "--iteration", type=int, default=None, - help="Only retrieve the given iteration. To load the last n-th sampe " - "use -n, e.g., -1 will load the last iteration. This overrides " - "the thin-start/interval/end options.") - if include_parameters_group: - results_reading_group.add_argument( - "--parameters-group", type=str, - default=InferenceFile.samples_group, - choices=[InferenceFile.samples_group, InferenceFile.stats_group], - help="Group in the HDF InferenceFile to look for parameters.") - - return results_reading_group - - -def parse_parameters_opt(parameters): - """Parses the --parameters opt in the results_reading_group. + This assumes that the values set on the command line for its assigned + argument are strings formatted like ``PARAM[:LABEL]``. When the arguments + are parsed, the ``LABEL`` bit is stripped off and added to a dictionary + mapping ``PARAM -> LABEL``. This dictionary is stored to the parsed + namespace called ``{dest}_labels``, where ``{dest}`` is the argument's + ``dest`` setting (by default, this is the same as the option string). + Likewise, the argument's ``dest`` in the parsed namespace is updated so + that it is just ``PARAM``. - Parameters - ---------- - parameters : list of str or None - The parameters to parse. - Returns - ------- - parameters : list of str - The parameters. - labels : dict - A dictionary mapping parameters for which labels were provide to those - labels. 
- """ - if parameters is None: - return None, {} - # load the labels - labels = {} - for ii, p in enumerate(parameters): - if len(p.split(':')) == 2: - p, label = p.split(':') - parameters[ii] = p - labels[p] = label - return parameters, labels - - -def results_from_cli(opts, load_samples=True, **kwargs): - """ - Loads an inference result file along with any labels associated with it - from the command line options. + If no ``LABEL`` is provided, then ``PARAM`` will be used for ``LABEL``. - Parameters - ---------- - opts : ArgumentParser options - The options from the command line. - load_samples : {True, bool} - Load samples from the results file using the parameters, thin_start, - and thin_interval specified in the options. The samples are returned - as a FieldArray instance. - - **kwargs : - All other keyword arguments are passed to the InferenceFile's - read_samples function. - - Returns - ------- - fp_all : pycbc.io.InferenceFile - The result file as an InferenceFile. If more than one input file, - then it returns a list. - parameters_all : list - List of the parameters to use, parsed from the parameters option. - If more than one input file, then it returns a list. - labels_all : list - List of labels to associate with the parameters. If more than one - input file, then it returns a list. - samples_all : {None, FieldArray} - If load_samples, the samples as a FieldArray; otherwise, None. - If more than one input file, then it returns a list. + This action can work on arguments that have ``nargs != 0`` and ``type`` set + to ``str``. """ - - # lists for files and samples from all input files - fp_all = [] - parameters_all = [] - labels_all = [] - samples_all = [] - - input_files = opts.input_file - if isinstance(input_files, str): - input_files = [input_files] - - # loop over all input files - for input_file in input_files: - logging.info("Reading input file %s", input_file) - - # read input file - fp = InferenceFile(input_file, "r") - - # get parameters and a dict of labels for each parameter - parameters = (fp.variable_params if opts.parameters is None - else opts.parameters) - parameters, ldict = parse_parameters_opt(parameters) - - # convert labels dict to list - labels = [] - for p in parameters: - try: - label = ldict[p] - except KeyError: - label = fp.read_label(p) - labels.append(label) - - # load the samples - if load_samples: - logging.info("Loading samples") - - # check if need extra parameters for a non-sampling parameter - file_parameters, ts = transforms.get_common_cbc_transforms( - parameters, fp.variable_params) - - # read samples from file - samples = fp.read_samples( - file_parameters, thin_start=opts.thin_start, - thin_interval=opts.thin_interval, thin_end=opts.thin_end, - iteration=opts.iteration, - samples_group=opts.parameters_group, **kwargs) - - # add parameters not included in file - samples = transforms.apply_transforms(samples, ts) - - # else do not read samples - else: - samples = None - - # add results to lists from all input files - if len(input_files) > 1: - fp_all.append(fp) - parameters_all.append(parameters) - labels_all.append(labels) - samples_all.append(samples) - - # else only one input file then do not return lists - else: - fp_all = fp - parameters_all = parameters - labels_all = labels - samples_all = samples - - return fp_all, parameters_all, labels_all, samples_all - - -def get_file_type(filename): - """ Returns I/O object to use for file. - - Parameters - ---------- - filename : str - Name of file. 
- - Returns - ------- - file_type : {InferenceFile, InferenceTXTFile} - The type of inference file object to use. + def __init__(self, type=str, nargs=None, **kwargs): + # check that type is string + if type != str: + raise ValueError("the type for this action must be a string") + if nargs == 0: + raise ValueError("nargs must not be 0 for this action") + super(ParseLabelArg, self).__init__(type=type, nargs=nargs, + **kwargs) + + def __call__(self, parser, namespace, values, option_string=None): + singlearg = isinstance(values, (str, unicode)) + if singlearg: + values = [values] + params = [] + labels = {} + for param in values: + psplit = param.split(':') + if len(psplit) == 2: + param, label = psplit + else: + label = param + labels[param] = label + params.append(param) + # update the namespace + if singlearg: + params = params[0] + setattr(namespace, self.dest, params) + setattr(namespace, '{}_labels'.format(self.dest), labels) + + +class ParseParametersArg(ParseLabelArg): + """Argparse action that will parse parameters and labels from an opton. + + Does the same as ``ParseLabelArg``, with the additional functionality that + if ``LABEL`` is a known parameter in ``pycbc.waveform.parameters``, then + the label attribute there will be used in the labels dictionary. + Otherwise, ``LABEL`` will be used. + + Examples + -------- + Create a parser and add two arguments that use this action (note that the + first argument accepts multiple inputs while the second only accepts a + single input): + + >>> import argparse + >>> parser = argparse.ArgumentParser() + >>> parser.add_argument('--parameters', type=str, nargs="+", + action=ParseParametersArg) + >>> parser.add_argument('--z-arg', type=str, action=ParseParametersArg) + + Parse a command line that uses these options: + + >>> import shlex + >>> cli = "--parameters 'mass1+mass2:mtotal' ra ni --z-arg foo:bar" + >>> opts = parser.parse_args(shlex.split(cli)) + >>> opts.parameters + ['mass1+mass2', 'ra', 'ni'] + >>> opts.parameters_labels + {'mass1+mass2': '$M~(\\mathrm{M}_\\odot)$', 'ni': 'ni', 'ra': '$\\alpha$'} + >>> opts.z_arg + 'foo' + >>> opts.z_arg_labels + {'foo': 'bar'} + + In the above, the first argument to ``--parameters`` was ``mtotal``. Since + this is a recognized parameter in ``pycbc.waveform.parameters``, the label + dictionary contains the latex string associated with the ``mtotal`` + parameter. A label was not provided for the second argument, and so ``ra`` + was used. Since ``ra`` is also a recognized parameter, its associated latex + string was used in the labels dictionary. Since ``ni`` and ``bar`` (the + label for ``z-arg``) are not recognized parameters, they were just used + as-is in the labels dictionaries. 
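A portability note on ``ParseLabelArg.__call__`` above (and the similar check in ``ParseTempsArg`` added later in this series): ``unicode`` only exists under Python 2, so the single-argument test would raise a ``NameError`` on Python 3. A hedged, version-agnostic alternative, not what the module currently does, is to build the tuple of string types once:

try:
    string_types = (str, unicode)   # Python 2
except NameError:
    string_types = (str,)           # Python 3

# ...inside __call__...
singlearg = isinstance(values, string_types)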
""" - txt_extensions = [".txt", ".dat", ".csv"] - hdf_extensions = [".hdf", ".h5"] - for ext in hdf_extensions: - if filename.endswith(ext): - return InferenceFile - for ext in txt_extensions: - if filename.endswith(ext): - return InferenceTXTFile - raise TypeError("Extension is not supported.") + def __call__(self, parser, namespace, values, option_string=None): + super(ParseParametersArg, self).__call__(parser, namespace, values, + option_string=option_string) + # try to replace the labels with a label from waveform.parameters + labels = getattr(namespace, '{}_labels'.format(self.dest)) + for param, label in labels.items(): + try: + label = getattr(waveform.parameters, label).label + labels[param] = label + except AttributeError: + pass def add_plot_posterior_option_group(parser): @@ -585,9 +351,6 @@ def add_plot_posterior_option_group(parser): "injection in the file to work. Any values " "specified by expected-parameters will override " "the values obtained for the injection.") - # FIXME: the following should be made an attribute of the results file - pgroup.add_argument("--injection-hdf-group", default="H1/injections", - help="HDF group that contains injection values.") return pgroup @@ -626,47 +389,6 @@ def plot_ranges_from_cli(opts): return mins, maxs -def injections_from_cli(opts): - """Gets injection parameters from the inference file(s). - - Parameters - ---------- - opts : argparser - Argparser object that has the command-line objects to parse. - - Returns - ------- - FieldArray - Array of the injection parameters from all of the input files given - by ``opts.input_file``. - """ - input_files = opts.input_file - if isinstance(input_files, str): - input_files = [input_files] - parameters, _ = parse_parameters_opt(opts.parameters) - if parameters is None: - with InferenceFile(input_files[0], 'r') as fp: - parameters = fp.variable_params - injections = None - # loop over all input files getting the injection files - for input_file in input_files: - # read injections from HDF input file as FieldArray - these_injs = inject.InjectionSet( - input_file, - hdf_group=opts.injection_hdf_group, - ).table.view(FieldArray) - if injections is None: - injections = these_injs - else: - injections = injections.append(these_injs) - # check if need extra parameters than parameters stored in injection file - _, ts = transforms.get_common_cbc_transforms(parameters, - injections.fieldnames) - # add parameters not included in injection file - injections = transforms.apply_transforms(injections, ts) - return injections - - def expected_parameters_from_cli(opts): """Parses the --expected-parameters arguments from the `plot_posterior` option group. @@ -705,7 +427,7 @@ def add_scatter_option_group(parser): "scatter plot.") scatter_group.add_argument( - '--z-arg', type=str, default=None, + '--z-arg', type=str, default=None, action=ParseParametersArg, help='What to color the scatter points by. 
Syntax is the same as the ' 'parameters option.') scatter_group.add_argument( diff --git a/gwin/results/scatter_histograms.py b/gwin/results/scatter_histograms.py index 740b85b..36d34b4 100644 --- a/gwin/results/scatter_histograms.py +++ b/gwin/results/scatter_histograms.py @@ -341,7 +341,7 @@ def create_marginalized_hist(ax, values, label, percentiles=None, else: orientation = 'vertical' ax.hist(values, bins=50, histtype=htype, orientation=orientation, - facecolor=fillcolor, edgecolor=color, lw=2, normed=True) + facecolor=fillcolor, edgecolor=color, lw=2, density=True) if percentiles is None: percentiles = [5., 50., 95.] values = numpy.percentile(values, percentiles) @@ -496,8 +496,9 @@ def create_multidim_plot(parameters, samples, labels=None, Names of the variables to be plotted. samples : FieldArray A field array of the samples to plot. - labels: {None, list}, optional - A list of names for the parameters. + labels: dict, optional + A dictionary mapping parameters to labels. If none provided, will just + use the parameter strings as the labels. mins : {None, dict}, optional Minimum value for the axis of each variable in `parameters`. If None, it will use the minimum of the corresponding variable in @@ -563,10 +564,7 @@ def create_multidim_plot(parameters, samples, labels=None, `{('param1', 'param2'): (pyplot.axes, row index, column index)}` """ if labels is None: - labels = [p for p in parameters] - # turn labels into a dict for easier access - labels = dict(zip(parameters, labels)) - + labels = {p: p for p in parameters} # set up the figure with a grid of axes # if only plotting 2 parameters, make the marginal plots smaller nparams = len(parameters) From b29dd389de7c348801a8007f754aab20013ea5bb Mon Sep 17 00:00:00 2001 From: Collin Capano Date: Tue, 25 Sep 2018 12:28:55 +0200 Subject: [PATCH 3/3] Update emcee pt (#73) * fix a bug in base_mcmc hdf and add docs * fix docs in emcee io * add global call method for logprior * add base_multitemper sampler methods and io * move some config loading to base_mcmc * remove unnecessary import * add support for multi tempered samplers to burn_in * update emcee_pt * add imports to module __init__s * remove note from executable * add emcee_pt to io * fix import errors * make sure stats are written out with the correct dtype * fix bugs * fix more bugs * create an action for parsing temps arg * make sure fields is a list * fix pep8 issues --- bin/gwin | 9 - gwin/burn_in.py | 52 +- gwin/io/__init__.py | 2 + gwin/io/base_hdf.py | 8 +- gwin/io/base_mcmc.py | 42 +- gwin/io/base_multitemper.py | 260 ++++++++++ gwin/io/emcee.py | 3 +- gwin/io/emcee_pt.py | 97 ++++ gwin/models/__init__.py | 9 + gwin/sampler/__init__.py | 4 +- gwin/sampler/base_mcmc.py | 152 ++++-- gwin/sampler/base_multitemper.py | 192 ++++++++ gwin/sampler/emcee.py | 39 +- gwin/sampler/emcee_pt.py | 822 +++++++------------------------ 14 files changed, 963 insertions(+), 728 deletions(-) create mode 100644 gwin/io/base_multitemper.py create mode 100644 gwin/io/emcee_pt.py create mode 100644 gwin/sampler/base_multitemper.py diff --git a/bin/gwin b/bin/gwin index cacded7..146a44e 100644 --- a/bin/gwin +++ b/bin/gwin @@ -195,15 +195,6 @@ with ctx: # Finalize the output sampler.finalize() - # FIXME: move to emcee_pt's finalize method - #with InferenceFile(checkpoint_file, 'a') as fp: - # try: - # lnz, dlnz = sampler.calculate_logevidence(fp) - # logging.info("Saving evidence") - # sampler.write_logevidence(fp, lnz, dlnz) - # except NotImplementedError: - # pass - # rename checkpoint to output 
and delete backup logging.info("Moving checkpoint to output") os.rename(sampler.checkpoint_file, opts.output_file) diff --git a/gwin/burn_in.py b/gwin/burn_in.py index d87bf69..c7eb348 100644 --- a/gwin/burn_in.py +++ b/gwin/burn_in.py @@ -215,6 +215,16 @@ def _getlogposts(self, filename): logposts = samples['loglikelihood'] + samples['logprior'] return logposts + def _getacls(self, filename, start_index): + """Convenience function for calculating acls for the given filename. + + Since we calculate the acls, this will also store it to the sampler. + """ + acls = self.sampler.compute_acl(filename, start_index=start_index) + # since we calculated it, save the acls to the sampler + self.sampler.acls = acls + return acls + def halfchain(self, filename): """Just uses half the chain as the burn-in iteration. """ @@ -279,7 +289,7 @@ def nacl(self, filename): """ niters = self._getniters(filename) kstart = int(niters / 2.) - acls = self.sampler.compute_acl(filename, start_index=kstart) + acls = self._getacls(filename, start_index=kstart) is_burned_in = {param: (self._nacls * acl) < kstart for (param, acl) in acls.items()} data = self.burn_in_data['nacl'] @@ -291,8 +301,6 @@ def nacl(self, filename): data['burn_in_iteration'] = NOT_BURNED_IN_ITER # additional information data['status_per_parameter'] = is_burned_in - # since we calculated it, save the acls to the sampler - self.sampler.acls = acls def ks_test(self, filename): """Applies ks burn-in test.""" @@ -371,3 +379,41 @@ def from_config(cls, cp, sampler): kwargs['min_iterations'] = int( cp.get_opt_tag(section, 'min-iterations', tag)) return cls(sampler, burn_in_test, **kwargs) + + +class MultiTemperedMCMCBurnInTests(MCMCBurnInTests): + """Adds support for multiple temperatures to the MCMCBurnInTests.""" + + def _getacls(self, filename, start_index): + """Convenience function for calculating acls for the given filename. + + This function is used by the ``n_acl`` burn-in test. That function + expects the returned ``acls`` dict to just report a single ACL for + each parameter. Since multi-tempered samplers return an array of ACLs + for each parameter instead, this takes the max over the array before + returning. + + Since we calculate the acls, this will also store it to the sampler. + """ + acls = super(MultiTemperedMCMCBurnInTests, self)._getacls( + filename, start_index) + # return the max for each parameter + return {param: vals.max() for (param, vals) in acls.items()} + + def _getlogposts(self, filename): + """Convenience function for retrieving log posteriors. + + This just gets the coldest temperature chain, and returns arrays with + shape nwalkers x niterations, so the parent class can run the same + ``posterior_step`` function. 
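The ``max()`` taken in ``_getacls`` above is the conservative choice: multi-tempered samplers report one ACL per temperature for each parameter, while the burn-in tests expect a single scalar per parameter, so the longest (worst) autocorrelation length is kept. A tiny numpy illustration with hypothetical values:

import numpy

# hypothetical ACLs: one value per temperature chain for each parameter
acls = {'mass1': numpy.array([12, 9, 30]),
        'mass2': numpy.array([15, 11, 22])}

# collapse to the worst (largest) ACL across temperatures
scalar_acls = {param: vals.max() for (param, vals) in acls.items()}
# {'mass1': 30, 'mass2': 22}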
+ """ + with self.sampler.io(filename, 'r') as fp: + samples = fp.read_raw_samples( + ['loglikelihood', 'logprior'], thin_start=0, thin_interval=1, + temps=0, flatten=False) + # reshape to drop the first dimension + for (stat, arr) in samples.items(): + _, nwalkers, niterations = arr.shape + samples[stat] = arr.reshape((nwalkers, niterations)) + logposts = samples['loglikelihood'] + samples['logprior'] + return logposts diff --git a/gwin/io/__init__.py b/gwin/io/__init__.py index 7c15d3b..23cb79c 100644 --- a/gwin/io/__init__.py +++ b/gwin/io/__init__.py @@ -34,10 +34,12 @@ from ..option_utils import (ParseLabelArg, ParseParametersArg) from .emcee import EmceeFile +from .emcee_pt import EmceePTFile from .txt import InferenceTXTFile filetypes = { EmceeFile.name: EmceeFile, + EmceePTFile.name: EmceePTFile } diff --git a/gwin/io/base_hdf.py b/gwin/io/base_hdf.py index cd66929..cc8df50 100644 --- a/gwin/io/base_hdf.py +++ b/gwin/io/base_hdf.py @@ -609,11 +609,17 @@ def get_slice(self, thin_start=None, thin_interval=None, thin_end=None): The slice needed. """ if thin_start is None: - thin_start = self.thin_start + thin_start = int(self.thin_start) + else: + thin_start = int(thin_start) if thin_interval is None: thin_interval = self.thin_interval + else: + thin_interval = int(numpy.ceil(thin_interval)) if thin_end is None: thin_end = self.thin_end + else: + thin_end = int(thin_end) return slice(thin_start, thin_end, thin_interval) def copy_metadata(self, other): diff --git a/gwin/io/base_mcmc.py b/gwin/io/base_mcmc.py index 0d60935..c6f7f73 100644 --- a/gwin/io/base_mcmc.py +++ b/gwin/io/base_mcmc.py @@ -47,12 +47,9 @@ def write_samples(self, samples, parameters=None, start_iteration=None, max_iterations=None): """Writes samples to the given file. - Results are written to: - - ``fp[samples_group/{vararg}]``, - - where ``{vararg}`` is the name of a model params. The samples are - written as an ``nwalkers x niterations`` array. + Results are written to ``samples_group/{vararg}``, where ``{vararg}`` + is the name of a model params. The samples are written as an + ``nwalkers x niterations`` array. Parameters ----------- @@ -117,6 +114,23 @@ def read_raw_samples(self, fields, fields : list The list of field names to retrieve. Must be names of datasets in the ``samples_group``. + thin_start : int, optional + Start reading from the given iteration. Default is to start from + the first iteration. + thin_interval : int, optional + Only read every ``thin_interval``th sample. Default is 1. + thin_end : int, optional + Stop reading at the given iteration. Default is to end at the last + iteration. + iteration : int, optional + Only read the given iteration. If this provided, it overrides + the ``thin_(start|interval|end)`` options. + walkers : int, optional + Only read from the given walkers. Default is to read all. + flatten : bool, optional + Flatten the samples to 1D arrays before returning. Otherwise, the + returned arrays will have shape (requested walkers x + requested iteration(s)). Default is True. 
Returns ------- @@ -127,13 +141,13 @@ def read_raw_samples(self, fields, fields = [fields] # walkers to load if walkers is not None: - widx = numpy.zeros(fp.nwalkers, dtype=bool) + widx = numpy.zeros(self.nwalkers, dtype=bool) widx[walkers] = True else: widx = slice(0, None) # get the slice to use if iteration is not None: - get_index = iteration + get_index = int(iteration) else: get_index = self.get_slice(thin_start=thin_start, thin_end=thin_end, @@ -242,6 +256,11 @@ def niterations(self): """Returns the number of iterations the sampler was run for.""" return self[self.sampler_group].attrs['niterations'] + @property + def nwalkers(self): + """Returns the number of walkers used by the sampler.""" + return self[self.sampler_group].attrs['nwalkers'] + def write_sampler_metadata(self, sampler): """Writes the sampler's metadata.""" self.attrs['sampler'] = sampler.name @@ -285,16 +304,11 @@ def write_acls(self, acls): self[self.sampler_group].attrs['acl'] = acl # set the default thin interval to be the acl (if it is finite) if numpy.isfinite(acl): - self.attrs['thin_interval'] = acl + self.attrs['thin_interval'] = int(numpy.ceil(acl)) def read_acls(self): """Reads the acls of all the parameters. - Parameters - ---------- - fp : InferenceFile - An open file handler to read the acls from. - Returns ------- dict diff --git a/gwin/io/base_multitemper.py b/gwin/io/base_multitemper.py new file mode 100644 index 0000000..e389809 --- /dev/null +++ b/gwin/io/base_multitemper.py @@ -0,0 +1,260 @@ +# Copyright (C) 2018 Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# self.option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +"""Provides I/O support for multi-tempered sampler. +""" + +from __future__ import absolute_import + +import argparse + +from .base_mcmc import MCMCIO + + +class ParseTempsArg(argparse.Action): + """Argparse action that will parse temps argument. + + If the provided argument is 'all', sets 'all' in the namespace dest. If a + a sequence of numbers are provided, converts those numbers to ints before + saving to the namespace. 
+ """ + def __init__(self, type=str, **kwargs): + # check that type is string + if type != str: + raise ValueError("the type for this action must be a string") + super(ParseTempsArg, self).__init__(type=type, **kwargs) + + def __call__(self, parser, namespace, values, option_string=None): + singlearg = isinstance(values, (str, unicode)) + if singlearg: + values = [values] + if values[0] == 'all': + # check that only a single value was provided + if len(values) > 1: + raise ValueError("if provide 'all', should not specify any " + "other temps") + temps = 'all' + else: + temps = [] + for val in values: + try: + val = int(val) + except ValueError: + pass + temps.append(val) + if singlearg: + temps = temps[0] + setattr(namespace, self.dest, temps) + + +class MultiTemperedMCMCIO(MCMCIO): + """Abstract base class for multi-tempered MCMC sampler IO. + """ + + @property + def ntemps(self): + """Returns the number of temperatures used by the sampler.""" + return self[self.sampler_group].attrs['ntemps'] + + def write_sampler_metadata(self, sampler): + """Adds writing ntemps to MCMCIO. + """ + super(MultiTemperedMCMCIO, self).write_sampler_metadata(sampler) + self[self.sampler_group].attrs["ntemps"] = sampler.ntemps + + def write_samples(self, samples, parameters=None, + start_iteration=None, max_iterations=None): + """Writes samples to the given file. + + Results are written to ``samples_group/{vararg}``, where ``{vararg}`` + is the name of a model params. The samples are written as an + ``ntemps x nwalkers x niterations`` array. + + Parameters + ----------- + samples : dict + The samples to write. Each array in the dictionary should have + shape nwalkers x niterations. + parameters : list, optional + Only write the specified parameters to the file. If None, will + write all of the keys in the ``samples`` dict. + start_iteration : int, optional + Write results to the file's datasets starting at the given + iteration. Default is to append after the last iteration in the + file. + max_iterations : int, optional + Set the maximum size that the arrays in the hdf file may be resized + to. Only applies if the samples have not previously been written + to file. The default (None) is to use the maximum size allowed by + h5py. 
+ """ + ntemps, nwalkers, niterations = samples.values()[0].shape + assert all(p.shape == (ntemps, nwalkers, niterations) + for p in samples.values()), ( + "all samples must have the same shape") + if max_iterations is not None and max_iterations < niterations: + raise IndexError("The provided max size is less than the " + "number of iterations") + group = self.samples_group + '/{name}' + if parameters is None: + parameters = samples.keys() + # loop over number of dimensions + for param in parameters: + dataset_name = group.format(name=param) + istart = start_iteration + try: + fp_niterations = self[dataset_name].shape[-1] + if istart is None: + istart = fp_niterations + istop = istart + niterations + if istop > fp_niterations: + # resize the dataset + self[dataset_name].resize(istop, axis=2) + except KeyError: + # dataset doesn't exist yet + if istart is not None and istart != 0: + raise ValueError("non-zero start_iteration provided, " + "but dataset doesn't exist yet") + istart = 0 + istop = istart + niterations + self.create_dataset(dataset_name, (ntemps, nwalkers, istop), + maxshape=(ntemps, nwalkers, + max_iterations), + dtype=samples[param].dtype, + fletcher32=True) + self[dataset_name][:, :, istart:istop] = samples[param] + + def read_raw_samples(self, fields, + thin_start=None, thin_interval=None, thin_end=None, + iteration=None, temps=None, walkers=None, + flatten=True): + """Base function for reading samples. + + Parameters + ----------- + fields : list + The list of field names to retrieve. Must be names of datasets in + the ``samples_group``. + thin_start : int, optional + Start reading from the given iteration. Default is to start from + the first iteration. + thin_interval : int, optional + Only read every ``thin_interval``th sample. Default is 1. + thin_end : int, optional + Stop reading at the given iteration. Default is to end at the last + iteration. + iteration : int, optional + Only read the given iteration. If this provided, it overrides + the ``thin_(start|interval|end)`` options. + temps : 'all' or (list of) int, optional + The temperature index (or list of indices) to retrieve. If None, + only samples from the coldest (= 0) temperature chain will be + retrieved. To retrieve all temperates pass 'all', or a list of + all of the temperatures. Default is to only load the coldest + temperature. + walkers : (list of) int, optional + Only read from the given walkers. Default is to read all. + flatten : bool, optional + Flatten the samples to 1D arrays before returning. Otherwise, the + returned arrays will have shape (requested temps x + requested walkers x requested iteration(s)). Default is True. + + Returns + ------- + array_class + An instance of the given array class populated with values + retrieved from the fields. 
+ """ + if isinstance(fields, (str, unicode)): + fields = [fields] + # walkers to load + if walkers is not None: + widx = numpy.zeros(self.nwalkers, dtype=bool) + widx[walkers] = True + nwalkers = widx.sum() + else: + widx = slice(None, None) + nwalkers = self.nwalkers + # temperatures to load + selecttemps = False + if temps is None: + tidx = 0 + ntemps = 1 + elif isinstance(temps, int): + tidx = temps + ntemps = 1 + else: + # temps is either 'all' or a list of temperatures; + # in either case, we'll get all of the temperatures from the file; + # if not 'all', then we'll pull out the ones we want + tidx = slice(None, None) + selecttemps = temps != 'all' + if selecttemps: + ntemps = len(temps) + else: + ntemps = self.ntemps + # get the slice to use + if iteration is not None: + get_index = int(iteration) + niterations = 1 + else: + get_index = self.get_slice(thin_start=thin_start, + thin_end=thin_end, + thin_interval=thin_interval) + # we'll just get the number of iterations from the returned shape + niterations = None + # load + group = self.samples_group + '/{name}' + arrays = {} + for name in fields: + arr = self[group.format(name=name)][tidx, widx, get_index] + if niterations is None: + niterations = arr.shape[-1] + # pull out the temperatures we need + if selecttemps: + arr = arr[temps, ...] + if flatten: + arr = arr.flatten() + else: + # ensure that the returned array is 3D + arr = arr.reshape((ntemps, nwalkers, niterations)) + arrays[name] = arr + return arrays + + @staticmethod + def extra_args_parser(parser=None, skip_args=None, **kwargs): + """Adds --temps to MCMCIO parser. + """ + if skip_args is None: + skip_args = [] + parser, actions = MCMCIO.extra_args_parser( + parser=parser, skip_args=skip_args, **kwargs) + if 'temps' not in skip_args: + act = parser.add_argument( + "--temps", nargs="+", default=None, action=ParseTempsArg, + help="Get the given temperatures. May provide either a " + "sequence of integers specifying the temperatures to " + "plot, or 'all' for all temperatures. Default is to only " + "plot the coldest (= 0) temperature chain.") + actions.append(act) + return parser, actions diff --git a/gwin/io/emcee.py b/gwin/io/emcee.py index 8331226..f792e72 100644 --- a/gwin/io/emcee.py +++ b/gwin/io/emcee.py @@ -38,7 +38,7 @@ def read_acceptance_fraction(self, walkers=None): Parameters ----------- - walkers : {None, (list of) int} + walkers : (list of) int, optional The walker index (or a list of indices) to retrieve. If None, samples from all walkers will be obtained. @@ -72,4 +72,5 @@ def write_acceptance_fraction(self, acceptance_fraction): self[group] = acceptance_fraction def write_posterior(self, filename, **kwargs): + """Write me.""" pass diff --git a/gwin/io/emcee_pt.py b/gwin/io/emcee_pt.py new file mode 100644 index 0000000..1948360 --- /dev/null +++ b/gwin/io/emcee_pt.py @@ -0,0 +1,97 @@ +# Copyright (C) 2018 Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# self.option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. 
+# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +"""Provides I/O support for emcee_pt. +""" + +from __future__ import absolute_import + +from .base_hdf import BaseInferenceFile +from .base_multitemper import MultiTemperedMCMCIO + + +class EmceePTFile(MultiTemperedMCMCIO, BaseInferenceFile): + """Class to handle file IO for the ``emcee`` sampler.""" + + name = 'emcee_pt_file' + + @property + def betas(self): + """The betas that were used.""" + return self[self.sampler_group].attrs["betas"] + + def write_sampler_metadata(self, sampler): + """Adds writing betas to MultiTemperedMCMCIO. + """ + super(EmceePTFile, self).write_sampler_metadata(sampler) + self[self.sampler_group].attrs["betas"] = sampler.betas + + def read_acceptance_fraction(self, temps=None, walkers=None): + """Reads the acceptance fraction. + + Parameters + ----------- + temps : (list of) int, optional + The temperature index (or a list of indices) to retrieve. If None, + acfs from all temperatures and all walkers will be retrieved. + walkers : (list of) int, optional + The walker index (or a list of indices) to retrieve. If None, + samples from all walkers will be obtained. + + Returns + ------- + array + Array of acceptance fractions with shape (requested temps, + requested walkers). + """ + group = self.sampler_group + '/acceptance_fraction' + if walkers is None: + wmask = numpy.ones(self.nwalkers, dtype=bool) + else: + wmask = numpy.zeros(self.nwalkers, dtype=bool) + wmask[walkers] = True + if temps is None: + tmask = numpy.ones(self.ntemps, dtype=bool) + else: + tmask = numpy.zeros(self.ntemps, dtype=bool) + tmask[temps] = True + return self[group][:][numpy.ix_(tmask, wmask)] + + def write_acceptance_fraction(self, acceptance_fraction): + """Write acceptance_fraction data to file. + + Results are written to ``[sampler_group]/acceptance_fraction``; the + resulting dataset has shape (ntemps, nwalkers). + + Parameters + ----------- + acceptance_fraction : numpy.ndarray + Array of acceptance fractions to write. Must have shape + ntemps x nwalkers. + """ + # check + assert acceptance_fraction.shape == (self.ntemps, self.nwalkers), ( + "acceptance fraction must have shape ntemps x nwalker") + group = self.sampler_group + '/acceptance_fraction' + try: + self[group][:] = acceptance_fraction + except KeyError: + # dataset doesn't exist yet, create it + self[group] = acceptance_fraction + + def write_posterior(self, filename, **kwargs): + """Write me.""" + pass diff --git a/gwin/models/__init__.py b/gwin/models/__init__.py index 48af52c..06f2eae 100644 --- a/gwin/models/__init__.py +++ b/gwin/models/__init__.py @@ -34,6 +34,15 @@ def _call_global_model(*args, **kwds): return _global_instance(*args, **kwds) # pylint:disable=not-callable +def _call_global_model_logprior(*args, **kwds): + """Private function for a calling global's logprior. + + This is needed for samplers that use a separate function for the logprior, + like ``emcee_pt``. + """ + return _global_instance(*args, callstat='logprior', **kwds) + + class CallModel(object): """Wrapper class for calling models from a sampler. 
diff --git a/gwin/sampler/__init__.py b/gwin/sampler/__init__.py index aa7cf3a..d52ce48 100644 --- a/gwin/sampler/__init__.py +++ b/gwin/sampler/__init__.py @@ -22,14 +22,14 @@ from .base import (initial_dist_from_config, create_new_output_file) # from .kombine import KombineSampler from .emcee import EmceeEnsembleSampler -# from .emcee_pt import EmceePTSampler +from .emcee_pt import EmceePTSampler # from .mcmc import MCMCSampler # list of available samplers samplers = {cls.name: cls for cls in ( # KombineSampler, EmceeEnsembleSampler, - # EmceePTSampler, + EmceePTSampler, # MCMCSampler, )} diff --git a/gwin/sampler/base_mcmc.py b/gwin/sampler/base_mcmc.py index d5afbc0..cba9a8b 100644 --- a/gwin/sampler/base_mcmc.py +++ b/gwin/sampler/base_mcmc.py @@ -28,6 +28,7 @@ from abc import (ABCMeta, abstractmethod, abstractproperty) import logging import numpy +from pycbc.workflow import ConfigParser from pycbc.filter import autocorrelation from ..io import validate_checkpoint_files @@ -76,36 +77,69 @@ def raw_samples_to_dict(sampler, raw_samples): return samples -def raw_stats_to_dict(sampler, raw_stats): - """Converts an ND array of model stats to a dict. +def blob_data_to_dict(stat_names, blobs): + """Converts list of "blobs" to a dictionary of model stats. - The ``raw_stats`` may either be a numpy array or a list. If the - former, the stats are assumed to have shape - ``[sampler.base_shape x] niterations x nstats, where nstats are the number - of stats returned by ``sampler.model.default_stats``. If the latter, the - list is cast to an array that is assumed to be the same shape as if an - array was given. + Samplers like ``emcee`` store the extra tuple returned by ``CallModel`` to + a list called blobs. This is a list of lists of tuples with shape + niterations x nwalkers x nstats, where nstats is the number of stats + returned by the model's ``default_stats``. This converts that list to a + dictionary of arrays keyed by the stat names. Parameters ---------- - sampler : sampler instance - An instance of an MCMC sampler. - raw_stats : array or list - The stats to convert. + stat_names : list of str + The list of the stat names. + blobs : list of list of tuples + The data to convert. Returns ------- dict : A dictionary mapping the model's ``default_stats`` to arrays of values. - Each array will have shape ``[sampler.base_shape x] niterations``. + Each array will have shape ``nwalkers x niterations``. """ - if not isinstance(raw_stats, numpy.ndarray): - # Assume list. Since the model returns a tuple of values, this should - # be a [sampler.base_shape x] x niterations list of tuples. We can - # therefore immediately convert this to a ND array. 
-        raw_stats = numpy.array(raw_stats)
-    return {stat: raw_stats[..., ii]
-            for (ii, stat) in enumerate(sampler.model.default_stats)}
+    # get the dtypes of each of the stats; we'll just take this from the
+    # first iteration and walker
+    dtypes = [type(val) for val in blobs[0][0]]
+    assert len(stat_names) == len(dtypes), (
+        "number of stat names must match length of tuples in the blobs")
+    # convert to an array; to ensure that we get the dtypes correct, we'll
+    # cast to a structured array
+    raw_stats = numpy.array(blobs, dtype=zip(stat_names, dtypes))
+    # transpose so that it has shape nwalkers x niterations
+    raw_stats = raw_stats.transpose()
+    # now return as a dictionary
+    return {stat: raw_stats[stat] for stat in stat_names}
+
+
+def get_optional_arg_from_config(cp, section, arg, dtype=str):
+    """Convenience function to retrieve an optional argument from a config
+    file.
+
+    Parameters
+    ----------
+    cp : ConfigParser
+        Open config parser to retrieve the argument from.
+    section : str
+        Name of the section to retrieve from.
+    arg : str
+        Name of the argument to retrieve.
+    dtype : datatype, optional
+        Cast the retrieved value (if it exists) to the given datatype. Default
+        is ``str``.
+
+    Returns
+    -------
+    val : None or str
+        If the argument is present, the value. Otherwise, None.
+    """
+    if cp.has_option(section, arg):
+        val = dtype(cp.get(section, arg))
+    else:
+        val = None
+    return val
+
 #
 # =============================================================================
@@ -436,6 +470,62 @@ def checkpoint(self):
             logging.info("Clearing samples from memory")
             self.clear_samples()
+    @staticmethod
+    def checkpoint_from_config(cp, section):
+        """Gets the checkpoint interval from the given config file.
+
+        This looks for 'checkpoint-interval' in the section.
+
+        Parameters
+        ----------
+        cp : ConfigParser
+            Open config parser to retrieve the argument from.
+        section : str
+            Name of the section to retrieve from.
+
+        Returns
+        -------
+        int or None :
+            The checkpoint interval, if it is in the section. Otherwise, None.
+        """
+        return get_optional_arg_from_config(cp, section, 'checkpoint-interval',
+                                            dtype=int)
+
+    def set_target_from_config(self, cp, section):
+        """Sets the target using the given config file.
+
+        This looks for 'niterations' to set the ``target_niterations``, and
+        'effective-nsamples' to set the ``target_eff_nsamples``.
+
+        Parameters
+        ----------
+        cp : ConfigParser
+            Open config parser to retrieve the argument from.
+        section : str
+            Name of the section to retrieve from.
+        """
+        if cp.has_option(section, "niterations"):
+            niterations = int(cp.get(section, "niterations"))
+        else:
+            niterations = None
+        if cp.has_option(section, "effective-nsamples"):
+            nsamples = int(cp.get(section, "effective-nsamples"))
+        else:
+            nsamples = None
+        self.set_target(niterations=niterations, eff_nsamples=nsamples)
+
+    def set_burn_in_from_config(self, cp):
+        """Sets the burn-in class from the given config file.
+
+        If no burn-in section exists in the file, then this just sets the
+        burn-in class to None.
+ """ + try: + bit = self.burn_in_class.from_config(cp, self) + except ConfigParser.Error: + bit = None + self.set_burn_in(bit) + @abstractmethod def compute_acf(cls, filename, **kwargs): """A method to compute the autocorrelation function of samples in the @@ -519,25 +609,31 @@ def compute_acf(cls, filename, start_index=None, end_index=None, return acfs @classmethod - def compute_acl(cls, filename, start_index=None, end_index=None): + def compute_acl(cls, filename, start_index=None, end_index=None, + min_nsamples=10): """Computes the autocorrleation length for all model params in the given file. Parameter values are averaged over all walkers at each iteration. - The ACL is then calculated over the averaged chain. If the returned ACL - is `inf`, will default to the number of current iterations. + The ACL is then calculated over the averaged chain. If an ACL cannot + be calculated because there are not enough samples, it will be set + to ``inf``. Parameters ----------- filename : str Name of a samples file to compute ACLs for. - start_index : {None, int} + start_index : int, optional The start index to compute the acl from. If None, will try to use the number of burn-in iterations in the file; otherwise, will start at the first sample. - end_index : {None, int} + end_index : int, optional The end index to compute the acl to. If None, will go to the end of the current iteration. + min_nsamples : int, optional + Require a minimum number of samples to compute an ACL. If the + number of samples per walker is less than this, will just set to + ``inf``. Default is 10. Returns ------- @@ -551,10 +647,8 @@ def compute_acl(cls, filename, start_index=None, end_index=None): param, thin_start=start_index, thin_interval=1, thin_end=end_index, flatten=False)[param] samples = samples.mean(axis=0) - # if < 10 samples, just set to inf - # Note: this should be done inside of pycbc's autocorrelation - # function - if samples.size < 10: + # if < min number of samples, just set to inf + if samples.size < min_nsamples: acl = numpy.inf else: acl = autocorrelation.calculate_acl(samples) diff --git a/gwin/sampler/base_multitemper.py b/gwin/sampler/base_multitemper.py new file mode 100644 index 0000000..13541cd --- /dev/null +++ b/gwin/sampler/base_multitemper.py @@ -0,0 +1,192 @@ +# Copyright (C) 2018 Collin Capano +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 3 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + + +# +# ============================================================================= +# +# Preamble +# +# ============================================================================= +# +"""Provides constructor classes provide support for parallel tempered MCMC +samplers.""" + +from __future__ import absolute_import + +import numpy +from pycbc.filter import autocorrelation + + +class MultiTemperedSupport(object): + """Provides methods for supporting multi-tempered samplers. 
+ """ + _ntemps = None + + @property + def ntemps(self): + """The number of temeratures that are set.""" + return self._ntemps + + +class MultiTemperedAutocorrSupport(object): + """Provides class methods for calculating multi-tempered ACFs/ACLs. + """ + + @classmethod + def compute_acf(cls, filename, start_index=None, end_index=None, + per_walker=False, walkers=None, parameters=None, + temps=None): + """Computes the autocorrleation function of the model params in the + given file. + + By default, parameter values are averaged over all walkers at each + iteration. The ACF is then calculated over the averaged chain for each + temperature. An ACF per-walker will be returned instead if + ``per_walker=True``. + + Parameters + ----------- + filename : str + Name of a samples file to compute ACFs for. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + per_walker : optional, bool + Return the ACF for each walker separately. Default is False. + walkers : optional, int or array + Calculate the ACF using only the given walkers. If None (the + default) all walkers will be used. + parameters : optional, str or array + Calculate the ACF for only the given parameters. If None (the + default) will calculate the ACF for all of the model params. + temps : optional, (list of) int or 'all' + The temperature index (or list of indices) to retrieve. If None + (the default), the ACF will only be computed for the coldest (= 0) + temperature chain. To compute an ACF for all temperates pass 'all', + or a list of all of the temperatures. + + Returns + ------- + dict : + Dictionary of arrays giving the ACFs for each parameter. If + ``per-walker`` is True, the arrays will have shape + ``ntemps x nwalkers x niterations``. Otherwise, the returned array + will have shape ``ntemps x niterations``. 
+ """ + acfs = {} + with cls._io(filename, 'r') as fp: + if parameters is None: + parameters = fp.variable_params + if isinstance(parameters, str) or isinstance(parameters, unicode): + parameters = [parameters] + if isinstance(temps, int): + temps = [temps] + elif temps == 'all': + temps = numpy.arange(fp.ntemps) + elif temps is None: + temps = [0] + for param in parameters: + subacfs = [] + for tk in temps: + if per_walker: + # just call myself with a single walker + if walkers is None: + walkers = numpy.arange(fp.nwalkers) + arrays = [cls.compute_acfs(filename, + start_index=start_index, + end_index=end_index, + per_walker=False, + walkers=ii, + parameters=param, + temps=tk)[param][0, :] + for ii in walkers] + # we'll stack all of the walker arrays to make a single + # nwalkers x niterations array; when these are stacked + # below, we'll get a ntemps x nwalkers x niterations + # array + subacfs.append(numpy.vstack(arrays)) + else: + samples = fp.read_raw_samples( + param, thin_start=start_index, + thin_interval=1, thin_end=end_index, + walkers=walkers, temps=tk, flatten=False)[param] + # contract the walker dimension using the mean, and + # flatten the (length 1) temp dimension + samples = samples.mean(axis=1)[0, :] + thisacf = autocorrelation.calculate_acf( + samples).numpy() + subacfs.append(thisacf) + # stack the temperatures + acfs[param] = numpy.stack(subacfs) + return acfs + + @classmethod + def compute_acl(cls, filename, start_index=None, end_index=None, + min_nsamples=10): + """Computes the autocorrleation length for all model params and + temperatures in the given file. + + Parameter values are averaged over all walkers at each iteration and + temperature. The ACL is then calculated over the averaged chain. + + Parameters + ----------- + filename : str + Name of a samples file to compute ACLs for. + start_index : {None, int} + The start index to compute the acl from. If None, will try to use + the number of burn-in iterations in the file; otherwise, will start + at the first sample. + end_index : {None, int} + The end index to compute the acl to. If None, will go to the end + of the current iteration. + min_nsamples : int, optional + Require a minimum number of samples to compute an ACL. If the + number of samples per walker is less than this, will just set to + ``inf``. Default is 10. + + Returns + ------- + dict + A dictionary of ntemps-long arrays of the ACLs of each parameter. 
+ """ + acls = {} + with cls._io(filename, 'r') as fp: + if end_index is None: + end_index = fp.niterations + tidx = numpy.arange(fp.ntemps) + for param in fp.variable_params: + these_acls = numpy.zeros(fp.ntemps) + for tk in tidx: + samples = fp.read_raw_samples( + param, thin_start=start_index, thin_interval=1, + thin_end=end_index, temps=tk, flatten=False)[param] + # contract the walker dimension using the mean, and flatten + # the (length 1) temp dimension + samples = samples.mean(axis=1)[0, :] + if samples.size < min_nsamples: + acl = numpy.inf + else: + acl = autocorrelation.calculate_acl(samples) + if acl <= 0: + acl = numpy.inf + these_acls[tk] = acl + acls[param] = these_acls + return acls diff --git a/gwin/sampler/emcee.py b/gwin/sampler/emcee.py index 443f89d..0cc3f60 100644 --- a/gwin/sampler/emcee.py +++ b/gwin/sampler/emcee.py @@ -31,11 +31,10 @@ import numpy import emcee from pycbc.pool import choose_pool -from pycbc.workflow import ConfigParser from .base import BaseSampler from .base_mcmc import (BaseMCMC, MCMCAutocorrSupport, raw_samples_to_dict, - raw_stats_to_dict) + blob_data_to_dict, get_optional_arg_from_config) from ..burn_in import MCMCBurnInTests from ..io import EmceeFile from .. import models @@ -123,11 +122,8 @@ def model_stats(self): The returned array has shape ``nwalkers x niterations``. """ - raw_stats = numpy.array(self._sampler.blobs) - # raw_stats has shape niterations x nwalkers x nstats; transpose - # so that it has shape nwalkers x niterations x nstats - raw_stats = raw_stats.transpose((1, 0, 2)) - return raw_stats_to_dict(self, raw_stats) + stats = self.model.default_stats + return blob_data_to_dict(stats, self._sampler.blobs) def clear_samples(self): """Clears the samples and stats from memory. @@ -202,31 +198,14 @@ def from_config(cls, cp, model, nprocesses=1, use_mpi=False): # get the number of walkers to use nwalkers = int(cp.get(section, "nwalkers")) # get the checkpoint interval, if it's specified - if cp.has_option(section, "checkpoint-interval"): - checkpoint_interval = int(cp.get(section, "checkpoint-interval")) - else: - checkpoint_interval = None - if cp.has_option(section, "logpost-function"): - lnpost = cp.get(section, "logpost-function") - else: - lnpost = None + checkpoint_interval = cls.checkpoint_from_config(cp, section) + # get the logpost function + lnpost = get_optional_arg_from_config(cp, section, 'logpost-function') obj = cls(model, nwalkers, checkpoint_interval=checkpoint_interval, logpost_function=lnpost, nprocesses=nprocesses, use_mpi=use_mpi) - # get target - if cp.has_option(section, "niterations"): - niterations = int(cp.get(section, "niterations")) - else: - niterations = None - if cp.has_option(section, "effective-nsamples"): - nsamples = int(cp.get(section, "effective-nsamples")) - else: - nsamples = None - obj.set_target(niterations=niterations, eff_nsamples=nsamples) + # set target + obj.set_target_from_config(cp, section) # add burn-in if it's specified - try: - bit = obj.burn_in_class.from_config(cp, obj) - except ConfigParser.Error: - bit = None - obj.set_burn_in(bit) + obj.set_burn_in_from_config(cp) return obj diff --git a/gwin/sampler/emcee_pt.py b/gwin/sampler/emcee_pt.py index cef83fd..19ab4d8 100644 --- a/gwin/sampler/emcee_pt.py +++ b/gwin/sampler/emcee_pt.py @@ -14,45 +14,30 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
-# -# ============================================================================= -# -# Preamble -# -# ============================================================================= -# """ -This modules provides classes and functions for using the emcee sampler +This modules provides classes and functions for using the emcee_pt sampler packages for parameter estimation. """ -# The following two classes are needed for two reason -# 1) pools freeze state when created and so classes *cannot be updated* -# 2) methods cannot be pickled. - - -class _callprior(object): - """Calls the model's prior function, and ensures that no - metadata is returned.""" - def __init__(self, model_call): - self.callable = model_call - - def __call__(self, args): - prior = self.callable(args, callfunc='prior') - return prior if isinstance(prior, numpy.float64) else prior[0] - +from __future__ import absolute_import -class _callloglikelihood(object): - """Calls the model's loglikelihood function. - """ - def __init__(self, model_call): - self.callable = model_call +import numpy +import emcee +import logging +from pycbc.pool import choose_pool - def __call__(self, args): - return self.callable(args, callfunc='loglikelihood') +from .base import BaseSampler +from .base_mcmc import (BaseMCMC, raw_samples_to_dict, + get_optional_arg_from_config) +from .base_multitemper import (MultiTemperedSupport, + MultiTemperedAutocorrSupport) +from ..burn_in import MultiTemperedMCMCBurnInTests +from ..io import EmceePTFile +from .. import models -class EmceePTSampler(BaseMCMCSampler): +class EmceePTSampler(MultiTemperedAutocorrSupport, MultiTemperedSupport, + BaseMCMC, BaseSampler): """This class is used to construct a parallel-tempered MCMC sampler from the emcee package's PTSampler. @@ -70,647 +55,190 @@ class EmceePTSampler(BaseMCMCSampler): cores/nodes/etc. 
""" name = "emcee_pt" - - def __init__(self, model, ntemps, nwalkers, pool=None, - model_call=None): - - try: - import emcee - except ImportError: - raise ImportError("emcee is not installed.") - - if model_call is None: - model_call = model + _io = EmceePTFile + burn_in_class = MultiTemperedMCMCBurnInTests + + def __init__(self, model, ntemps, nwalkers, checkpoint_interval=None, + loglikelihood_function=None, nprocesses=1, use_mpi=False): + + self.model = model + + # create a wrapper for calling the model + if loglikelihood_function is None: + loglikelihood_function = 'loglikelihood' + # frustratingly, emcee_pt does not support blob data, so we have to + # turn it off + model_call = models.CallModel(model, loglikelihood_function, + return_all_stats=False) + + # Set up the pool + if nprocesses > 1: + # these are used to help paralleize over multiple cores / MPI + models._global_instance = model_call + model_call = models._call_global_model + prior_call = models._call_global_model_logprior + else: + prior_call = models.CallModel(model, 'logprior', + return_all_stats=False) + pool = choose_pool(mpi=use_mpi, processes=nprocesses) + if pool is not None: + pool.count = nprocesses # construct the sampler: PTSampler needs the likelihood and prior # functions separately ndim = len(model.variable_params) - sampler = emcee.PTSampler(ntemps, nwalkers, ndim, - _callloglikelihood(model_call), - _callprior(model_call), - pool=pool) - # initialize - super(EmceePTSampler, self).__init__( - sampler, model) + self._sampler = emcee.PTSampler(ntemps, nwalkers, ndim, + model_call, prior_call, pool=pool) self._nwalkers = nwalkers self._ntemps = ntemps + self._checkpoint_interval = checkpoint_interval - @classmethod - def from_cli(cls, opts, model, pool=None, - model_call=None): - """Create an instance of this sampler from the given command-line - options. - - Parameters - ---------- - opts : ArgumentParser options - The options to parse. - model : LikelihoodEvaluator - The model to use with the sampler. + @property + def io(self): + return self._io - Returns - ------- - EmceePTSampler - An emcee sampler initialized based on the given arguments. - """ - return cls(model, opts.ntemps, opts.nwalkers, - pool=pool, model_call=model_call) + @property + def base_shape(self): + return (self.ntemps, self.nwalkers,) @property - def ntemps(self): - return self._ntemps + def betas(self): + return self._sampler.betas + + @classmethod + def from_config(cls, cp, model, nprocesses=1, use_mpi=False): + """Loads the sampler from the given config file.""" + section = "sampler" + # check name + assert cp.get(section, "name") == cls.name, ( + "name in section [sampler] must match mine") + # get the number of walkers to use + nwalkers = int(cp.get(section, "nwalkers")) + # get the number of temps + ntemps = int(cp.get(section, "ntemps")) + # get the checkpoint interval, if it's specified + checkpoint_interval = cls.checkpoint_from_config(cp, section) + # get the loglikelihood function + logl = get_optional_arg_from_config(cp, section, 'logl-function') + obj = cls(model, ntemps, nwalkers, + checkpoint_interval=checkpoint_interval, + loglikelihood_function=logl, nprocesses=nprocesses, + use_mpi=use_mpi) + # set target + obj.set_target_from_config(cp, section) + # add burn-in if it's specified + obj.set_burn_in_from_config(cp) + return obj @property - def chain(self): - """Get all past samples as an ntemps x nwalker x niterations x ndim - array. 
- """ - # emcee returns the chain as ntemps x nwalker x niterations x ndim - return self._sampler.chain + def samples(self): + """A dict mapping ``variable_params`` to arrays of samples currently + in memory. - def clear_chain(self): - """Clears the chain and blobs from memory. + The arrays have shape ``ntemps x nwalkers x niterations``. """ - # store the iteration that the clear is occuring on - self.lastclear = self.niterations - # now clear the chain - self._sampler.reset() + # emcee stores samples to it's chain attribute as a + # nwalker x niterations x ndim array + raw_samples = self._sampler.chain + return raw_samples_to_dict(self, raw_samples) @property def model_stats(self): - """Returns the log likelihood ratio and log prior as a FieldArray. + """Returns the log likelihood ratio and log prior as a dict of arrays. + The returned array has shape ntemps x nwalkers x niterations. + + Unfortunately, because ``emcee_pt`` does not have blob support, this + will only return the loglikelihood, logprior, and logjacobian, + regardless of what stats the model can return. """ # likelihood has shape ntemps x nwalkers x niterations logl = self._sampler.lnlikelihood # get prior from posterior logp = self._sampler.lnprobability - logl - # compute the likelihood ratio - loglr = logl - self.model.lognl - kwargs = {'loglr': loglr, 'prior': logp} + logjacobian = numpy.zeros(logp.size) # if different coordinates were used for sampling, get the jacobian if self.model.sampling_transforms is not None: samples = self.samples - # convert to dict - d = {param: samples[param] for param in samples.fieldnames} - logj = self.model.logjacobian(**d) - kwargs['logjacobian'] = logj - return FieldArray.from_kwargs(**kwargs) - - @property - def lnpost(self): - """Get the natural logarithm of the likelihood + the prior as an - ntemps x nwalkers x niterations array. + flattened_samples = {param: arr.ravel() + for param, arr in samples.items()} + for ii in range(logp.size): + these_samples = {param: vals[ii] + for param, vals in flattened_samples.items()} + self.model.update(**these_samples) + logjacobian[ii] = self.model.logjacobian + logjacobian = logjacobian.reshape(logp.shape) + # put the logprior into the variable_params space + logp -= logjacobian + return {'loglikelihood': logl, 'logprior': logp, + 'logjacobian': logjacobian} + + def clear_samples(self): + """Clears the chain and blobs from memory. """ - # emcee returns ntemps x nwalkers x niterations - return self._sampler.lnprobability - - def set_p0(self, samples_file=None, prior=None): - """Sets the initial position of the walkers. - - Parameters - ---------- - samples_file : InferenceFile, optional - If provided, use the last iteration in the given file for the - starting positions. - prior : JointDistribution, optional - Use the given prior to set the initial positions rather than - ``model``'s prior. + # store the iteration that the clear is occuring on + self._lastclear = self.niterations + self._itercounter = 0 + # now clear the chain + self._sampler.reset() - Returns - ------- - p0 : array - An ntemps x nwalkers x ndim array of the initial positions that - were set. + def set_state_from_file(self, filename): + """Sets the state of the sampler back to the instance saved in a file. 
""" - # create a (nwalker, ndim) array for initial positions - ntemps = self.ntemps - nwalkers = self.nwalkers - ndim = len(self.variable_params) - p0 = numpy.ones((ntemps, nwalkers, ndim)) - # if samples are given then use those as initial positions - if samples_file is not None: - samples = self.read_samples(samples_file, self.variable_params, - iteration=-1, temps='all', - flatten=False)[..., 0] - # transform to sampling parameter space - samples = self.model.apply_sampling_transforms( - samples) - # draw random samples if samples are not provided - else: - samples = self.model.prior_rvs( - size=nwalkers*ntemps, prior=prior).reshape((ntemps, nwalkers)) - # convert to array - for i, param in enumerate(self.sampling_params): - p0[..., i] = samples[param] - self._p0 = p0 - return p0 - - def run(self, niterations, **kwargs): + with self.io(filename, 'r') as fp: + rstate = fp.read_random_state() + # set the numpy random state + numpy.random.set_state(rstate) + + def run_mcmc(self, niterations, **kwargs): """Advance the ensemble for a number of samples. Parameters ---------- niterations : int Number of samples to get from sampler. - - Returns - ------- - p : numpy.array - An array of current walker positions with shape (nwalkers, ndim). - lnpost : numpy.array - The list of log posterior probabilities for the walkers at - positions p, with shape (nwalkers, ndim). - rstate : - The current state of the random number generator. + \**kwargs : + All other keyword arguments are passed to the emcee sampler. """ pos = self._pos if pos is None: - pos = self.p0 + pos = self._p0 res = self._sampler.run_mcmc(pos, niterations, **kwargs) - p, lnpost, rstate = res[0], res[1], res[2] + p, _, _ = res[0], res[1], res[2] # update the positions self._pos = p - return p, lnpost, rstate - - # read/write functions - - # add ntemps and betas to metadata - def write_metadata(self, fp, **kwargs): - """Writes metadata about this sampler to the given file. Metadata is - written to the file's `attrs`. - - Parameters - ---------- - fp : InferenceFile - A file handler to an open inference file. - **kwargs : - All keyword arguments are saved as separate arguments in the - file attrs. If any keyword argument is a dictionary, the keyword - will point to the list of keys in the the file's ``attrs``. Each - key is then stored as a separate attr with its corresponding value. - """ - super(EmceePTSampler, self).write_metadata(fp, **kwargs) - fp.attrs["ntemps"] = self.ntemps - fp.attrs["betas"] = self._sampler.betas - - def write_acceptance_fraction(self, fp): - """Write acceptance_fraction data to file. Results are written to - `fp[acceptance_fraction/temp{k}]` where k is the temperature. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - """ - group = "acceptance_fraction/temp{tk}" - # acf has shape ntemps x nwalkers - acf = self.acceptance_fraction - for tk in range(fp.ntemps): - try: - fp[group.format(tk=tk)][:] = acf[tk, :] - except KeyError: - # dataset doesn't exist yet, create it - fp[group.format(tk=tk)] = acf[tk, :] - - @staticmethod - def read_acceptance_fraction(fp, temps=None, walkers=None): - """Reads the acceptance fraction from the given file. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - temps : {None, (list of) int} - The temperature index (or a list of indices) to retrieve. If None, - acfs from all temperatures and all walkers will be retrieved. 
- walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - - Returns - ------- - array - Array of acceptance fractions with shape (requested temps, - requested walkers). - """ - group = 'acceptance_fraction/temp{tk}' - if temps is None: - temps = numpy.arange(fp.ntemps) - if walkers is None: - wmask = numpy.ones(fp.nwalkers, dtype=bool) - else: - wmask = numpy.zeros(fp.nwalkers, dtype=bool) - wmask[walkers] = True - arrays = [] - for tk in temps: - arrays.extend(fp[group.format(tk=tk)][wmask]) - return arrays - - @staticmethod - def write_samples_group(fp, samples_group, parameters, samples, - start_iteration=None, max_iterations=None): - """Writes samples to the given file. - - Results are written to: - - ``fp[samples_group/{vararg}]``, - - where ``{vararg}`` is the name of a variable arg. The samples are - written as an ``ntemps x nwalkers x niterations`` array. - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - samples_group : str - Name of samples group to write. - parameters : list - The parameters to write to the file. - samples : FieldArray - The samples to write. Should be a FieldArray with fields containing - the samples to write and shape nwalkers x niterations. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. - """ - ntemps, nwalkers, niterations = samples.shape - if max_iterations is not None and max_iterations < niterations: - raise IndexError("The provided max size is less than the " - "number of iterations") - group = samples_group + '/{name}' - # loop over number of dimensions - for param in parameters: - dataset_name = group.format(name=param) - istart = start_iteration - try: - fp_niterations = fp[dataset_name].shape[-1] - if istart is None: - istart = fp_niterations - istop = istart + niterations - if istop > fp_niterations: - # resize the dataset - fp[dataset_name].resize(istop, axis=2) - except KeyError: - # dataset doesn't exist yet - if istart is not None and istart != 0: - raise ValueError("non-zero start_iteration provided, but " - "dataset doesn't exist yet") - istart = 0 - istop = istart + niterations - fp.create_dataset(dataset_name, (ntemps, nwalkers, istop), - maxshape=(ntemps, nwalkers, max_iterations), - dtype=float, fletcher32=True) - fp[dataset_name][:, :, istart:istop] = samples[param] - - def write_results(self, fp, start_iteration=None, max_iterations=None, - **metadata): - """Writes metadata, samples, model stats, and acceptance fraction - to the given file. See the write function for each of those for - details. - - Parameters - ----------- - fp : InferenceFile - A file handler to an open inference file. - start_iteration : int, optional - Write results to the file's datasets starting at the given - iteration. Default is to append after the last iteration in the - file. - max_iterations : int, optional - Set the maximum size that the arrays in the hdf file may be resized - to. Only applies if the samples have not previously been written - to file. The default (None) is to use the maximum size allowed by - h5py. 
- \**metadata : - All other keyword arguments are passed to ``write_metadata``. - """ - self.write_metadata(fp, **metadata) - self.write_chain(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_model_stats(fp, start_iteration=start_iteration, - max_iterations=max_iterations) - self.write_acceptance_fraction(fp) - self.write_state(fp) - - @staticmethod - def _read_fields(fp, fields_group, fields, array_class, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, temps=None, walkers=None, flatten=True): - """Base function for reading samples and model stats. See - `read_samples` and `read_model_stats` for details. + def write_results(self, filename): + """Writes samples, model stats, acceptance fraction, and random state + to the given file. Parameters ----------- - fp : InferenceFile - An open file handler to read the samples from. - fields_group : str - The name of the group to retrieve the desired fields. - fields : list - The list of field names to retrieve. Must be names of groups in - `fp[fields_group/]`. - array_class : FieldArray or similar - The type of array to return. Must have a `from_kwargs` attribute. - - For other details on keyword arguments, see `read_samples` and - `read_model_stats`. - - Returns - ------- - array_class - An instance of the given array class populated with values - retrieved from the fields. + filename : str + The file to write to. The file is opened using the ``io`` class + in an an append state. """ - # walkers to load - if walkers is not None: - widx = numpy.zeros(fp.nwalkers, dtype=bool) - widx[walkers] = True - nwalkers = widx.sum() - else: - widx = slice(None, None) - nwalkers = fp.nwalkers - # temperatures to load - selecttemps = False - if temps is None: - tidx = 0 - ntemps = 1 - elif isinstance(temps, int): - tidx = temps - ntemps = 1 - else: - # temps is either 'all' or a list of temperatures; - # in either case, we'll get all of the temperatures from the file; - # if not 'all', then we'll pull out the ones we want - tidx = slice(None, None) - selecttemps = temps != 'all' - if selecttemps: - ntemps = len(temps) - else: - ntemps = fp.ntemps - # get the slice to use - if iteration is not None: - get_index = iteration - niterations = 1 - else: - if thin_end is None: - # use the number of current iterations - thin_end = fp.niterations - get_index = fp.get_slice(thin_start=thin_start, thin_end=thin_end, - thin_interval=thin_interval) - # we'll just get the number of iterations from the returned shape - niterations = None - # load - arrays = {} - group = fields_group + '/{name}' - for name in fields: - arr = fp[group.format(name=name)][tidx, widx, get_index] - if niterations is None: - niterations = arr.shape[-1] - # pull out the temperatures we need - if selecttemps: - arr = arr[temps, ...] 
- if flatten: - arr = arr.flatten() - else: - # ensure that the returned array is 3D - arr = arr.reshape((ntemps, nwalkers, niterations)) - arrays[name] = arr - return array_class.from_kwargs(**arrays) + with self.io(filename, 'a') as fp: + # write samples + fp.write_samples(self.samples, self.model.variable_params) + # write stats + fp.write_samples(self.model_stats) + # write accpetance + fp.write_acceptance_fraction(self._sampler.acceptance_fraction) + # write random state + fp.write_random_state() @classmethod - def read_samples(cls, fp, parameters, - thin_start=None, thin_interval=None, thin_end=None, - iteration=None, temps=0, walkers=None, flatten=True, - samples_group=None, array_class=None): - """Reads samples for the given parameter(s). - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - parameters : (list of) strings - The parameter(s) to retrieve. A parameter can be the name of any - field in `fp[fp.samples_group]`, a virtual field or method of - `FieldArray` (as long as the file contains the necessary fields - to derive the virtual field or method), and/or a function of - these. - thin_start : int - Index of the sample to begin returning samples. Default is to read - samples after burn in. To start from the beginning set thin_start - to 0. - thin_interval : int - Interval to accept every i-th sample. Default is to use the - `fp.acl`. If `fp.acl` is not set, then use all samples - (set thin_interval to 1). - thin_end : int - Index of the last sample to read. If not given then - `fp.niterations` is used. - iteration : int - Get a single iteration. If provided, will override the - `thin_{start/interval/end}` arguments. - walkers : {None, (list of) int} - The walker index (or a list of indices) to retrieve. If None, - samples from all walkers will be obtained. - temps : {None, (list of) int, 'all'} - The temperature index (or list of indices) to retrieve. If None, - only samples from the coldest (= 0) temperature chain will be - retrieved. To retrieve all temperates pass 'all', or a list of - all of the temperatures. - flatten : {True, bool} - The returned array will be one dimensional, with all desired - samples from all desired walkers concatenated together. If False, - the returned array will have dimension requested temps x requested - walkers x requested iterations. - samples_group : {None, str} - The group in `fp` from which to retrieve the parameter fields. If - None, searches in `fp.samples_group`. - array_class : {None, array class} - The type of array to return. The class must have a `from_kwargs` - class method and a `parse_parameters` method. If None, will return - a FieldArray. - - Returns - ------- - array_class - Samples for the given parameters, as an instance of a the given - `array_class` (`FieldArray` if `array_class` is None). 
- """ - # get the group to load from - if samples_group is None: - samples_group = fp.samples_group - # get the type of array class to use - if array_class is None: - array_class = FieldArray - # get the names of fields needed for the given parameters - possible_fields = fp[samples_group].keys() - loadfields = array_class.parse_parameters(parameters, possible_fields) - return cls._read_fields( - fp, samples_group, loadfields, array_class, - thin_start=thin_start, thin_interval=thin_interval, - thin_end=thin_end, iteration=iteration, temps=temps, - walkers=walkers, flatten=flatten) - - @classmethod - def compute_acfs(cls, fp, start_index=None, end_index=None, - per_walker=False, walkers=None, parameters=None, - temps=None): - """Computes the autocorrleation function of the model params in the - given file. - - By default, parameter values are averaged over all walkers at each - iteration. The ACF is then calculated over the averaged chain for each - temperature. An ACF per-walker will be returned instead if - ``per_walker=True``. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - per_walker : optional, bool - Return the ACF for each walker separately. Default is False. - walkers : optional, int or array - Calculate the ACF using only the given walkers. If None (the - default) all walkers will be used. - parameters : optional, str or array - Calculate the ACF for only the given parameters. If None (the - default) will calculate the ACF for all of the model params. - temps : optional, (list of) int or 'all' - The temperature index (or list of indices) to retrieve. If None - (the default), the ACF will only be computed for the coldest (= 0) - temperature chain. To compute an ACF for all temperates pass 'all', - or a list of all of the temperatures. - - Returns - ------- - FieldArray - A ``FieldArray`` of the ACF vs iteration for each parameter. If - `per-walker` is True, the FieldArray will have shape - ``ntemps x nwalkers x niterations``. Otherwise, the returned - array will have shape ``ntemps x niterations``. 
- """ - acfs = {} - if parameters is None: - parameters = fp.variable_params - if isinstance(parameters, str) or isinstance(parameters, unicode): - parameters = [parameters] - if isinstance(temps, int): - temps = [temps] - elif temps == 'all': - temps = numpy.arange(fp.ntemps) - elif temps is None: - temps = [0] - for param in parameters: - subacfs = [] - for tk in temps: - if per_walker: - # just call myself with a single walker - if walkers is None: - walkers = numpy.arange(fp.nwalkers) - arrays = [cls.compute_acfs(fp, start_index=start_index, - end_index=end_index, - per_walker=False, walkers=ii, - parameters=param, - temps=tk)[param][0, :] - for ii in walkers] - # we'll stack all of the walker arrays to make a single - # nwalkers x niterations array; when these are stacked - # below, we'll get a ntemps x nwalkers x niterations array - subacfs.append(numpy.vstack(arrays)) - else: - samples = cls.read_samples(fp, param, - thin_start=start_index, - thin_interval=1, - thin_end=end_index, - walkers=walkers, temps=tk, - flatten=False)[param] - # contract the walker dimension using the mean, and flatten - # the (length 1) temp dimension - samples = samples.mean(axis=1)[0, :] - thisacf = autocorrelation.calculate_acf(samples).numpy() - subacfs.append(thisacf) - # stack the temperatures - # FIXME: the following if/else can be condensed to a single line - # using numpy.stack, once the version requirements are bumped to - # numpy >= 1.10 - if per_walker: - nw, ni = subacfs[0].shape - acfs[param] = numpy.zeros((len(temps), nw, ni), dtype=float) - for tk in range(len(temps)): - acfs[param][tk, ...] = subacfs[tk] - else: - acfs[param] = numpy.vstack(subacfs) - return FieldArray.from_kwargs(**acfs) - - @classmethod - def compute_acls(cls, fp, start_index=None, end_index=None): - """Computes the autocorrleation length for all model params and - temperatures in the given file. - - Parameter values are averaged over all walkers at each iteration and - temperature. The ACL is then calculated over the averaged chain. If - the returned ACL is `inf`, will default to the number of current - iterations. - - Parameters - ----------- - fp : InferenceFile - An open file handler to read the samples from. - start_index : {None, int} - The start index to compute the acl from. If None, will try to use - the number of burn-in iterations in the file; otherwise, will start - at the first sample. - end_index : {None, int} - The end index to compute the acl to. If None, will go to the end - of the current iteration. - - Returns - ------- - dict - A dictionary of ntemps-long arrays of the ACLs of each parameter. 
-        """
-        acls = {}
-        if end_index is None:
-            end_index = fp.niterations
-        tidx = numpy.arange(fp.ntemps)
-        for param in fp.variable_params:
-            these_acls = numpy.zeros(fp.ntemps, dtype=int)
-            for tk in tidx:
-                samples = cls.read_samples(fp, param, thin_start=start_index,
-                                           thin_interval=1, thin_end=end_index,
-                                           temps=tk, flatten=False)[param]
-                # contract the walker dimension using the mean, and flatten
-                # the (length 1) temp dimension
-                samples = samples.mean(axis=1)[0, :]
-                acl = autocorrelation.calculate_acl(samples)
-                if numpy.isinf(acl):
-                    acl = samples.size
-                these_acls[tk] = acl
-            acls[param] = these_acls
-        return acls
-
-    @classmethod
-    def calculate_logevidence(cls, fp, thin_start=None, thin_end=None,
+    def calculate_logevidence(cls, filename, thin_start=None, thin_end=None,
                               thin_interval=None):
-        """Calculates the log evidence from the given file using emcee's
+        """Calculates the log evidence from the given file using ``emcee_pt``'s
         thermodynamic integration.
 
         Parameters
         ----------
-        fp : InferenceFile
-            An open file handler to read the stats from.
+        filename : str
+            Name of the file to read the samples from. Should be an
+            ``EmceePTFile``.
         thin_start : int
             Index of the sample to begin returning stats. Default is to read
             stats after burn in. To start from the beginning set thin_start
@@ -730,27 +258,43 @@ def calculate_logevidence(cls, fp, thin_start=None, thin_end=None,
         dlnZ : float
             The error on the estimate.
         """
-        try:
-            import emcee
-        except ImportError:
-            raise ImportError("emcee is not installed.")
-
-        stats_group = fp.stats_group
-        parameters = fp[stats_group].keys()
-        logstats = cls.read_samples(fp, parameters, samples_group=stats_group,
-                                    thin_start=thin_start, thin_end=thin_end,
-                                    thin_interval=thin_interval,
-                                    temps='all', flatten=False)
-        # get the likelihoods
-        logls = logstats['loglr'] + fp.lognl
-        # we need the betas that were used
-        betas = fp.attrs['betas']
-        # annoyingly, theromdynaimc integration in PTSampler is an instance
-        # method, so we'll implement a dummy one
-        ntemps = fp.ntemps
-        nwalkers = fp.nwalkers
-        ndim = len(fp.variable_params)
+        with cls._io(filename, 'r') as fp:
+            logls = fp.read_raw_samples(['loglikelihood'],
+                                        thin_start=thin_start,
+                                        thin_interval=thin_interval,
+                                        thin_end=thin_end,
+                                        temps='all', flatten=False)
+            logls = logls['loglikelihood']
+            # we need the betas that were used
+            betas = fp.betas
+            # annoyingly, thermodynamic integration in PTSampler is an
+            # instance method, so we'll create a dummy sampler just to call it
+            ntemps = fp.ntemps
+            nwalkers = fp.nwalkers
+            ndim = len(fp.variable_params)
         dummy_sampler = emcee.PTSampler(ntemps, nwalkers, ndim, None,
                                         None, betas=betas)
         return dummy_sampler.thermodynamic_integration_log_evidence(
             logls=logls, fburnin=0.)
+
+    def finalize(self):
+        """Calculates the log evidence and writes to the checkpoint file.
+
+        The thin start/interval/end for calculating the log evidence are
+        retrieved from the checkpoint file's thinning attributes.
+ """ + logging.info("Calculating log evidence") + # get the thinning settings + with self.io(self.checkpoint_file, 'r') as fp: + thin_start = fp.thin_start + thin_interval = fp.thin_interval + thin_end = fp.thin_end + # calculate + logz, dlogz = self.calculate_logevidence( + self.checkpoint_file, thin_start=thin_start, thin_end=thin_end, + thin_interval=thin_interval) + logging.info("log Z, dlog Z: {}, {}".format(logz, dlogz)) + # write to both the checkpoint and backup + for fn in [self.checkpoint_file, self.backup_file]: + with self.io(fn, "a") as fp: + fp.write_logevidence(logz, dlogz)
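
For reference, the thermodynamic integration performed by the dummy PTSampler
estimates the evidence as ln Z ~ integral over beta from 0 to 1 of
<ln L>_beta d(beta), where <ln L>_beta is the log likelihood averaged over the
chain at inverse temperature beta. The snippet below is a minimal numpy sketch
of that integral, included only to illustrate the technique; the helper name
``thermodynamic_integration_logz`` is hypothetical and not part of this patch.
It assumes ``logls`` is an ntemps x nwalkers x niterations array (as read
above with ``temps='all', flatten=False``) and ``betas`` is the array stored
in the file, and it omits the error estimate (dlnZ) that
``thermodynamic_integration_log_evidence`` also returns.

    import numpy

    def thermodynamic_integration_logz(logls, betas):
        # average ln L over walkers and iterations for each temperature chain
        mean_logls = logls.reshape(len(betas), -1).mean(axis=1)
        # sort by beta so the trapezoid rule integrates from the hottest
        # chain (smallest beta) up to the coldest chain (beta = 1)
        order = numpy.argsort(betas)
        # ln Z ~ integral of <ln L>_beta d(beta)
        return numpy.trapz(mean_logls[order], betas[order])

With this in mind, ``finalize`` above only needs the checkpoint file name: it
reads the thinning attributes, passes the file to ``calculate_logevidence``,
and writes the resulting logz/dlogz to both the checkpoint and backup files.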