Source code for pysgmcmc.samplers.sghmc

# vim: foldmethod=marker
import tensorflow as tf
from pysgmcmc.samplers.base_classes import BurnInMCMCSampler

from pysgmcmc.tensor_utils import (
    vectorize, unvectorize, safe_divide, safe_sqrt,
)

from pysgmcmc.stepsize_schedules import ConstantStepsizeSchedule


class SGHMCSampler(BurnInMCMCSampler):
    """ Stochastic Gradient Hamiltonian Monte-Carlo Sampler that uses a burn-in
        procedure to adapt its own hyperparameters during the initial stages
        of sampling.

        See [1] for more details on this burn-in procedure.
        See [2] for more details on Stochastic Gradient Hamiltonian Monte-Carlo.

        [1] J. T. Springenberg, A. Klein, S. Falkner, F. Hutter
            In Advances in Neural Information Processing Systems 29 (2016).
            `Bayesian Optimization with Robust Bayesian Neural Networks. <http://aad.informatik.uni-freiburg.de/papers/16-NIPS-BOHamiANN.pdf>`_

        [2] T. Chen, E. B. Fox, C. Guestrin
            In Proceedings of Machine Learning Research 32 (2014).
            `Stochastic Gradient Hamiltonian Monte Carlo <https://arxiv.org/pdf/1402.4102.pdf>`_

    """
    def __init__(self, params, cost_fun, batch_generator=None,
                 stepsize_schedule=ConstantStepsizeSchedule(0.01),
                 burn_in_steps=3000, mdecay=0.05, scale_grad=1.0,
                 session=tf.get_default_session(), dtype=tf.float64,
                 seed=None):
        """ Initialize the sampler parameters and set up a tensorflow.Graph
            for later queries.

        Parameters
        ----------
        params : list of tensorflow.Variable objects
            Target parameters for which we want to sample new values.

        cost_fun : callable
            Function that takes `params` as input and returns a
            1-d `tensorflow.Tensor` that contains the cost-value.
            Frequently denoted with `U` in literature.

        batch_generator : iterable, optional
            Iterable which returns dictionaries to feed into
            tensorflow.Session.run() calls to evaluate the cost function.
            Defaults to `None` which indicates that no batches shall be fed.

        stepsize_schedule : pysgmcmc.stepsize_schedules.StepsizeSchedule
            Iterator class that produces a stream of stepsize values that
            we can use in our samplers.
            See also: `pysgmcmc.stepsize_schedules`

        burn_in_steps : int, optional
            Number of burn-in steps to perform. In each burn-in step, this
            sampler will adapt its own internal parameters to decrease
            its error. Defaults to `3000`.
            For reference see:
            `Bayesian Optimization with Robust Bayesian Neural Networks. <http://aad.informatik.uni-freiburg.de/papers/16-NIPS-BOHamiANN.pdf>`_

        mdecay : float, optional
            (Constant) momentum decay per time-step.
            Defaults to `0.05`.
            For reference see:
            `Bayesian Optimization with Robust Bayesian Neural Networks. <http://aad.informatik.uni-freiburg.de/papers/16-NIPS-BOHamiANN.pdf>`_

        scale_grad : float, optional
            Value that is used to scale the magnitude of the noise used
            during sampling. In a typical batches-of-data setting this
            usually corresponds to the number of examples in the
            entire dataset.
            Defaults to `1.0` which corresponds to no scaling.

        session : tensorflow.Session, optional
            Session object which knows about the external part of the graph
            (which defines `Cost`, and possibly batches).
            Used internally to evaluate (burn-in/sample) the sampler.

        dtype : tensorflow.DType, optional
            Type of elements of `tensorflow.Tensor` objects used in this
            sampler. Defaults to `tensorflow.float64`.

        seed : int, optional
            Random seed to use.
            Defaults to `None`.

        See Also
        --------
        pysgmcmc.sampling.BurnInMCMCSampler:
            Base class for `SGHMCSampler` that specifies how actual sampling
            is performed (using iterator protocol, e.g. `next(sampler)`).

        """

        # Set up BurnInMCMCSampler base class:
        # initialize member variables common to all samplers
        # and run initializers for all uninitialized variables in `params`
        # (to avoid errors in the graph definitions below).
        super().__init__(
            params=params, cost_fun=cost_fun, burn_in_steps=burn_in_steps,
            batch_generator=batch_generator, seed=seed, dtype=dtype,
            session=session, stepsize_schedule=stepsize_schedule
        )

        #  Initialize graph constants {{{ #

        noise = tf.constant(0., name="noise", dtype=dtype)

        scale_grad = tf.constant(scale_grad, dtype=dtype, name="scale_grad")

        epsilon_scaled = tf.divide(
            self.epsilon, tf.sqrt(scale_grad), name="epsilon_scaled"
        )

        mdecay = tf.constant(mdecay, name="mdecay", dtype=dtype)

        #  }}} Initialize graph constants #

        grads = [
            vectorize(gradient)
            for gradient in tf.gradients(self.cost, params)
        ]

        #  Initialize internal sampler parameters {{{ #

        tau = [tf.Variable(tf.ones_like(param, dtype=dtype),
                           dtype=dtype, name="tau_{}".format(i),
                           trainable=False)
               for i, param in enumerate(self.vectorized_params)]

        r = [tf.Variable(1. / (tau[i].initialized_value() + 1),
                         name="R_{}".format(i), trainable=False)
             for i, param in enumerate(self.vectorized_params)]

        g = [tf.Variable(tf.ones_like(param, dtype=dtype),
                         dtype=dtype, name="g_{}".format(i),
                         trainable=False)
             for i, param in enumerate(self.vectorized_params)]

        v_hat = [tf.Variable(tf.ones_like(param, dtype=dtype),
                             dtype=dtype, name="v_hat_{}".format(i),
                             trainable=False)
                 for i, param in enumerate(self.vectorized_params)]

        # Initialize Mass matrix inverse
        minv = [tf.Variable(tf.divide(tf.constant(1., dtype=dtype),
                                      tf.sqrt(v_hat[i].initialized_value())),
                            name="minv_{}".format(i), trainable=False)
                for i, param in enumerate(self.vectorized_params)]

        # Initialize momentum
        V = [tf.Variable(tf.zeros_like(param, dtype=dtype),
                         dtype=dtype, name="v_{}".format(i),
                         trainable=False)
             for i, param in enumerate(self.vectorized_params)]

        #  }}} Initialize internal sampler parameters #

        self.minv_t = [None] * len(params)  # gets burned-in

        # R_t = 1 / (tau + 1), shouldn't it be: 1 / tau according to terms?
        # It is not, and changing it to that breaks everything!
        # Why?

        for i, (param, grad) in enumerate(zip(params, grads)):
            vectorized_param = self.vectorized_params[i]

            #  Burn-in logic {{{ #
            r_t = tf.assign(r[i], 1. / (tau[i] + 1), name="r_t_{}".format(i))

            # r_t should always use the old value of tau
            with tf.control_dependencies([r_t]):
                tau_t = tf.assign_add(
                    tau[i],
                    safe_divide(-g[i] * g[i] * tau[i], v_hat[i]) + 1,
                    name="tau_t_{}".format(i)
                )

                # minv = v_hat^{-1/2} = 1 / sqrt(v_hat)
                self.minv_t[i] = tf.assign(
                    minv[i],
                    safe_divide(1., safe_sqrt(v_hat[i])),
                    name="minv_t_{}".format(i)
                )

                # tau_t, minv_t should always use the old values of G, v_hat
                with tf.control_dependencies([tau_t, self.minv_t[i]]):
                    g_t = tf.assign_add(
                        g[i],
                        -r_t * g[i] + r_t * grad,
                        name="g_t_{}".format(i)
                    )

                    v_hat_t = tf.assign_add(
                        v_hat[i],
                        - r_t * v_hat[i] + r_t * grad ** 2,
                        name="v_hat_t_{}".format(i)
                    )

                    #  }}} Burn-in logic #

                    with tf.control_dependencies([g_t, v_hat_t]):

                        #  Draw random normal sample {{{ #

                        # Equation 10, variance of normal sample

                        # 2 * epsilon ** 2 * mdecay * Minv - 0 (noise is 0) - epsilon ** 4
                        # = 2 * epsilon ** 2 * epsilon * v_hat^{-1/2} * C * Minv
                        # = 2 * epsilon ** 3 * v_hat^{-1/2} * C * v_hat^{-1/2} - epsilon ** 4

                        # (co-) variance of normal sample
                        noise_scale = (
                            tf.constant(2., dtype=dtype) *
                            epsilon_scaled ** tf.constant(2., dtype=dtype) *
                            mdecay * self.minv_t[i] -
                            tf.constant(2., dtype=dtype) *
                            epsilon_scaled ** tf.constant(3., dtype) *
                            tf.square(self.minv_t[i]) * noise -
                            epsilon_scaled ** 4
                        )

                        # turn into stddev
                        sigma = tf.sqrt(
                            tf.maximum(noise_scale, 1e-16),
                            name="sigma_{}".format(i)
                        )

                        sample = self._draw_noise_sample(
                            sigma=sigma, shape=vectorized_param.shape
                        )

                        #  }}} Draw random sample #

                        #  HMC Update {{{ #

                        # Equation 10: right side, where:
                        # Minv = v_hat^{-1/2}, Mdecay = epsilon * v_hat^{-1/2} C
                        v_t = tf.assign_add(
                            V[i],
                            - self.epsilon ** 2 * self.minv_t[i] * grad -
                            mdecay * V[i] + sample,
                            name="v_t_{}".format(i)
                        )

                        # Equation 10: left side
                        vectorized_Theta_t = tf.assign_add(
                            vectorized_param, v_t
                        )

                        self.theta_t[i] = tf.assign(
                            param,
                            unvectorize(
                                vectorized_Theta_t, original_shape=param.shape
                            ),
                            name="theta_t_{}".format(i)
                        )

                        #  }}} HMC Update #
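For orientation, the `noise_scale`, `v_t` and `theta_t` expressions above can be written out as follows. This is only a restatement of the code (with the injected-noise estimate `noise` fixed to zero and the shorthand epsilon_hat = epsilon / sqrt(scale_grad)), not a formula quoted verbatim from [1]:

    % Variance of the Gaussian sample (noise_scale), with the noise estimate B set to 0:
    \sigma^2 = 2\,\hat{\epsilon}^{2}\,\mathrm{mdecay}\,\hat{M}^{-1}
             - 2\,\hat{\epsilon}^{3}\,\bigl(\hat{M}^{-1}\bigr)^{2} B
             - \hat{\epsilon}^{4},
    \qquad \hat{M}^{-1} = \hat{v}^{-1/2}

    % Momentum and parameter updates (v_t and theta_t):
    v \leftarrow v - \epsilon^{2}\,\hat{M}^{-1}\,\nabla U(\theta) - \mathrm{mdecay}\,v + \mathcal{N}(0, \sigma^{2}),
    \qquad \theta \leftarrow \theta + v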
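As a usage illustration, here is a minimal sketch (not part of this module) that samples a one-dimensional standard normal with `SGHMCSampler`. The toy `cost_fun` and the variable `x` are invented for this example, and the assumption that each sampler step yields a `(sample, cost)` pair follows the iterator-protocol usage mentioned in the base-class reference above.

    import tensorflow as tf

    from pysgmcmc.samplers.sghmc import SGHMCSampler


    def cost_fun(x):
        # Toy cost: negative log density of a standard normal (up to a constant),
        # so draws should look roughly like N(0, 1).
        return 0.5 * tf.reduce_sum(tf.square(x))


    with tf.Session() as session:
        x = tf.Variable(tf.zeros((1,), dtype=tf.float64), dtype=tf.float64, name="x")

        sampler = SGHMCSampler(
            params=[x], cost_fun=cost_fun, session=session,
            burn_in_steps=1000, dtype=tf.float64, seed=1
        )
        # Initialize the target variable and the sampler's internal variables.
        session.run(tf.global_variables_initializer())

        samples = []
        for _ in range(5000):
            # Assumption: each step yields (sample, cost); the first
            # `burn_in_steps` draws also adapt the sampler's internal parameters.
            sample, _ = next(sampler)
            samples.append(sample)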