# vim: foldmethod=marker
import tensorflow as tf
from pysgmcmc.samplers.base_classes import BurnInMCMCSampler
from pysgmcmc.tensor_utils import (
vectorize, unvectorize, safe_divide, safe_sqrt,
)
from pysgmcmc.stepsize_schedules import ConstantStepsizeSchedule
class SGHMCSampler(BurnInMCMCSampler):
""" Stochastic Gradient Hamiltonian Monte-Carlo Sampler that uses a burn-in
procedure to adapt its own hyperparameters during the initial stages
of sampling.
See [1] for more details on this burn-in procedure.\n
See [2] for more details on Stochastic Gradient Hamiltonian Monte-Carlo.
[1] J. T. Springenberg, A. Klein, S. Falkner, F. Hutter
In Advances in Neural Information Processing Systems 29 (2016).\n
`Bayesian Optimization with Robust Bayesian Neural Networks. <http://aad.informatik.uni-freiburg.de/papers/16-NIPS-BOHamiANN.pdf>`_
[2] T. Chen, E. B. Fox, C. Guestrin
In Proceedings of Machine Learning Research 32 (2014).\n
`Stochastic Gradient Hamiltonian Monte Carlo <https://arxiv.org/pdf/1402.4102.pdf>`_
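
        Examples
        --------
        A minimal, illustrative sketch of drawing one sample. Hedged: the
        quadratic `cost_fun` below is only a stand-in for a real negative
        log posterior, and `next(sampler)` is assumed to yield a
        `(sample, cost)` pair as described in
        `pysgmcmc.sampling.BurnInMCMCSampler`::

            >>> import tensorflow as tf
            >>> with tf.Session() as session:
            ...     params = [tf.Variable(0., dtype=tf.float64, name="x")]
            ...     cost_fun = lambda params: 0.5 * params[0] ** 2  # N(0, 1) target
            ...     sampler = SGHMCSampler(params=params, cost_fun=cost_fun,
            ...                            session=session, burn_in_steps=100)
            ...     sample, cost = next(sampler)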
"""
    def __init__(self, params, cost_fun, batch_generator=None,
                 stepsize_schedule=ConstantStepsizeSchedule(0.01),
                 burn_in_steps=3000, mdecay=0.05, scale_grad=1.0,
                 session=None, dtype=tf.float64, seed=None):
""" Initialize the sampler parameters and set up a tensorflow.Graph
for later queries.
parameters
----------
params : list of tensorflow.Variable objects
Target parameters for which we want to sample new values.
cost_fun : callable
Function that takes `params` as input and returns a
1-d `tensorflow.Tensor` that contains the cost-value.
Frequently denoted with `U` in literature.
        batch_generator : iterable, optional
            Iterable which returns dictionaries to feed into
            tensorflow.Session.run() calls to evaluate the cost function.
            Defaults to `None`, which indicates that no batches shall be fed.
            See the Examples section below for a sketch.
stepsize_schedule : pysgmcmc.stepsize_schedules.StepsizeSchedule
Iterator class that produces a stream of stepsize values that
we can use in our samplers.
See also: `pysgmcmc.stepsize_schedules`
burn_in_steps : int, optional
Number of burn-in steps to perform. In each burn-in step, this
sampler will adapt its own internal parameters to decrease its error.
            Defaults to `3000`.
For reference see:
`Bayesian Optimization with Robust Bayesian Neural Networks. <http://aad.informatik.uni-freiburg.de/papers/16-NIPS-BOHamiANN.pdf>`_
mdecay : float, optional
(Constant) momentum decay per time-step.
            Defaults to `0.05`.
For reference see:
`Bayesian Optimization with Robust Bayesian Neural Networks. <http://aad.informatik.uni-freiburg.de/papers/16-NIPS-BOHamiANN.pdf>`_
scale_grad : float, optional
Value that is used to scale the magnitude of the noise used
during sampling. In a typical batches-of-data setting this usually
corresponds to the number of examples in the entire dataset.
Defaults to `1.0` which corresponds to no scaling.
        session : tensorflow.Session, optional
            Session object which knows about the external part of the graph
            (which defines `Cost`, and possibly batches).
            Used internally to evaluate (burn-in/sample) the sampler.
            Defaults to the current default session, as returned by
            `tensorflow.get_default_session()`.
dtype : tensorflow.DType, optional
Type of elements of `tensorflow.Tensor` objects used in this sampler.
Defaults to `tensorflow.float64`.
seed : int, optional
Random seed to use.
Defaults to `None`.
        See Also
        --------
pysgmcmc.sampling.BurnInMCMCSampler:
Base class for `SGHMCSampler` that specifies how actual sampling
is performed (using iterator protocol, e.g. `next(sampler)`).
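
        Examples
        --------
        A hedged sketch of a `batch_generator` (the names `x_placeholder`
        and `x_data` are hypothetical placeholders); it must yield feed
        dictionaries understood by `tensorflow.Session.run()`::

            >>> import numpy as np
            >>> def batch_generator(x_placeholder, x_data, batch_size=32):
            ...     while True:
            ...         indices = np.random.randint(len(x_data), size=batch_size)
            ...         yield {x_placeholder: x_data[indices]}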
"""
        # `tf.get_default_session()` must be resolved inside `__init__`:
        # as a default argument it would be evaluated only once, at import
        # time, when no session may exist yet.
        if session is None:
            session = tf.get_default_session()

        # Set up BurnInMCMCSampler base class:
        # initialize member variables common to all samplers
        # and run initializers for all uninitialized variables in `params`
        # (to avoid errors in the graph definitions below).
        super().__init__(
            params=params, cost_fun=cost_fun, burn_in_steps=burn_in_steps,
            batch_generator=batch_generator,
            seed=seed, dtype=dtype, session=session,
            stepsize_schedule=stepsize_schedule
        )
# Initialize graph constants {{{ #
noise = tf.constant(0., name="noise", dtype=dtype)
scale_grad = tf.constant(scale_grad, dtype=dtype, name="scale_grad")
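        # Note: dividing the stepsize by sqrt(scale_grad) compensates for
        # gradients that are scaled up to the size of the full dataset,
        # which keeps the injected noise at an appropriate magnitude.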
epsilon_scaled = tf.divide(self.epsilon, tf.sqrt(scale_grad), name="epsilon_scaled")
mdecay = tf.constant(mdecay, name="mdecay", dtype=dtype)
# }}} Initialize graph constants #
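        # Gradients of the cost with respect to each target parameter,
        # flattened so that all sampler updates below operate on vectors.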
grads = [vectorize(gradient) for gradient in
tf.gradients(self.cost, params)]
# Initialize internal sampler parameters {{{ #
tau = [tf.Variable(tf.ones_like(param, dtype=dtype),
dtype=dtype, name="tau_{}".format(i),
trainable=False)
for i, param in enumerate(self.vectorized_params)]
r = [tf.Variable(1. / (tau[i].initialized_value() + 1),
name="R_{}".format(i), trainable=False)
for i, param in enumerate(self.vectorized_params)]
g = [tf.Variable(tf.ones_like(param, dtype=dtype),
dtype=dtype, name="g_{}".format(i),
trainable=False)
for i, param in enumerate(self.vectorized_params)]
v_hat = [tf.Variable(tf.ones_like(param, dtype=dtype),
dtype=dtype, name="v_hat_{}".format(i),
trainable=False)
for i, param in enumerate(self.vectorized_params)]
# Initialize Mass matrix inverse
minv = [tf.Variable(tf.divide(tf.constant(1., dtype=dtype),
tf.sqrt(v_hat[i].initialized_value())),
name="minv_{}".format(i), trainable=False)
for i, param in enumerate(self.vectorized_params)]
# Initialize momentum
V = [tf.Variable(tf.zeros_like(param, dtype=dtype),
dtype=dtype, name="v_{}".format(i),
trainable=False)
for i, param in enumerate(self.vectorized_params)]
# }}} Initialize internal sampler parameters #
        self.minv_t = [None] * len(params)  # gets burned-in

        # Note: r_t is deliberately computed as 1 / (tau + 1) rather than
        # 1 / tau; empirically, using 1 / tau destabilizes the burn-in.
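        # Burn-in adaptation (cf. [1]): tau acts as an adaptive averaging
        # window, g and v_hat track exponential moving averages of the
        # gradient and of its element-wise square, and minv approximates a
        # diagonal preconditioner M^{-1} = 1 / sqrt(v_hat).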
for i, (param, grad) in enumerate(zip(params, grads)):
vectorized_param = self.vectorized_params[i]
# Burn-in logic {{{ #
r_t = tf.assign(r[i], 1. / (tau[i] + 1), name="r_t_{}".format(i))
# r_t should always use the old value of tau
with tf.control_dependencies([r_t]):
tau_t = tf.assign_add(
tau[i],
safe_divide(-g[i] * g[i] * tau[i], v_hat[i]) + 1,
name="tau_t_{}".format(i)
)
# minv = v_hat^{-1/2} = 1 / sqrt(v_hat)
self.minv_t[i] = tf.assign(
minv[i],
safe_divide(1., safe_sqrt(v_hat[i])),
name="minv_t_{}".format(i)
)
                # tau_t, minv_t should always use the old values of g, v_hat
with tf.control_dependencies([tau_t, self.minv_t[i]]):
g_t = tf.assign_add(
g[i],
-r_t * g[i] + r_t * grad,
name="g_t_{}".format(i)
)
v_hat_t = tf.assign_add(
v_hat[i],
- r_t * v_hat[i] + r_t * grad ** 2,
name="v_hat_t_{}".format(i)
)
# }}} Burn-in logic #
with tf.control_dependencies([g_t, v_hat_t]):
# Draw random normal sample {{{ #
                        # Equation 10 of [1]: variance of the injected
                        # normal noise,
                        #   2 * epsilon ** 2 * mdecay * minv
                        #       - 2 * epsilon ** 3 * minv ** 2 * noise
                        #       - epsilon ** 4
                        # where `noise` (the gradient-noise estimate) is the
                        # constant 0 here.
                        noise_scale = (
                            tf.constant(2., dtype=dtype) *
                            epsilon_scaled ** tf.constant(2., dtype=dtype) *
                            mdecay * self.minv_t[i] -
                            tf.constant(2., dtype=dtype) *
                            epsilon_scaled ** tf.constant(3., dtype=dtype) *
                            tf.square(self.minv_t[i]) * noise -
                            epsilon_scaled ** 4
                        )
                        # turn variance into a standard deviation; clamp at
                        # 1e-16 to keep tf.sqrt numerically well-defined
                        sigma = tf.sqrt(tf.maximum(noise_scale, 1e-16),
                                        name="sigma_{}".format(i))
sample = self._draw_noise_sample(
sigma=sigma, shape=vectorized_param.shape
)
# }}} Draw random sample #
# HMC Update {{{ #
# Equation 10: right side, where:
# Minv = v_hat^{-1/2}, Mdecay = epsilon * v_hat^{-1/2} C
v_t = tf.assign_add(
V[i],
- self.epsilon ** 2 * self.minv_t[i] * grad -
mdecay * V[i] + sample,
name="v_t_{}".format(i)
)
# Equation 10: left side
                        vectorized_theta_t = tf.assign_add(
                            vectorized_param, v_t
                        )

                        self.theta_t[i] = tf.assign(
                            param,
                            unvectorize(
                                vectorized_theta_t, original_shape=param.shape
                            ),
                            name="theta_t_{}".format(i)
                        )
# }}} HMC Update #