import tensorflow as tf
from pysgmcmc.tensor_utils import pdist, squareform, median
from pysgmcmc.stepsize_schedules import ConstantStepsizeSchedule
from pysgmcmc.samplers.base_classes import MCMCSampler
# XXX: The interface needs to change further: `particles` should be a
# List[List[tensorflow.Variable]], where each inner list is one guess of a
# network. This would allow the BNN code to change so that SVGD becomes
# applicable to our BNN.
class SVGDSampler(MCMCSampler):
    """ Stein Variational Gradient Descent Sampler.

        See [1] for more details on Stein variational gradient descent.

        [1] Q. Liu, D. Wang
            `Stein Variational Gradient Descent:
            A General Purpose Bayesian Inference Algorithm.
            <https://arxiv.org/pdf/1608.04471>`_
            In Advances in Neural Information Processing Systems 29 (2016).
    """
    def __init__(self, particles, cost_fun, batch_generator=None,
stepsize_schedule=ConstantStepsizeSchedule(0.1),
alpha=0.9, fudge_factor=1e-6, session=tf.get_default_session(),
dtype=tf.float64, seed=None):
""" Initialize the sampler parameters and set up a tensorflow.Graph
for later queries.
Parameters
----------
particles : List[tensorflow.Variable]
List of particles each representing a (different) guess of the
target parameters of this sampler.
cost_fun : callable
Function that takes `params` of *one* particle as input and
returns a 1-d `tensorflow.Tensor` that contains the cost-value.
Frequently denoted with `U` in literature.
batch_generator : iterable, optional
Iterable which returns dictionaries to feed into
tensorflow.Session.run() calls to evaluate the cost function.
Defaults to `None` which indicates that no batches shall be fed.
stepsize_schedule : pysgmcmc.stepsize_schedules.StepsizeSchedule
Iterator class that produces a stream of stepsize values that
we can use in our samplers.
See also: `pysgmcmc.stepsize_schedules`
        alpha : float, optional
            Decay rate of the exponential moving average of squared
            update directions (`historical_grad`) that is used for the
            AdaGrad-style rescaling of the update step.
            Defaults to `0.9`.
        fudge_factor : float, optional
            Small constant added to the denominator of the AdaGrad-style
            rescaling to avoid division by zero.
            Defaults to `1e-6`.
session : tensorflow.Session, optional
Session object which knows about the external part of the graph
(which defines `Cost`, and possibly batches).
Used internally to evaluate (burn-in/sample) the sampler.
dtype : tensorflow.DType, optional
Type of elements of `tensorflow.Tensor` objects used in this sampler.
Defaults to `tensorflow.float64`.
seed : int, optional
Random seed to use.
Defaults to `None`.
See Also
----------
        pysgmcmc.samplers.base_classes.MCMCSampler:
            Base class of `SVGDSampler` that specifies how actual sampling
            is performed (using the iterator protocol, e.g. `next(sampler)`).
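        Examples
        ----------
        A minimal sketch of how this sampler might be set up and stepped;
        the particles and the quadratic `cost_fun` below (negative
        log-density of a standard normal) are illustrative assumptions,
        not fixtures of the library:

        >>> import tensorflow as tf
        >>> graph = tf.Graph()
        >>> with graph.as_default():
        ...     particles = [
        ...         tf.Variable(tf.random_normal((2,), dtype=tf.float64))
        ...         for _ in range(20)
        ...     ]
        ...     def cost_fun(params):
        ...         return 0.5 * tf.reduce_sum(params ** 2)
        ...     session = tf.Session(graph=graph)
        ...     sampler = SVGDSampler(
        ...         particles=particles, cost_fun=cost_fun, session=session
        ...     )
        ...     session.run(tf.global_variables_initializer())
        ...     sample, cost = next(sampler)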
"""
assert isinstance(alpha, (int, float))
assert isinstance(fudge_factor, (int, float))
assert callable(cost_fun)
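        # Stack the individual particle variables into a single tensor
        # whose first axis indexes particles; the kernel and gradient
        # computations below operate on this stacked tensor.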
self.particles = tf.stack(particles)
        def cost_fun_wrapper(params):
            # Evaluate `cost_fun` once per particle; the `params` argument
            # is required by the base class interface but unused here,
            # since all particles are evaluated jointly.
            return tf.map_fn(cost_fun, self.particles)
        cost_fun_wrapper.__name__ = cost_fun.__name__
super().__init__(
params=particles,
cost_fun=cost_fun_wrapper,
batch_generator=batch_generator,
session=session, seed=seed, dtype=dtype,
stepsize_schedule=stepsize_schedule
)
fudge_factor = tf.constant(
fudge_factor, dtype=self.dtype, name="fudge_factor"
)
self.epsilon = tf.Variable(
stepsize_schedule.initial_value, dtype=self.dtype, name="stepsize"
)
self.n_particles = tf.cast(
self.particles.shape[0], self.dtype
)
historical_grad = tf.get_variable(
"historical_grad", self.particles.shape, dtype=dtype,
initializer=tf.zeros_initializer()
)
self.session.run(
tf.variables_initializer([historical_grad, self.epsilon])
)
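        # Gradient of the cost (`U`) at every particle position; combined
        # with the kernel below, this yields the SVGD update direction
        # of [1]: a kernel-smoothed gradient term plus a repulsive term
        # from the kernel gradients, averaged over all particles.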
lnpgrad = tf.squeeze(tf.gradients(self.cost, self.particles))
kernel_matrix, kernel_gradients = self.svgd_kernel(self.particles)
grad_theta = tf.divide(
tf.matmul(kernel_matrix, lnpgrad) + kernel_gradients,
self.n_particles
)
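        # AdaGrad-style accumulator: an exponential moving average (decay
        # rate `alpha`) of squared update directions, used to rescale the
        # step size per coordinate.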
historical_grad_t = tf.assign(
historical_grad,
alpha * historical_grad + (1. - alpha) * (grad_theta ** 2)
)
adj_grad = tf.divide(
grad_theta,
fudge_factor + tf.sqrt(historical_grad_t)
)
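        # Gradient-descent step on the cost: move each particle's
        # parameters along the rescaled update direction,
        # `theta <- theta - epsilon * adj_grad`.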
for i, param in enumerate(self.params):
self.theta_t[i] = tf.assign_sub(
param,
self.epsilon * adj_grad[i]
)
    def svgd_kernel(self, particles):
        """ Compute an RBF kernel matrix for the given `particles`,
            together with the corresponding kernel gradients.

            The kernel bandwidth `h` is set by the median heuristic,
            `h ** 2 = 0.5 * median(squared pairwise distances) / log(n + 1)`,
            which makes `sum_j k(x_i, x_j) ~= n / (n + 1) ~= 1` for every
            particle `x_i`.

        Parameters
        ----------
        particles : tensorflow.Tensor
            2-d tensor of shape `(n_particles, n_params)` that contains
            the current position of every particle.

        Returns
        ----------
        kernel_matrix : tf.Tensor
            `(n_particles, n_particles)` matrix with entries
            `k(x_i, x_j) = exp(-||x_i - x_j|| ** 2 / (2 * h ** 2))`.

        kernel_gradients : tf.Tensor
            `(n_particles, n_params)` tensor whose row `i` contains
            `sum_j grad_{x_j} k(x_j, x_i)`.
        """
        euclidean_distances = pdist(particles)
        squared_distances = squareform(euclidean_distances) ** 2
        # Median heuristic for the kernel bandwidth: `h` is chosen such
        # that `sum_j k(x_i, x_j) ~= n / (n + 1) ~= 1` for every particle.
        h = tf.sqrt(
            0.5 * median(squared_distances) / tf.log(self.n_particles + 1.)
        )
        kernel_matrix = tf.exp(-squared_distances / (2. * h ** 2))
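        # Kernel gradients: row `i` of `kernel_gradients` equals
        # `sum_j k(x_i, x_j) * (x_i - x_j)`, which (after the division by
        # `h ** 2` in the return statement) is `sum_j grad_{x_j} k(x_j, x_i)`.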
kernel_sum = tf.reduce_sum(kernel_matrix, axis=1)
kernel_gradients = tf.add(
-tf.matmul(kernel_matrix, particles),
tf.multiply(particles, tf.expand_dims(kernel_sum, axis=1))
)
return kernel_matrix, kernel_gradients / (h ** 2)
    # XXX: Probably unnecessary; changes should happen at the top level.
    # However, using this to test *just* the SVGD implementation and make
    # it conform to the list-of-lists interface still seems reasonable.
    # Later: make the BNN use multiple `get_net` calls to obtain variables
    # and extract the appropriate groups from `tf.trainable_variables`
    # (using a scope prefix).
    def _duplicate_variables(self, variables, duplicate_index):
        """ Create one duplicate of each variable in `variables`, reusing
            its initial value and suffixing its name with
            `duplicate_index`. """
        duplicate = []
        for var in variables:
            name = "{}_{}".format(var.name.split(":")[0], duplicate_index)
            # NOTE: reaches into private state of the initializer op to
            # recover the original initial value for the duplicate.
            dup_var = tf.get_variable(
                name, initializer=var.initializer._inputs[1]
            )
            duplicate.append(dup_var)
        return duplicate