Source code for pysgmcmc.data_batches

import logging

import numpy as np

__all__ = (
    "generate_batches",
    "generate_shuffled_batches",
)


def generate_batches(x, y, x_placeholder, y_placeholder,
                     batch_size=20, seed=None):
    """ Infinite generator of random minibatches for a dataset.

    For general reference on (infinite) generators, see:
    https://www.python.org/dev/peps/pep-0255/

    Parameters
    ----------
    x : np.ndarray (N, D)
        Training data points/features
    y : np.ndarray (N, 1)
        Training data labels
    x_placeholder : tensorflow.placeholder
        Placeholder for batches of data from `x`.
    y_placeholder : tensorflow.placeholder
        Placeholder for batches of data from `y`.
    batch_size : int, optional
        Number of datapoints to put into a batch.
    seed : int, optional
        Random seed to use during batch generation.
        Defaults to `None`.

    Yields
    ------
    batch_dict : dict
        A dictionary that maps `x_placeholder` and `y_placeholder` to
        `batch_size` sized minibatches of data (numpy.ndarrays) from the
        dataset `x`, `y`.

    Examples
    --------
    Simple batch extraction example:

    >>> import numpy as np
    >>> import tensorflow as tf
    >>> N, D = 100, 3  # 100 datapoints with 3 features each
    >>> x = np.asarray([np.random.uniform(-10, 10, D) for _ in range(N)])
    >>> y = np.asarray([np.random.choice([0., 1.]) for _ in range(N)])
    >>> x.shape, y.shape
    ((100, 3), (100,))

    >>> x_placeholder, y_placeholder = tf.placeholder(dtype=tf.float64), tf.placeholder(dtype=tf.float64)
    >>> batch_size = 20
    >>> gen = generate_batches(x, y, x_placeholder, y_placeholder, batch_size)
    >>> batch_dict = next(gen)  # extract a batch
    >>> set(batch_dict.keys()) == set((x_placeholder, y_placeholder))
    True
    >>> batch_dict[x_placeholder].shape, batch_dict[y_placeholder].shape
    ((20, 3), (20, 1))

    Batch extraction resizes the batch size if the dataset is too small:

    >>> import numpy as np
    >>> import tensorflow as tf
    >>> N, D = 10, 3  # 10 datapoints with 3 features each
    >>> x = np.asarray([np.random.uniform(-10, 10, D) for _ in range(N)])
    >>> y = np.asarray([np.random.choice([0., 1.]) for _ in range(N)])
    >>> x.shape, y.shape
    ((10, 3), (10,))

    >>> x_placeholder, y_placeholder = tf.placeholder(dtype=tf.float64), tf.placeholder(dtype=tf.float64)
    >>> batch_size = 20
    >>> gen = generate_batches(x, y, x_placeholder, y_placeholder, batch_size)
    >>> batch_dict = next(gen)  # extract a batch
    >>> set(batch_dict.keys()) == set((x_placeholder, y_placeholder))
    True
    >>> batch_dict[x_placeholder].shape, batch_dict[y_placeholder].shape
    ((10, 3), (10, 1))

    In this case, the batches contain exactly all datapoints:

    >>> np.allclose(batch_dict[x_placeholder], x), np.allclose(batch_dict[y_placeholder].reshape(N,), y)
    (True, True)

    """
    # Sanitize inputs
    assert isinstance(batch_size, int), \
        "generate_batches: batch size must be an integer."
    assert batch_size > 0, \
        "generate_batches: batch size must be greater than zero."
    assert seed is None or isinstance(seed, int), \
        "generate_batches: seed must be an integer or `None`."
    assert seed is None or (0 <= seed <= 2 ** 32 - 1), \
        "generate_batches: seed must fit into an unsigned 32-bit integer."
    assert y.shape[0] == x.shape[0], "Not exactly one label per datapoint!"

    n_examples = x.shape[0]

    if seed is None:
        seed = np.random.randint(1, 100000)

    rng = np.random.RandomState()
    rng.seed(seed)

    # Check if we have enough datapoints to form a minibatch;
    # otherwise set the batch size equal to the number of input points.
    initial_batch_size = batch_size
    batch_size = min(initial_batch_size, n_examples)

    if initial_batch_size != batch_size:
        logging.error("Not enough datapoints to form a minibatch. "
                      "Batchsize was set to %s", batch_size)

    while True:
        # `rng.randint` is end-exclusive => for n_examples == batch_size,
        # start == 0 holds.
        start = rng.randint(0, (n_examples - batch_size + 1))

        minibatch_x = x[start:start + batch_size]
        minibatch_y = y[start:start + batch_size, None]

        feed_dict = {
            x_placeholder: minibatch_x,
            y_placeholder: minibatch_y.reshape(-1, 1)
        }
        yield feed_dict
def generate_shuffled_batches(x, y, x_placeholder, y_placeholder,
                              batch_size=20, seed=None):
    """ Infinite generator of shuffled random minibatches for a dataset.

    For general reference on (infinite) generators, see:
    https://www.python.org/dev/peps/pep-0255/

    Parameters
    ----------
    x : np.ndarray (N, D)
        Training data points/features
    y : np.ndarray (N, 1)
        Training data labels
    x_placeholder : tensorflow.placeholder
        Placeholder for batches of data from `x`.
    y_placeholder : tensorflow.placeholder
        Placeholder for batches of data from `y`.
    batch_size : int, optional
        Number of datapoints to put into a batch.
    seed : int, optional
        Random seed to use during batch generation (and for shuffling!).
        Defaults to `None`.

    Yields
    ------
    batch_dict : dict
        A dictionary that maps `x_placeholder` and `y_placeholder` to
        `batch_size` sized minibatches of data (numpy.ndarrays) from the
        dataset `x`, `y`.

    Examples
    --------
    Simple shuffled batch extraction example:

    >>> import numpy as np
    >>> import tensorflow as tf
    >>> N, D = 100, 3  # 100 datapoints with 3 features each
    >>> x = np.asarray([np.random.uniform(-10, 10, D) for _ in range(N)])
    >>> y = np.asarray([np.random.choice([0., 1.]) for _ in range(N)])
    >>> x.shape, y.shape
    ((100, 3), (100,))

    >>> x_placeholder, y_placeholder = tf.placeholder(dtype=tf.float64), tf.placeholder(dtype=tf.float64)
    >>> batch_size = 20
    >>> gen = generate_shuffled_batches(x, y, x_placeholder, y_placeholder, batch_size)
    >>> batch_dict = next(gen)  # extract a batch
    >>> set(batch_dict.keys()) == set((x_placeholder, y_placeholder))
    True
    >>> batch_dict[x_placeholder].shape, batch_dict[y_placeholder].shape
    ((20, 3), (20, 1))

    TODO: Demonstrate that shuffled batches are shuffled correctly,
    e.g. that a datapoint still matches its corresponding label
    (see the alignment check sketched after this function).

    """
    # Always use a seed so that `x` and `y` are shuffled in the same way.
    if seed is None:
        seed = np.random.randint(1, 100000)

    rng_x, rng_y = np.random.RandomState(), np.random.RandomState()
    rng_x.seed(seed)
    rng_y.seed(seed)

    for batch in generate_batches(x, y, x_placeholder, y_placeholder,
                                  batch_size, seed):
        # Two identically seeded generators apply the same (in-place)
        # permutation to the rows of `x` and `y`, keeping them aligned.
        rng_x.shuffle(batch[x_placeholder])
        rng_y.shuffle(batch[y_placeholder])
        yield batch
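The docstring's TODO above asks for a demonstration that shuffling preserves the datapoint/label pairing. A numpy-only sketch (an editor's addition): the string keys "x" and "y" stand in for TensorFlow placeholders, which `generate_shuffled_batches` only ever uses as dictionary keys.

import numpy as np

from pysgmcmc.data_batches import generate_shuffled_batches

# Dataset where row `i` of `x` encodes its own label `y[i] = i`, so any
# misalignment between datapoints and labels is immediately visible.
N, D = 100, 3
x = np.repeat(np.arange(N, dtype=np.float64)[:, None], D, axis=1)
y = np.arange(N, dtype=np.float64)

# Plain strings as stand-ins for placeholders (used only as dict keys here).
gen = generate_shuffled_batches(x, y, "x", "y", batch_size=20, seed=1)

batch = next(gen)
# After shuffling, every datapoint must still carry its own label.
# Note: the shuffles operate in place on views of the source arrays,
# so `x` and `y` themselves are permuted (consistently) as batches are drawn.
assert np.allclose(batch["x"][:, 0], batch["y"].ravel())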