Source code for pysgmcmc.data_batches

import logging

import numpy as np

__all__ = (
    "generate_batches",
    "generate_shuffled_batches",
)


def generate_batches(x, y, x_placeholder, y_placeholder,
                     batch_size=20, seed=None):
    """ Infinite generator of random minibatches for a dataset.

    For general reference on (infinite) generators, see:
    https://www.python.org/dev/peps/pep-0255/

    Parameters
    ----------
    x : np.ndarray (N, D)
        Training data points/features
    y : np.ndarray (N, 1)
        Training data labels
    x_placeholder : tensorflow.placeholder
        Placeholder for batches of data from `x`.
    y_placeholder : tensorflow.placeholder
        Placeholder for batches of data from `y`.
    batch_size : int, optional
        Number of datapoints to put into a batch.
    seed : int, optional
        Random seed to use during batch generation.
        Defaults to `None`.

    Yields
    ------
    batch_dict : dict
        A dictionary that maps `x_placeholder` and `y_placeholder` to
        `batch_size` sized minibatches of data (numpy.ndarrays) from the
        dataset `x`, `y`.

    Examples
    --------
    Simple batch extraction example:

    >>> import numpy as np
    >>> import tensorflow as tf
    >>> N, D = 100, 3  # 100 datapoints with 3 features each
    >>> x = np.asarray([np.random.uniform(-10, 10, D) for _ in range(N)])
    >>> y = np.asarray([np.random.choice([0., 1.]) for _ in range(N)])
    >>> x.shape, y.shape
    ((100, 3), (100,))

    >>> x_placeholder, y_placeholder = tf.placeholder(dtype=tf.float64), tf.placeholder(dtype=tf.float64)
    >>> batch_size = 20
    >>> gen = generate_batches(x, y, x_placeholder, y_placeholder, batch_size)
    >>> batch_dict = next(gen)  # extract a batch
    >>> set(batch_dict.keys()) == set((x_placeholder, y_placeholder))
    True
    >>> batch_dict[x_placeholder].shape, batch_dict[y_placeholder].shape
    ((20, 3), (20, 1))

    Batch extraction resizes the batch size if the dataset is too small:

    >>> import numpy as np
    >>> import tensorflow as tf
    >>> N, D = 10, 3  # 10 datapoints with 3 features each
    >>> x = np.asarray([np.random.uniform(-10, 10, D) for _ in range(N)])
    >>> y = np.asarray([np.random.choice([0., 1.]) for _ in range(N)])
    >>> x.shape, y.shape
    ((10, 3), (10,))

    >>> x_placeholder, y_placeholder = tf.placeholder(dtype=tf.float64), tf.placeholder(dtype=tf.float64)
    >>> batch_size = 20
    >>> gen = generate_batches(x, y, x_placeholder, y_placeholder, batch_size)
    >>> batch_dict = next(gen)  # extract a batch
    >>> set(batch_dict.keys()) == set((x_placeholder, y_placeholder))
    True
    >>> batch_dict[x_placeholder].shape, batch_dict[y_placeholder].shape
    ((10, 3), (10, 1))

    In this case, the batches contain exactly all datapoints:

    >>> np.allclose(batch_dict[x_placeholder], x), np.allclose(batch_dict[y_placeholder].reshape(N,), y)
    (True, True)

    """
    # Sanitize inputs
    assert isinstance(batch_size, int), \
        "generate_batches: batch size must be an integer."
    assert batch_size > 0, \
        "generate_batches: batch size must be greater than zero."
    assert seed is None or isinstance(seed, int), \
        "generate_batches: seed must be an integer or `None`."
    assert seed is None or (0 <= seed <= 2 ** 32 - 1), \
        "generate_batches: seed must fit into an unsigned 32-bit integer."
    assert y.shape[0] == x.shape[0], "Not exactly one label per datapoint!"

    n_examples = x.shape[0]

    if seed is None:
        seed = np.random.randint(1, 100000)

    rng = np.random.RandomState()
    rng.seed(seed)

    # Check if we have enough datapoints to form a minibatch;
    # otherwise set the batch size equal to the number of input points.
    initial_batch_size = batch_size
    batch_size = min(initial_batch_size, n_examples)

    if initial_batch_size != batch_size:
        logging.error("Not enough datapoints to form a minibatch. "
                      "Batchsize was set to %s", batch_size)

    while True:
        # `rng.randint` is end-exclusive => for n_examples == batch_size,
        # start == 0 holds.
        start = rng.randint(0, (n_examples - batch_size + 1))

        minibatch_x = x[start:start + batch_size]
        minibatch_y = y[start:start + batch_size, None]

        feed_dict = {
            x_placeholder: minibatch_x,
            y_placeholder: minibatch_y.reshape(-1, 1)
        }
        yield feed_dict
def generate_shuffled_batches(x, y, x_placeholder, y_placeholder,
                              batch_size=20, seed=None):
    """ Infinite generator of shuffled random minibatches for a dataset.

    For general reference on (infinite) generators, see:
    https://www.python.org/dev/peps/pep-0255/

    Parameters
    ----------
    x : np.ndarray (N, D)
        Training data points/features
    y : np.ndarray (N, 1)
        Training data labels
    x_placeholder : tensorflow.placeholder
        Placeholder for batches of data from `x`.
    y_placeholder : tensorflow.placeholder
        Placeholder for batches of data from `y`.
    batch_size : int, optional
        Number of datapoints to put into a batch.
    seed : int, optional
        Random seed to use during batch generation (and for shuffling!).
        Defaults to `None`.

    Yields
    ------
    batch_dict : dict
        A dictionary that maps `x_placeholder` and `y_placeholder` to
        `batch_size` sized minibatches of data (numpy.ndarrays) from the
        dataset `x`, `y`.

    Examples
    --------
    Simple shuffled batch extraction example:

    >>> import numpy as np
    >>> import tensorflow as tf
    >>> N, D = 100, 3  # 100 datapoints with 3 features each
    >>> x = np.asarray([np.random.uniform(-10, 10, D) for _ in range(N)])
    >>> y = np.asarray([np.random.choice([0., 1.]) for _ in range(N)])
    >>> x.shape, y.shape
    ((100, 3), (100,))

    >>> x_placeholder, y_placeholder = tf.placeholder(dtype=tf.float64), tf.placeholder(dtype=tf.float64)
    >>> batch_size = 20
    >>> gen = generate_shuffled_batches(x, y, x_placeholder, y_placeholder, batch_size)
    >>> batch_dict = next(gen)  # extract a batch
    >>> set(batch_dict.keys()) == set((x_placeholder, y_placeholder))
    True
    >>> batch_dict[x_placeholder].shape, batch_dict[y_placeholder].shape
    ((20, 3), (20, 1))

    TODO: Demonstrate that shuffled batches are shuffled correctly,
    e.g. that a datapoint still matches its corresponding label
    (see the alignment check sketched after this function).

    """
    # Always use a seed so that `x` and `y` are shuffled in the same way.
    if seed is None:
        seed = np.random.randint(1, 100000)

    rng_x, rng_y = np.random.RandomState(), np.random.RandomState()
    rng_x.seed(seed)
    rng_y.seed(seed)

    for batch in generate_batches(x, y, x_placeholder, y_placeholder,
                                  batch_size, seed):
        # Two identically seeded generators apply the same (in-place)
        # permutation to the rows of `x` and `y`, keeping them aligned.
        rng_x.shuffle(batch[x_placeholder])
        rng_y.shuffle(batch[y_placeholder])
        yield batch
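The docstring's TODO above asks for a demonstration that shuffling preserves the datapoint/label pairing. A numpy-only sketch (an editor's addition): the string keys "x" and "y" stand in for TensorFlow placeholders, which `generate_shuffled_batches` only ever uses as dictionary keys.

import numpy as np

from pysgmcmc.data_batches import generate_shuffled_batches

# Dataset where row `i` of `x` encodes its own label `y[i] = i`, so any
# misalignment between datapoints and labels is immediately visible.
N, D = 100, 3
x = np.repeat(np.arange(N, dtype=np.float64)[:, None], D, axis=1)
y = np.arange(N, dtype=np.float64)

# Plain strings as stand-ins for placeholders (used only as dict keys here).
gen = generate_shuffled_batches(x, y, "x", "y", batch_size=20, seed=1)

batch = next(gen)
# After shuffling, every datapoint must still carry its own label.
# Note: the shuffles operate in place on views of the source arrays,
# so `x` and `y` themselves are permuted (consistently) as batches are drawn.
assert np.allclose(batch["x"][:, 0], batch["y"].ravel())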