Permanently remove several modules from tf.contrib.bayesflow.

These modules are very infrequently used and will not be developed moving forward. Removing this code paves the way for remaining modules in tf.contrib.bayesflow to move to their own repo.

PiperOrigin-RevId: 174110067
2025-12-07 12:20:24 +01:00 · 2017-10-31 16:20:19 -07:00 · 2017-10-31 16:20:19 -07:00 · 2ccf3aba42
commit 2ccf3aba42
parent ef7052fbd9
24 changed files with 11 additions and 3694 deletions
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -3,12 +3,15 @@
 #   particularly useful for Bayesian inference.
 #   APIs here are meant to evolve over time.

+package(default_visibility = [
+    "//learning/brain/contrib/bayesflow:__subpackages__",
+    "//tensorflow:__subpackages__",
+])
+
 licenses(["notice"])  # Apache 2.0

 exports_files(["LICENSE"])

-package(default_visibility = ["//tensorflow:__subpackages__"])
-
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")

 py_library(
@@ -100,44 +103,6 @@ cuda_py_test(
    ],
 )

-cuda_py_test(
-    name = "entropy_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/entropy_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/python/ops/distributions",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn_ops",
-        "//tensorflow/python:variables",
-    ],
-)
-
-cuda_py_test(
-    name = "stochastic_variables_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/stochastic_variables_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-    ],
-)
-
 cuda_py_test(
    name = "monte_carlo_test",
    size = "small",
@@ -180,88 +145,6 @@ cuda_py_test(
    ],
 )

-cuda_py_test(
-    name = "stochastic_graph_test",
-    size = "small",
-    srcs = ["python/kernel_tests/stochastic_graph_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "variational_inference_test",
-    size = "small",
-    srcs = ["python/kernel_tests/variational_inference_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/contrib/layers:layers_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:variables",
-    ],
-)
-
-cuda_py_test(
-    name = "stochastic_tensor_test",
-    size = "small",
-    srcs = ["python/kernel_tests/stochastic_tensor_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
-cuda_py_test(
-    name = "stochastic_gradient_estimators_test",
-    size = "medium",
-    srcs = ["python/kernel_tests/stochastic_gradient_estimators_test.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/distributions:distributions_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:variables",
-    ],
-)
-
-cuda_py_test(
-    name = "reinforce_simple_example",
-    size = "small",
-    srcs = ["examples/reinforce_simple/reinforce_simple_example.py"],
-    additional_deps = [
-        ":bayesflow_py",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 filegroup(
    name = "all_files",
    srcs = glob(
--- a/tensorflow/contrib/bayesflow/__init__.py
+++ b/tensorflow/contrib/bayesflow/__init__.py
@@ -23,15 +23,9 @@ from __future__ import print_function
 # pylint: disable=unused-import,line-too-long
 from tensorflow.contrib.bayesflow.python.ops import csiszar_divergence
 from tensorflow.contrib.bayesflow.python.ops import custom_grad
-from tensorflow.contrib.bayesflow.python.ops import entropy
 from tensorflow.contrib.bayesflow.python.ops import hmc
 from tensorflow.contrib.bayesflow.python.ops import metropolis_hastings
 from tensorflow.contrib.bayesflow.python.ops import monte_carlo
-from tensorflow.contrib.bayesflow.python.ops import stochastic_gradient_estimators
-from tensorflow.contrib.bayesflow.python.ops import stochastic_graph
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor
-from tensorflow.contrib.bayesflow.python.ops import stochastic_variables
-from tensorflow.contrib.bayesflow.python.ops import variational_inference
 # pylint: enable=unused-import,line-too-long

 from tensorflow.python.util.all_util import remove_undocumented
@@ -39,8 +33,6 @@ from tensorflow.python.util.all_util import remove_undocumented

 _allowed_symbols = ['csiszar_divergence', 'custom_grad', 'entropy',
                    'metropolis_hastings', 'monte_carlo', 'hmc', 'special_math',
-                    'stochastic_gradient_estimators', 'stochastic_graph',
-                    'stochastic_tensor', 'stochastic_variables',
-                    'variational_inference']
+                    'stochastic_variables', 'variational_inference']

 remove_undocumented(__name__, _allowed_symbols)
--- a/tensorflow/contrib/bayesflow/examples/reinforce_simple/reinforce_simple_example.py
+++ b/tensorflow/contrib/bayesflow/examples/reinforce_simple/reinforce_simple_example.py
@@ -1,140 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Simple examples of the REINFORCE algorithm."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-
-distributions = tf.contrib.distributions
-sg = tf.contrib.bayesflow.stochastic_graph
-st = tf.contrib.bayesflow.stochastic_tensor
-
-
-def split_apply_merge(inp, partitions, fns):
-  """Split input according to partitions.  Pass results through fns and merge.
-
-  Args:
-    inp: the input vector
-    partitions: tensor of same length as input vector, having values 0, 1
-    fns: the two functions.
-
-  Returns:
-    the vector routed, where routed[i] = fns[partitions[i]](inp[i])
-  """
-  new_inputs = tf.dynamic_partition(inp, partitions, len(fns))
-  new_outputs = [fns[i](x) for i, x in enumerate(new_inputs)]
-  new_indices = tf.dynamic_partition(
-      tf.range(0, inp.get_shape()[0]), partitions, len(fns))
-  return tf.dynamic_stitch(new_indices, new_outputs)
-
-
-def plus_1(inputs):
-  return inputs + 1.0
-
-
-def minus_1(inputs):
-  return inputs - 1.0
-
-
-def build_split_apply_merge_model():
-  """Build the Split-Apply-Merge Model.
-
-  Route each value of input [-1, -1, 1, 1] through one of the
-  functions, plus_1, minus_1.  The decision for routing is made by
-  4 Bernoulli R.V.s whose parameters are determined by a neural network
-  applied to the input.  REINFORCE is used to update the NN parameters.
-
-  Returns:
-    The 3-tuple (route_selection, routing_loss, final_loss), where:
-
-      - route_selection is an int 4-vector
-      - routing_loss is a float 4-vector
-      - final_loss is a float scalar.
-  """
-  inputs = tf.constant([[-1.0], [-1.0], [1.0], [1.0]])
-  targets = tf.constant([[0.0], [0.0], [0.0], [0.0]])
-  paths = [plus_1, minus_1]
-  weights = tf.get_variable("w", [1, 2])
-  bias = tf.get_variable("b", [1, 1])
-  logits = tf.matmul(inputs, weights) + bias
-
-  # REINFORCE forward step
-  route_selection = st.StochasticTensor(
-      distributions.Categorical(logits=logits))
-
-  # Accessing route_selection as a Tensor below forces a sample of
-  # the Categorical distribution based on its logits.
-  # This is equivalent to calling route_selection.value().
-  #
-  # route_selection.value() returns an int32 4-vector with random
-  # values in {0, 1}
-  # COPY+ROUTE+PASTE
-  outputs = split_apply_merge(inputs, route_selection, paths)
-
-  # flatten routing_loss to a row vector (from a column vector)
-  routing_loss = tf.reshape(tf.square(outputs - targets), shape=[-1])
-
-  # Total loss: score function loss + routing loss.
-  # The score function loss (through `route_selection.loss(routing_loss)`)
-  # returns:
-  #  [stop_gradient(routing_loss) *
-  #   route_selection.log_pmf(stop_gradient(route_selection.value()))],
-  # where log_pmf has gradients going all the way back to weights and bias.
-  # In this case, the routing_loss depends on the variables only through
-  # "route_selection", which has a stop_gradient on it.  So the
-  # gradient of the loss really come through the score function
-  surrogate_loss = sg.surrogate_loss([routing_loss])
-  final_loss = tf.reduce_sum(surrogate_loss)
-
-  return (route_selection, routing_loss, final_loss)
-
-
-class REINFORCESimpleExample(tf.test.TestCase):
-
-  def testSplitApplyMerge(self):
-    # Repeatability.  SGD has a tendency to jump around, even here.
-    tf.set_random_seed(1)
-
-    with self.test_session() as sess:
-      # Use sampling to train REINFORCE
-      with st.value_type(st.SampleValue()):
-        (route_selection,
-         routing_loss,
-         final_loss) = build_split_apply_merge_model()
-
-      sgd = tf.train.GradientDescentOptimizer(1.0).minimize(final_loss)
-
-      tf.global_variables_initializer().run()
-
-      for i in range(10):
-        # Run loss and inference step.  This toy problem converges VERY quickly.
-        (routing_loss_v, final_loss_v, route_selection_v, _) = sess.run(
-            [routing_loss, final_loss, tf.identity(route_selection), sgd])
-        print(
-            "Iteration %d, routing loss: %s, final_loss: %s, "
-            "route selection: %s"
-            % (i, routing_loss_v, final_loss_v, route_selection_v))
-
-      self.assertAllEqual([0, 0, 1, 1], route_selection_v)
-      self.assertAllClose([0.0, 0.0, 0.0, 0.0], routing_loss_v)
-      self.assertAllClose(0.0, final_loss_v)
-
-
-if __name__ == "__main__":
-  tf.test.main()
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/entropy_test.py
@@ -1,352 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for Monte Carlo Ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib import layers as layers_lib
-from tensorflow.contrib.bayesflow.python.ops import entropy_impl as entropy
-from tensorflow.contrib.distributions.python.ops import mvn_diag as mvn_diag_lib
-from tensorflow.contrib.distributions.python.ops import mvn_tril as mvn_tril_lib
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.distributions import kullback_leibler as kullback_leibler_lib
-from tensorflow.python.ops.distributions import normal as normal_lib
-from tensorflow.python.ops.distributions import util as distribution_util
-from tensorflow.python.platform import test
-
-layers = layers_lib
-
-
-class NormalNoEntropy(normal_lib.Normal):  # pylint: disable=no-init
-  """Normal distribution without a `.entropy` method."""
-
-  def entropy(self):
-    return NotImplementedError('Entropy removed by gremlins')
-
-
-def get_train_op(scalar_loss, optimizer='SGD', learning_rate=1.0, decay=0.0):
-  global_step = variables.Variable(0)
-
-  def decay_fn(rate, t):
-    return rate * (1 + math_ops.to_float(t))**(-decay)
-
-  train_op = layers.optimize_loss(
-      scalar_loss,
-      global_step,
-      learning_rate,
-      optimizer,
-      learning_rate_decay_fn=decay_fn)
-  return train_op
-
-
-def _assert_monotonic_decreasing(array, atol=1e-5):
-  array = np.asarray(array)
-  _assert_monotonic_increasing(-array, atol=atol)
-
-
-def _assert_monotonic_increasing(array, atol=1e-5):
-  array = np.asarray(array)
-  diff = np.diff(array.ravel())
-  np.testing.assert_array_less(-1 * atol, diff)
-
-
-class ElboRatioTest(test.TestCase):
-  """Show sampling converges to true KL values."""
-
-  def setUp(self):
-    self._rng = np.random.RandomState(0)
-
-  def test_convergence_to_kl_using_sample_form_on_3dim_normal(self):
-    # Test that the sample mean KL is the same as analytic when we use samples
-    # to estimate every part of the KL divergence ratio.
-    vector_shape = (2, 3)
-    n_samples = 5000
-
-    with self.test_session():
-      q = mvn_diag_lib.MultivariateNormalDiag(
-          loc=self._rng.rand(*vector_shape),
-          scale_diag=self._rng.rand(*vector_shape))
-      p = mvn_diag_lib.MultivariateNormalDiag(
-          loc=self._rng.rand(*vector_shape),
-          scale_diag=self._rng.rand(*vector_shape))
-
-      # In this case, the log_ratio is the KL.
-      sample_kl = -1 * entropy.elbo_ratio(
-          log_p=p.log_prob,
-          q=q,
-          n=n_samples,
-          form=entropy.ELBOForms.sample,
-          seed=42)
-      actual_kl = kullback_leibler_lib.kl_divergence(q, p)
-
-      # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
-      # pass.
-      self.assertEqual((2,), sample_kl.get_shape())
-      self.assertAllClose(actual_kl.eval(), sample_kl.eval(), rtol=0.05)
-
-  def test_convergence_to_kl_using_analytic_entropy_form_on_3dim_normal(self):
-    # Test that the sample mean KL is the same as analytic when we use an
-    # analytic entropy combined with sampled cross-entropy.
-    n_samples = 5000
-
-    vector_shape = (2, 3)
-    with self.test_session():
-      q = mvn_diag_lib.MultivariateNormalDiag(
-          loc=self._rng.rand(*vector_shape),
-          scale_diag=self._rng.rand(*vector_shape))
-      p = mvn_diag_lib.MultivariateNormalDiag(
-          loc=self._rng.rand(*vector_shape),
-          scale_diag=self._rng.rand(*vector_shape))
-
-      # In this case, the log_ratio is the KL.
-      sample_kl = -1 * entropy.elbo_ratio(
-          log_p=p.log_prob,
-          q=q,
-          n=n_samples,
-          form=entropy.ELBOForms.analytic_entropy,
-          seed=42)
-      actual_kl = kullback_leibler_lib.kl_divergence(q, p)
-
-      # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
-      # pass.
-      self.assertEqual((2,), sample_kl.get_shape())
-      self.assertAllClose(actual_kl.eval(), sample_kl.eval(), rtol=0.1)
-
-  def test_sample_kl_zero_when_p_and_q_are_the_same_distribution(self):
-    n_samples = 50
-
-    vector_shape = (2, 3)
-    with self.test_session():
-      q = mvn_diag_lib.MultivariateNormalDiag(
-          loc=self._rng.rand(*vector_shape),
-          scale_diag=self._rng.rand(*vector_shape))
-
-      # In this case, the log_ratio is the KL.
-      sample_kl = -1 * entropy.elbo_ratio(
-          log_p=q.log_prob,
-          q=q,
-          n=n_samples,
-          form=entropy.ELBOForms.sample,
-          seed=42)
-
-      self.assertEqual((2,), sample_kl.get_shape())
-      self.assertAllClose(np.zeros(2), sample_kl.eval())
-
-
-class EntropyShannonTest(test.TestCase):
-
-  def test_normal_entropy_default_form_uses_exact_entropy(self):
-    with self.test_session():
-      dist = normal_lib.Normal(loc=1.11, scale=2.22)
-      mc_entropy = entropy.entropy_shannon(dist, n=11)
-      exact_entropy = dist.entropy()
-      self.assertEqual(exact_entropy.get_shape(), mc_entropy.get_shape())
-      self.assertAllClose(exact_entropy.eval(), mc_entropy.eval())
-
-  def test_normal_entropy_analytic_form_uses_exact_entropy(self):
-    with self.test_session():
-      dist = normal_lib.Normal(loc=1.11, scale=2.22)
-      mc_entropy = entropy.entropy_shannon(
-          dist, form=entropy.ELBOForms.analytic_entropy)
-      exact_entropy = dist.entropy()
-      self.assertEqual(exact_entropy.get_shape(), mc_entropy.get_shape())
-      self.assertAllClose(exact_entropy.eval(), mc_entropy.eval())
-
-  def test_normal_entropy_sample_form_gets_approximate_answer(self):
-    # Tested by showing we get a good answer that is not exact.
-    with self.test_session():
-      dist = normal_lib.Normal(loc=1.11, scale=2.22)
-      mc_entropy = entropy.entropy_shannon(
-          dist, n=1000, form=entropy.ELBOForms.sample, seed=0)
-      exact_entropy = dist.entropy()
-
-      self.assertEqual(exact_entropy.get_shape(), mc_entropy.get_shape())
-
-      # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
-      # pass.
-      self.assertAllClose(exact_entropy.eval(), mc_entropy.eval(), rtol=0.01)
-
-      # Make sure there is some error, proving we used samples
-      self.assertLess(0.0001, math_ops.abs(exact_entropy - mc_entropy).eval())
-
-  def test_default_entropy_falls_back_on_sample_if_analytic_not_available(self):
-    # Tested by showing we get a good answer that is not exact.
-    with self.test_session():
-      # NormalNoEntropy is like a Normal, but does not have .entropy method, so
-      # we are forced to fall back on sample entropy.
-      dist_no_entropy = NormalNoEntropy(loc=1.11, scale=2.22)
-      dist_yes_entropy = normal_lib.Normal(loc=1.11, scale=2.22)
-
-      mc_entropy = entropy.entropy_shannon(
-          dist_no_entropy, n=1000, form=entropy.ELBOForms.sample, seed=0)
-      exact_entropy = dist_yes_entropy.entropy()
-
-      self.assertEqual(exact_entropy.get_shape(), mc_entropy.get_shape())
-
-      # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
-      # pass.
-      self.assertAllClose(exact_entropy.eval(), mc_entropy.eval(), rtol=0.01)
-
-      # Make sure there is some error, proving we used samples
-      self.assertLess(0.0001, math_ops.abs(exact_entropy - mc_entropy).eval())
-
-
-class RenyiRatioTest(test.TestCase):
-  """Show renyi_ratio is minimized when the distributions match."""
-
-  def setUp(self):
-    self._rng = np.random.RandomState(0)
-
-  def test_fitting_two_dimensional_normal_n_equals_1000(self):
-    # Minmizing Renyi divergence should allow us to make one normal match
-    # another one exactly.
-    n = 1000
-    mu_true = np.array([1.0, -1.0], dtype=np.float64)
-    chol_true = np.array([[2.0, 0.0], [0.5, 1.0]], dtype=np.float64)
-    with self.test_session() as sess:
-      target = mvn_tril_lib.MultivariateNormalTriL(mu_true, chol_true)
-
-      # Set up q distribution by defining mean/covariance as Variables
-      mu = variables.Variable(
-          np.zeros(mu_true.shape), dtype=mu_true.dtype, name='mu')
-      mat = variables.Variable(
-          np.zeros(chol_true.shape), dtype=chol_true.dtype, name='mat')
-      chol = distribution_util.matrix_diag_transform(
-          mat, transform=nn_ops.softplus)
-      q = mvn_tril_lib.MultivariateNormalTriL(mu, chol)
-      for alpha in [0.25, 0.75]:
-
-        negative_renyi_divergence = entropy.renyi_ratio(
-            log_p=target.log_prob, q=q, n=n, alpha=alpha, seed=0)
-        train_op = get_train_op(
-            math_ops.reduce_mean(-negative_renyi_divergence),
-            optimizer='SGD',
-            learning_rate=0.5,
-            decay=0.1)
-
-        variables.global_variables_initializer().run()
-        renyis = []
-        for step in range(1000):
-          sess.run(train_op)
-          if step in [1, 5, 100]:
-            renyis.append(negative_renyi_divergence.eval())
-
-        # This optimization should maximize the renyi divergence.
-        _assert_monotonic_increasing(renyis, atol=0)
-
-        # Relative tolerance (rtol) chosen 2 times as large as minimim needed to
-        # pass.
-        self.assertAllClose(target.loc.eval(), q.loc.eval(), rtol=0.06)
-        self.assertAllClose(target.scale.to_dense().eval(),
-                            q.scale.to_dense().eval(),
-                            rtol=0.1)
-
-  def test_divergence_between_identical_distributions_is_zero(self):
-    n = 1000
-    vector_shape = (2, 3)
-    with self.test_session():
-      q = mvn_diag_lib.MultivariateNormalDiag(
-          loc=self._rng.rand(*vector_shape),
-          scale_diag=self._rng.rand(*vector_shape))
-      for alpha in [0.25, 0.75]:
-
-        negative_renyi_divergence = entropy.renyi_ratio(
-            log_p=q.log_prob, q=q, n=n, alpha=alpha, seed=0)
-
-        self.assertEqual((2,), negative_renyi_divergence.get_shape())
-        self.assertAllClose(np.zeros(2), negative_renyi_divergence.eval())
-
-
-class RenyiAlphaTest(test.TestCase):
-
-  def test_with_three_alphas(self):
-    with self.test_session():
-      for dtype in (dtypes.float32, dtypes.float64):
-        alpha_min = constant_op.constant(0.0, dtype=dtype)
-        alpha_max = 0.5
-        decay_time = 3
-
-        alpha_0 = entropy.renyi_alpha(
-            0, decay_time, alpha_min=alpha_min, alpha_max=alpha_max)
-        alpha_1 = entropy.renyi_alpha(
-            1, decay_time, alpha_min=alpha_min, alpha_max=alpha_max)
-        alpha_2 = entropy.renyi_alpha(
-            2, decay_time, alpha_min=alpha_min, alpha_max=alpha_max)
-        alpha_3 = entropy.renyi_alpha(
-            3, decay_time, alpha_min=alpha_min, alpha_max=alpha_max)
-
-        # Alpha should start at alpha_max.
-        self.assertAllClose(alpha_max, alpha_0.eval(), atol=1e-5)
-        # Alpha should finish at alpha_min.
-        self.assertAllClose(alpha_min.eval(), alpha_3.eval(), atol=1e-5)
-        # In between, alpha should be monotonically decreasing.
-        _assert_monotonic_decreasing(
-            [alpha_0.eval(), alpha_1.eval(), alpha_2.eval(), alpha_3.eval()])
-
-  def test_non_scalar_input_raises(self):
-    with self.test_session():
-      # Good values here
-      step = 0
-      alpha_min = 0.0
-      alpha_max = 0.5
-      decay_time = 3
-
-      # Use one bad value inside each check.
-      # The "bad" value is always the non-scalar one.
-      with self.assertRaisesRegexp(ValueError, 'must be scalar'):
-        entropy.renyi_alpha(
-            [step], decay_time, alpha_min=alpha_min, alpha_max=alpha_max).eval()
-
-      with self.assertRaisesRegexp(ValueError, 'must be scalar'):
-        entropy.renyi_alpha(
-            step, [decay_time], alpha_min=alpha_min, alpha_max=alpha_max).eval()
-
-      with self.assertRaisesRegexp(ValueError, 'must be scalar'):
-        entropy.renyi_alpha(
-            step, decay_time, alpha_min=[alpha_min], alpha_max=alpha_max).eval()
-
-      with self.assertRaisesRegexp(ValueError, 'must be scalar'):
-        entropy.renyi_alpha(
-            step, decay_time, alpha_min=alpha_min, alpha_max=[alpha_max]).eval()
-
-  def test_input_with_wrong_sign_raises(self):
-    with self.test_session():
-      # Good values here
-      step = 0
-      alpha_min = 0.0
-      alpha_max = 0.5
-      decay_time = 3
-
-      # Use one bad value inside each check.
-      # The "bad" value is always the non-scalar one.
-      with self.assertRaisesOpError('decay_time must be positive'):
-        entropy.renyi_alpha(
-            step, 0.0, alpha_min=alpha_min, alpha_max=alpha_max).eval()
-
-      with self.assertRaisesOpError('step must be non-negative'):
-        entropy.renyi_alpha(
-            -1, decay_time, alpha_min=alpha_min, alpha_max=alpha_max).eval()
-
-
-if __name__ == '__main__':
-  test.main()
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_gradient_estimators_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_gradient_estimators_test.py
@@ -1,206 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for stochastic graphs."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from tensorflow.contrib import distributions
-from tensorflow.contrib.bayesflow.python.ops import stochastic_gradient_estimators
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import gradient_checker
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-st = stochastic_tensor
-sge = stochastic_gradient_estimators
-dists = distributions
-
-
-def _vimco(loss):
-  """Python implementation of VIMCO."""
-  n = loss.shape[0]
-  log_loss = np.log(loss)
-  geometric_mean = []
-  for j in range(n):
-    geometric_mean.append(
-        np.exp(np.mean([log_loss[i, :] for i in range(n) if i != j], 0)))
-  geometric_mean = np.array(geometric_mean)
-
-  learning_signal = []
-  for j in range(n):
-    learning_signal.append(np.sum([loss[i, :] for i in range(n) if i != j], 0))
-  learning_signal = np.array(learning_signal)
-
-  local_learning_signal = np.log(1 / n * (learning_signal + geometric_mean))
-
-  # log_mean - local_learning_signal
-  log_mean = np.log(np.mean(loss, 0))
-  advantage = log_mean - local_learning_signal
-
-  return advantage
-
-
-class StochasticGradientEstimatorsTest(test.TestCase):
-
-  def setUp(self):
-    self._p = constant_op.constant(0.999999)
-    self._final_loss = constant_op.constant(3.2)
-
-  def _testScoreFunction(self, loss_fn, expected):
-    x = st.StochasticTensor(dists.Bernoulli(probs=self._p), loss_fn=loss_fn)
-    sf = x.loss(self._final_loss)
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllClose(*sess.run([expected, sf]))
-
-  def testScoreFunction(self):
-    expected = math_ops.log(self._p) * self._final_loss
-    self._testScoreFunction(sge.score_function, expected)
-
-  def testScoreFunctionWithConstantBaseline(self):
-    b = constant_op.constant(9.8)
-    expected = math_ops.log(self._p) * (self._final_loss - b)
-    self._testScoreFunction(
-        sge.get_score_function_with_constant_baseline(b), expected)
-
-  def testScoreFunctionWithBaselineFn(self):
-    b = constant_op.constant(9.8)
-
-    def baseline_fn(stoch_tensor, loss):
-      self.assertTrue(isinstance(stoch_tensor, st.StochasticTensor))
-      self.assertTrue(isinstance(loss, ops.Tensor))
-      return b
-
-    expected = math_ops.log(self._p) * (self._final_loss - b)
-    self._testScoreFunction(
-        sge.get_score_function_with_baseline(baseline_fn), expected)
-
-  def testScoreFunctionWithMeanBaseline(self):
-    ema_decay = 0.8
-    num_steps = 6
-    x = st.StochasticTensor(
-        dists.Bernoulli(probs=self._p),
-        loss_fn=sge.get_score_function_with_baseline(
-            sge.get_mean_baseline(ema_decay)))
-    sf = x.loss(self._final_loss)
-
-    # Expected EMA value
-    ema = 0.
-    for _ in range(num_steps):
-      ema -= (1. - ema_decay) * (ema - self._final_loss)
-
-    # Baseline is EMA with bias correction
-    bias_correction = 1. - ema_decay**num_steps
-    baseline = ema / bias_correction
-    expected = math_ops.log(self._p) * (self._final_loss - baseline)
-
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      for _ in range(num_steps - 1):
-        sess.run(sf)  # run to update EMA
-      self.assertAllClose(*sess.run([expected, sf]))
-
-  def testScoreFunctionWithAdvantageFn(self):
-    b = constant_op.constant(9.8)
-
-    def advantage_fn(stoch_tensor, loss):
-      self.assertTrue(isinstance(stoch_tensor, st.StochasticTensor))
-      self.assertTrue(isinstance(loss, ops.Tensor))
-      return loss - b
-
-    expected = math_ops.log(self._p) * (self._final_loss - b)
-    self._testScoreFunction(
-        sge.get_score_function_with_advantage(advantage_fn), expected)
-
-  def testVIMCOAdvantageFn(self):
-    # simple_loss: (3, 2) with 3 samples, batch size 2
-    simple_loss = np.array(
-        [[1.0, 1.5],
-         [1e-6, 1e4],
-         [2.0, 3.0]])
-    # random_loss: (100, 50, 64) with 100 samples, batch shape (50, 64)
-    random_loss = 100 * np.random.rand(100, 50, 64)
-
-    advantage_fn = sge.get_vimco_advantage_fn(have_log_loss=False)
-
-    with self.test_session() as sess:
-      for loss in [simple_loss, random_loss]:
-        expected = _vimco(loss)
-        loss_t = constant_op.constant(loss, dtype=dtypes.float32)
-        advantage_t = advantage_fn(None, loss_t)  # ST is not used
-        advantage = sess.run(advantage_t)
-        self.assertEqual(expected.shape, advantage_t.get_shape())
-        self.assertAllClose(expected, advantage, atol=5e-5)
-
-  def testVIMCOAdvantageGradients(self):
-    loss = np.log(
-        [[1.0, 1.5],
-         [1e-6, 1e4],
-         [2.0, 3.0]])
-    advantage_fn = sge.get_vimco_advantage_fn(have_log_loss=True)
-
-    with self.test_session():
-      loss_t = constant_op.constant(loss, dtype=dtypes.float64)
-      advantage_t = advantage_fn(None, loss_t)  # ST is not used
-      gradient_error = gradient_checker.compute_gradient_error(
-          loss_t,
-          loss_t.get_shape().as_list(),
-          advantage_t,
-          advantage_t.get_shape().as_list(),
-          x_init_value=loss)
-      self.assertLess(gradient_error, 1e-3)
-
-  def testVIMCOAdvantageWithSmallProbabilities(self):
-    theta_value = np.random.rand(10, 100000)
-    # Test with float16 dtype to ensure stability even in this extreme case.
-    theta = constant_op.constant(theta_value, dtype=dtypes.float16)
-    advantage_fn = sge.get_vimco_advantage_fn(have_log_loss=True)
-
-    with self.test_session() as sess:
-      log_loss = -math_ops.reduce_sum(theta, [1])
-      advantage_t = advantage_fn(None, log_loss)
-      grad_t = gradients_impl.gradients(advantage_t, theta)[0]
-      advantage, grad = sess.run((advantage_t, grad_t))
-      self.assertTrue(np.all(np.isfinite(advantage)))
-      self.assertTrue(np.all(np.isfinite(grad)))
-
-  def testScoreFunctionWithMeanBaselineHasUniqueVarScope(self):
-    ema_decay = 0.8
-    x = st.StochasticTensor(
-        dists.Bernoulli(probs=self._p),
-        loss_fn=sge.get_score_function_with_baseline(
-            sge.get_mean_baseline(ema_decay)))
-    y = st.StochasticTensor(
-        dists.Bernoulli(probs=self._p),
-        loss_fn=sge.get_score_function_with_baseline(
-            sge.get_mean_baseline(ema_decay)))
-    sf_x = x.loss(self._final_loss)
-    sf_y = y.loss(self._final_loss)
-    with self.test_session() as sess:
-      # Smoke test
-      sess.run(variables.global_variables_initializer())
-      sess.run([sf_x, sf_y])
-
-
-if __name__ == "__main__":
-  test.main()
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_graph_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_graph_test.py
@ -1,246 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for stochastic graphs."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib import distributions as distributions_lib
-from tensorflow.contrib.bayesflow.python.ops import stochastic_graph_impl
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-st = stochastic_tensor
-sg = stochastic_graph_impl
-distributions = distributions_lib
-
-
-class NormalNotParam(distributions.Normal):
-
-  @property
-  def reparameterization_type(self):
-    return distributions.NOT_REPARAMETERIZED
-
-
-class TestSurrogateLosses(test.TestCase):
-
-  def testPathwiseDerivativeDoesNotAddSurrogateLosses(self):
-    with self.test_session():
-      mu = [0.0, 0.1, 0.2]
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      with st.value_type(st.SampleValue()):
-        prior = st.StochasticTensor(distributions.Normal(loc=mu, scale=sigma))
-        likelihood = st.StochasticTensor(
-            distributions.Normal(
-                loc=prior, scale=sigma))
-        self.assertEqual(
-            prior.distribution.reparameterization_type,
-            distributions.FULLY_REPARAMETERIZED)
-        self.assertEqual(
-            likelihood.distribution.reparameterization_type,
-            distributions.FULLY_REPARAMETERIZED)
-
-      loss = math_ops.square(array_ops.identity(likelihood) - [0.0, 0.1, 0.2])
-      sum_loss = math_ops.reduce_sum(loss)
-
-      surrogate_loss = sg.surrogate_loss([loss])
-      with self.assertRaisesRegexp(ValueError, "dimensionality 1 or greater"):
-        _ = sg.surrogate_loss([sum_loss])
-      surrogate_from_both = sg.surrogate_loss(
-          [loss, sum_loss * array_ops.ones_like(loss)])
-
-      # Pathwise derivative terms do not require add'l surrogate loss terms.
-      with self.test_session() as sess:
-        self.assertAllClose(*sess.run([loss, surrogate_loss]))
-        self.assertAllClose(*sess.run([(loss + sum_loss), surrogate_from_both]))
-
-  def _testSurrogateLoss(self, session, losses, expected_addl_terms, xs):
-    surrogate_loss = sg.surrogate_loss(losses)
-    expected_surrogate_loss = math_ops.add_n(losses + expected_addl_terms)
-    self.assertAllClose(*session.run([surrogate_loss, expected_surrogate_loss]))
-
-    # Test backprop
-    expected_grads = gradients_impl.gradients(ys=expected_surrogate_loss, xs=xs)
-    surrogate_grads = gradients_impl.gradients(ys=surrogate_loss, xs=xs)
-    self.assertEqual(len(expected_grads), len(surrogate_grads))
-    grad_values = session.run(expected_grads + surrogate_grads)
-    n_grad = len(expected_grads)
-    self.assertAllClose(grad_values[:n_grad], grad_values[n_grad:])
-
-  def testSurrogateLoss(self):
-    with self.test_session() as sess:
-      mu = constant_op.constant([0.0, 0.1, 0.2])
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      with st.value_type(st.SampleValue()):
-        prior = st.StochasticTensor(NormalNotParam(loc=mu, scale=sigma))
-        likelihood = st.StochasticTensor(NormalNotParam(loc=prior, scale=sigma))
-        prior_2 = st.StochasticTensor(NormalNotParam(loc=mu, scale=sigma))
-
-      loss = math_ops.square(array_ops.identity(likelihood) - mu)
-      part_loss = math_ops.square(array_ops.identity(prior) - mu)
-      sum_loss = math_ops.reduce_sum(loss)
-      loss_nodeps = math_ops.square(array_ops.identity(prior_2) - mu)
-
-      # For ground truth, use the stop-gradient versions of the losses
-      loss_nograd = array_ops.stop_gradient(loss)
-      loss_nodeps_nograd = array_ops.stop_gradient(loss_nodeps)
-      sum_loss_nograd = array_ops.stop_gradient(sum_loss)
-
-      # These score functions should ignore prior_2
-      self._testSurrogateLoss(
-          session=sess,
-          losses=[loss],
-          expected_addl_terms=[
-              likelihood.distribution.log_prob(
-                  likelihood.value()) * loss_nograd,
-              prior.distribution.log_prob(prior.value()) * loss_nograd
-          ],
-          xs=[mu, sigma])
-
-      self._testSurrogateLoss(
-          session=sess,
-          losses=[loss, part_loss],
-          expected_addl_terms=[
-              likelihood.distribution.log_prob(
-                  likelihood.value()) * loss_nograd,
-              (prior.distribution.log_prob(prior.value()) *
-               array_ops.stop_gradient(part_loss + loss))
-          ],
-          xs=[mu, sigma])
-
-      self._testSurrogateLoss(
-          session=sess,
-          losses=[sum_loss * array_ops.ones_like(loss)],
-          expected_addl_terms=[(
-              likelihood.distribution.log_prob(likelihood.value()) *
-              sum_loss_nograd), prior.distribution.log_prob(prior.value()) *
-                               sum_loss_nograd],
-          xs=[mu, sigma])
-
-      self._testSurrogateLoss(
-          session=sess,
-          losses=[loss, sum_loss * array_ops.ones_like(loss)],
-          expected_addl_terms=[(
-              likelihood.distribution.log_prob(likelihood.value()) *
-              array_ops.stop_gradient(loss + sum_loss)),
-                               (prior.distribution.log_prob(prior.value()) *
-                                array_ops.stop_gradient(loss + sum_loss))],
-          xs=[mu, sigma])
-
-      # These score functions should ignore prior and likelihood
-      self._testSurrogateLoss(
-          session=sess,
-          losses=[loss_nodeps],
-          expected_addl_terms=[(prior_2.distribution.log_prob(prior_2.value()) *
-                                loss_nodeps_nograd)],
-          xs=[mu, sigma])
-
-      # These score functions should include all terms selectively
-      self._testSurrogateLoss(
-          session=sess,
-          losses=[loss, loss_nodeps],
-          # We can't guarantee ordering of output losses in this case.
-          expected_addl_terms=[(
-              likelihood.distribution.log_prob(likelihood.value()) *
-              loss_nograd), prior.distribution.log_prob(prior.value()) *
-                               loss_nograd,
-                               (prior_2.distribution.log_prob(prior_2.value()) *
-                                loss_nodeps_nograd)],
-          xs=[mu, sigma])
-
-  def testNoSurrogateLoss(self):
-    with self.test_session():
-      mu = constant_op.constant([0.0, 0.1, 0.2])
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      with st.value_type(st.SampleValue()):
-        dt = st.StochasticTensor(
-            NormalNotParam(
-                loc=mu, scale=sigma), loss_fn=None)
-        self.assertEqual(None, dt.loss(constant_op.constant([2.0])))
-
-  def testExplicitStochasticTensors(self):
-    with self.test_session() as sess:
-      mu = constant_op.constant([0.0, 0.1, 0.2])
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      with st.value_type(st.SampleValue()):
-        dt1 = st.StochasticTensor(NormalNotParam(loc=mu, scale=sigma))
-        dt2 = st.StochasticTensor(NormalNotParam(loc=mu, scale=sigma))
-        loss = math_ops.square(array_ops.identity(dt1)) + 10. + dt2
-
-        sl_all = sg.surrogate_loss([loss])
-        sl_dt1 = sg.surrogate_loss([loss], stochastic_tensors=[dt1])
-        sl_dt2 = sg.surrogate_loss([loss], stochastic_tensors=[dt2])
-
-        dt1_term = dt1.distribution.log_prob(dt1) * loss
-        dt2_term = dt2.distribution.log_prob(dt2) * loss
-
-        self.assertAllClose(*sess.run(
-            [sl_all, sum([loss, dt1_term, dt2_term])]))
-        self.assertAllClose(*sess.run([sl_dt1, sum([loss, dt1_term])]))
-        self.assertAllClose(*sess.run([sl_dt2, sum([loss, dt2_term])]))
-
-
-class StochasticDependenciesMapTest(test.TestCase):
-
-  def testBuildsMapOfUpstreamNodes(self):
-    dt1 = st.StochasticTensor(distributions.Normal(loc=0., scale=1.))
-    dt2 = st.StochasticTensor(distributions.Normal(loc=0., scale=1.))
-    out1 = dt1.value() + 1.
-    out2 = dt2.value() + 2.
-    x = out1 + out2
-    y = out2 * 3.
-    dep_map = sg._stochastic_dependencies_map([x, y])
-    self.assertEqual(dep_map[dt1], set([x]))
-    self.assertEqual(dep_map[dt2], set([x, y]))
-
-  def testHandlesStackedStochasticNodes(self):
-    dt1 = st.StochasticTensor(distributions.Normal(loc=0., scale=1.))
-    out1 = dt1.value() + 1.
-    dt2 = st.StochasticTensor(distributions.Normal(loc=out1, scale=1.))
-    x = dt2.value() + 2.
-    dt3 = st.StochasticTensor(distributions.Normal(loc=0., scale=1.))
-    y = dt3.value() * 3.
-    dep_map = sg._stochastic_dependencies_map([x, y])
-    self.assertEqual(dep_map[dt1], set([x]))
-    self.assertEqual(dep_map[dt2], set([x]))
-    self.assertEqual(dep_map[dt3], set([y]))
-
-  def testTraversesControlInputs(self):
-    dt1 = st.StochasticTensor(distributions.Normal(loc=0., scale=1.))
-    logits = dt1.value() * 3.
-    dt2 = st.StochasticTensor(distributions.Bernoulli(logits=logits))
-    dt3 = st.StochasticTensor(distributions.Normal(loc=0., scale=1.))
-    x = dt3.value()
-    y = array_ops.ones((2, 2)) * 4.
-    z = array_ops.ones((2, 2)) * 3.
-    out = control_flow_ops.cond(
-        math_ops.cast(dt2, dtypes.bool), lambda: math_ops.add(x, y),
-        lambda: math_ops.square(z))
-    out += 5.
-    dep_map = sg._stochastic_dependencies_map([out])
-    self.assertEqual(dep_map[dt1], set([out]))
-    self.assertEqual(dep_map[dt2], set([out]))
-    self.assertEqual(dep_map[dt3], set([out]))
-
-
-if __name__ == "__main__":
-  test.main()
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_tensor_test.py
@ -1,239 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for stochastic graphs."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.bayesflow.python.ops import stochastic_gradient_estimators
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor_impl
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.distributions import normal
-from tensorflow.python.platform import test
-
-sge = stochastic_gradient_estimators
-st = stochastic_tensor_impl
-
-
-class StochasticTensorTest(test.TestCase):
-
-  def testConstructionAndValue(self):
-    with self.test_session() as sess:
-      mu = [0.0, 0.1, 0.2]
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      sigma2 = constant_op.constant([0.1, 0.2, 0.3])
-
-      prior_default = st.StochasticTensor(
-          normal.Normal(loc=mu, scale=sigma))
-      self.assertTrue(isinstance(prior_default.value_type, st.SampleValue))
-      prior_0 = st.StochasticTensor(
-          normal.Normal(loc=mu, scale=sigma),
-          dist_value_type=st.SampleValue())
-      self.assertTrue(isinstance(prior_0.value_type, st.SampleValue))
-
-      with st.value_type(st.SampleValue()):
-        prior = st.StochasticTensor(normal.Normal(loc=mu, scale=sigma))
-        self.assertTrue(isinstance(prior.value_type, st.SampleValue))
-        likelihood = st.StochasticTensor(
-            normal.Normal(loc=prior, scale=sigma2))
-        self.assertTrue(isinstance(likelihood.value_type, st.SampleValue))
-
-      coll = ops.get_collection(st.STOCHASTIC_TENSOR_COLLECTION)
-      self.assertEqual(coll, [prior_default, prior_0, prior, likelihood])
-
-      # Also works: tf.convert_to_tensor(prior)
-      prior_default = array_ops.identity(prior_default)
-      prior_0 = array_ops.identity(prior_0)
-      prior = array_ops.identity(prior)
-      likelihood = array_ops.identity(likelihood)
-
-      # Mostly a smoke test for now...
-      prior_0_val, prior_val, prior_default_val, _ = sess.run(
-          [prior_0, prior, prior_default, likelihood])
-
-      self.assertEqual(prior_0_val.shape, prior_val.shape)
-      self.assertEqual(prior_default_val.shape, prior_val.shape)
-      # These are different random samples from the same distribution,
-      # so the values should differ.
-      self.assertGreater(np.abs(prior_0_val - prior_val).sum(), 1e-6)
-      self.assertGreater(np.abs(prior_default_val - prior_val).sum(), 1e-6)
-
-  def testMeanValue(self):
-    with self.test_session() as sess:
-      mu = [0.0, -1.0, 1.0]
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-
-      with st.value_type(st.MeanValue()):
-        prior = st.StochasticTensor(normal.Normal(loc=mu, scale=sigma))
-        self.assertTrue(isinstance(prior.value_type, st.MeanValue))
-
-      prior_mean = prior.mean()
-      prior_value = prior.value()
-
-      prior_mean_val, prior_value_val = sess.run([prior_mean, prior_value])
-      self.assertAllEqual(prior_mean_val, mu)
-      self.assertAllEqual(prior_mean_val, prior_value_val)
-
-  def testSampleValueScalar(self):
-    with self.test_session() as sess:
-      mu = [[0.0, -1.0, 1.0], [0.0, -1.0, 1.0]]
-      sigma = constant_op.constant([[1.1, 1.2, 1.3], [1.1, 1.2, 1.3]])
-
-      with st.value_type(st.SampleValue()):
-        prior_single = st.StochasticTensor(
-            normal.Normal(loc=mu, scale=sigma))
-
-      prior_single_value = prior_single.value()
-      self.assertEqual(prior_single_value.get_shape(), (2, 3))
-
-      prior_single_value_val = sess.run([prior_single_value])[0]
-      self.assertEqual(prior_single_value_val.shape, (2, 3))
-
-      with st.value_type(st.SampleValue(1)):
-        prior_single = st.StochasticTensor(
-            normal.Normal(loc=mu, scale=sigma))
-        self.assertTrue(isinstance(prior_single.value_type, st.SampleValue))
-
-      prior_single_value = prior_single.value()
-      self.assertEqual(prior_single_value.get_shape(), (1, 2, 3))
-
-      prior_single_value_val = sess.run([prior_single_value])[0]
-      self.assertEqual(prior_single_value_val.shape, (1, 2, 3))
-
-      with st.value_type(st.SampleValue(2)):
-        prior_double = st.StochasticTensor(
-            normal.Normal(loc=mu, scale=sigma))
-
-      prior_double_value = prior_double.value()
-      self.assertEqual(prior_double_value.get_shape(), (2, 2, 3))
-
-      prior_double_value_val = sess.run([prior_double_value])[0]
-      self.assertEqual(prior_double_value_val.shape, (2, 2, 3))
-
-  def testDistributionEntropy(self):
-    with self.test_session() as sess:
-      mu = [0.0, -1.0, 1.0]
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      with st.value_type(st.MeanValue()):
-        prior = st.StochasticTensor(normal.Normal(loc=mu, scale=sigma))
-        entropy = prior.entropy()
-        deep_entropy = prior.distribution.entropy()
-        expected_deep_entropy = normal.Normal(
-            loc=mu, scale=sigma).entropy()
-        entropies = sess.run([entropy, deep_entropy, expected_deep_entropy])
-        self.assertAllEqual(entropies[2], entropies[0])
-        self.assertAllEqual(entropies[1], entropies[0])
-
-  def testSurrogateLoss(self):
-    with self.test_session():
-      mu = [[3.0, -4.0, 5.0], [6.0, -7.0, 8.0]]
-      sigma = constant_op.constant(1.0)
-
-      # With default
-      with st.value_type(st.MeanValue(stop_gradient=True)):
-        dt = st.StochasticTensor(normal.Normal(loc=mu, scale=sigma))
-      loss = dt.loss([constant_op.constant(2.0)])
-      self.assertTrue(loss is not None)
-      self.assertAllClose(
-          dt.distribution.log_prob(mu).eval() * 2.0, loss.eval())
-
-      # With passed-in loss_fn.
-      dt = st.StochasticTensor(
-          normal.Normal(loc=mu, scale=sigma),
-          dist_value_type=st.MeanValue(stop_gradient=True),
-          loss_fn=sge.get_score_function_with_constant_baseline(
-              baseline=constant_op.constant(8.0)))
-      loss = dt.loss([constant_op.constant(2.0)])
-      self.assertTrue(loss is not None)
-      self.assertAllClose((dt.distribution.log_prob(mu) * (2.0 - 8.0)).eval(),
-                          loss.eval())
-
-
-class ValueTypeTest(test.TestCase):
-
-  def testValueType(self):
-    type_mean = st.MeanValue()
-    type_reshape = st.SampleValue()
-    type_full = st.SampleValue()
-    with st.value_type(type_mean):
-      self.assertEqual(st.get_current_value_type(), type_mean)
-      with st.value_type(type_reshape):
-        self.assertEqual(st.get_current_value_type(), type_reshape)
-      with st.value_type(type_full):
-        self.assertEqual(st.get_current_value_type(), type_full)
-      self.assertEqual(st.get_current_value_type(), type_mean)
-    with self.assertRaisesRegexp(ValueError, "No value type currently set"):
-      st.get_current_value_type()
-
-
-class ObservedStochasticTensorTest(test.TestCase):
-
-  def testConstructionAndValue(self):
-    with self.test_session() as sess:
-      mu = [0.0, 0.1, 0.2]
-      sigma = constant_op.constant([1.1, 1.2, 1.3])
-      obs = array_ops.zeros((2, 3))
-      z = st.ObservedStochasticTensor(
-          normal.Normal(loc=mu, scale=sigma), value=obs)
-      [obs_val, z_val] = sess.run([obs, z.value()])
-      self.assertAllEqual(obs_val, z_val)
-
-      coll = ops.get_collection(st.STOCHASTIC_TENSOR_COLLECTION)
-      self.assertEqual(coll, [z])
-
-  def testConstructionWithUnknownShapes(self):
-    mu = array_ops.placeholder(dtypes.float32)
-    sigma = array_ops.placeholder(dtypes.float32)
-    obs = array_ops.placeholder(dtypes.float32)
-    z = st.ObservedStochasticTensor(
-        normal.Normal(loc=mu, scale=sigma), value=obs)
-
-    mu2 = array_ops.placeholder(dtypes.float32, shape=[None])
-    sigma2 = array_ops.placeholder(dtypes.float32, shape=[None])
-    obs2 = array_ops.placeholder(dtypes.float32, shape=[None, None])
-    z2 = st.ObservedStochasticTensor(
-        normal.Normal(loc=mu2, scale=sigma2), value=obs2)
-
-    coll = ops.get_collection(st.STOCHASTIC_TENSOR_COLLECTION)
-    self.assertEqual(coll, [z, z2])
-
-  def testConstructionErrors(self):
-    mu = [0., 0.]
-    sigma = [1., 1.]
-    self.assertRaises(
-        ValueError,
-        st.ObservedStochasticTensor,
-        normal.Normal(loc=mu, scale=sigma),
-        value=array_ops.zeros((3,)))
-    self.assertRaises(
-        ValueError,
-        st.ObservedStochasticTensor,
-        normal.Normal(loc=mu, scale=sigma),
-        value=array_ops.zeros((3, 1)))
-    self.assertRaises(
-        ValueError,
-        st.ObservedStochasticTensor,
-        normal.Normal(loc=mu, scale=sigma),
-        value=array_ops.zeros((1, 2), dtype=dtypes.int32))
-
-
-if __name__ == "__main__":
-  test.main()
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/stochastic_variables_test.py
@ -1,168 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for stochastic graphs."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from tensorflow.contrib import distributions
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor
-from tensorflow.contrib.bayesflow.python.ops import stochastic_variables
-from tensorflow.contrib.bayesflow.python.ops import variational_inference_impl
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-sv = stochastic_variables
-st = stochastic_tensor
-vi = variational_inference_impl
-dist = distributions
-
-
-class StochasticVariablesTest(test.TestCase):
-
-  def testStochasticVariables(self):
-    shape = (10, 20)
-    with variable_scope.variable_scope(
-        "stochastic_variables",
-        custom_getter=sv.make_stochastic_variable_getter(
-            dist_cls=dist.NormalWithSoftplusScale)):
-      v = variable_scope.get_variable("sv", shape)
-
-    self.assertTrue(isinstance(v, st.StochasticTensor))
-    self.assertTrue(isinstance(v.distribution, dist.NormalWithSoftplusScale))
-
-    self.assertEqual(
-        {"stochastic_variables/sv_loc", "stochastic_variables/sv_scale"},
-        set([v.op.name for v in variables.global_variables()]))
-    self.assertEqual(
-        set(variables.trainable_variables()), set(variables.global_variables()))
-
-    v = ops.convert_to_tensor(v)
-    self.assertEqual(list(shape), v.get_shape().as_list())
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertEqual(shape, sess.run(v).shape)
-
-  def testStochasticVariablesWithConstantInitializer(self):
-    shape = (10, 20)
-    with variable_scope.variable_scope(
-        "stochastic_variables",
-        custom_getter=sv.make_stochastic_variable_getter(
-            dist_cls=dist.NormalWithSoftplusScale,
-            dist_kwargs={"validate_args": True},
-            param_initializers={
-                "loc": np.ones(shape) * 4.,
-                "scale": np.ones(shape) * 2.
-            })):
-      v = variable_scope.get_variable("sv")
-
-    for var in variables.global_variables():
-      if "loc" in var.name:
-        mu_var = var
-      if "scale" in var.name:
-        sigma_var = var
-
-    v = ops.convert_to_tensor(v)
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual(np.ones(shape) * 4., sess.run(mu_var))
-      self.assertAllEqual(np.ones(shape) * 2., sess.run(sigma_var))
-      self.assertEqual(shape, sess.run(v).shape)
-
-  def testStochasticVariablesWithCallableInitializer(self):
-    shape = (10, 20)
-
-    def sigma_init(shape, dtype, partition_info):
-      _ = partition_info
-      return array_ops.ones(shape, dtype=dtype) * 2.
-
-    with variable_scope.variable_scope(
-        "stochastic_variables",
-        custom_getter=sv.make_stochastic_variable_getter(
-            dist_cls=dist.NormalWithSoftplusScale,
-            dist_kwargs={"validate_args": True},
-            param_initializers={
-                "loc": np.ones(
-                    shape, dtype=np.float32) * 4.,
-                "scale": sigma_init
-            })):
-      v = variable_scope.get_variable("sv", shape)
-
-    for var in variables.global_variables():
-      if "loc" in var.name:
-        mu_var = var
-      if "scale" in var.name:
-        sigma_var = var
-
-    v = ops.convert_to_tensor(v)
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual(np.ones(shape) * 4., sess.run(mu_var))
-      self.assertAllEqual(np.ones(shape) * 2., sess.run(sigma_var))
-      self.assertEqual(shape, sess.run(v).shape)
-
-  def testStochasticVariablesWithPrior(self):
-    shape = (10, 20)
-    prior = dist.Normal(0., 1.)
-    with variable_scope.variable_scope(
-        "stochastic_variables",
-        custom_getter=sv.make_stochastic_variable_getter(
-            dist_cls=dist.NormalWithSoftplusScale, prior=prior)):
-      w = variable_scope.get_variable("weights", shape)
-
-    x = random_ops.random_uniform((8, 10))
-    y = math_ops.matmul(x, w)
-
-    prior_map = vi._find_variational_and_priors(y, None)
-    self.assertEqual(prior_map[w], prior)
-    elbo = vi.elbo(y, keep_batch_dim=False)
-
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      sess.run(elbo)
-
-  def testStochasticVariablesWithCallablePriorInitializer(self):
-
-    def prior_init(shape, dtype):
-      return dist.Normal(
-          array_ops.zeros(shape, dtype), array_ops.ones(shape, dtype))
-
-    with variable_scope.variable_scope(
-        "stochastic_variables",
-        custom_getter=sv.make_stochastic_variable_getter(
-            dist_cls=dist.NormalWithSoftplusScale, prior=prior_init)):
-      w = variable_scope.get_variable("weights", (10, 20))
-
-    x = random_ops.random_uniform((8, 10))
-    y = math_ops.matmul(x, w)
-
-    prior_map = vi._find_variational_and_priors(y, None)
-    self.assertTrue(isinstance(prior_map[w], dist.Normal))
-    elbo = vi.elbo(y, keep_batch_dim=False)
-
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      sess.run(elbo)
-
-
-if __name__ == "__main__":
-  test.main()
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/variational_inference_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/variational_inference_test.py
@ -1,146 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for variational inference."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib import distributions as distributions_lib
-from tensorflow.contrib import layers
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor
-from tensorflow.contrib.bayesflow.python.ops import variational_inference_impl
-from tensorflow.python.framework import constant_op
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.ops.distributions import kullback_leibler
-from tensorflow.python.ops.distributions import normal
-from tensorflow.python.platform import test
-
-st = stochastic_tensor
-vi = variational_inference_impl
-distributions = distributions_lib
-
-
-class NormalNoEntropy(distributions.Normal):
-
-  def entropy(self):
-    raise NotImplementedError("entropy not implemented")
-
-
-# For mini-VAE
-def inference_net(x, latent_size):
-  return layers.linear(x, latent_size)
-
-
-def generative_net(z, data_size):
-  return layers.linear(z, data_size)
-
-
-def mini_vae():
-  x = [[-6., 3., 6.], [-8., 4., 8.]]
-  prior = distributions.Normal(loc=0., scale=1.)
-  variational = st.StochasticTensor(
-      distributions.Normal(
-          loc=inference_net(x, 1), scale=1.))
-  vi.register_prior(variational, prior)
-  px = distributions.Normal(loc=generative_net(variational, 3), scale=1.)
-  log_likelihood = math_ops.reduce_sum(px.log_prob(x), 1)
-  log_likelihood = array_ops.expand_dims(log_likelihood, -1)
-  return x, prior, variational, px, log_likelihood
-
-
-class VariationalInferenceTest(test.TestCase):
-
-  def testDefaultVariationalAndPrior(self):
-    _, prior, variational, _, log_likelihood = mini_vae()
-    elbo = vi.elbo(log_likelihood)
-    expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
-        variational.distribution, prior)
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual(*sess.run([expected_elbo, elbo]))
-
-  def testExplicitVariationalAndPrior(self):
-    with self.test_session() as sess:
-      _, _, variational, _, log_likelihood = mini_vae()
-      prior = normal.Normal(loc=3., scale=2.)
-      elbo = vi.elbo(
-          log_likelihood, variational_with_prior={variational: prior})
-      expected_elbo = log_likelihood - kullback_leibler.kl_divergence(
-          variational.distribution, prior)
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual(*sess.run([expected_elbo, elbo]))
-
-  def testExplicitForms(self):
-    _, prior, variational, _, log_likelihood = mini_vae()
-
-    elbos = []
-    forms = vi.ELBOForms
-    for form in [
-        forms.default, forms.analytic_kl, forms.sample, forms.analytic_entropy
-    ]:
-      elbo = vi.elbo(
-          log_likelihood=log_likelihood,
-          variational_with_prior={variational: prior},
-          form=form)
-      elbos.append(elbo)
-
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      log_likelihood_shape = array_ops.shape(log_likelihood).eval()
-      for elbo in elbos:
-        elbo.eval()
-        elbo_shape = array_ops.shape(elbo).eval()
-        self.assertAllEqual(log_likelihood_shape, elbo_shape)
-        self.assertEqual(elbo.dtype, log_likelihood.dtype)
-
-  def testDefaultsSampleKLWithoutAnalyticKLOrEntropy(self):
-    x = constant_op.constant([[-6., 3., 6.]])
-
-    prior = distributions.Bernoulli(0.5)
-    variational = st.StochasticTensor(
-        NormalNoEntropy(
-            loc=inference_net(x, 1), scale=1.))
-    vi.register_prior(variational, prior)
-    px = distributions.Normal(loc=generative_net(variational, 3), scale=1.)
-    log_likelihood = math_ops.reduce_sum(px.log_prob(x), 1)
-
-    # No analytic KL available between prior and variational distributions.
-    with self.assertRaisesRegexp(NotImplementedError, "No KL"):
-      distributions.kl_divergence(variational.distribution, prior)
-
-    elbo = vi.elbo(
-        variational_with_prior={variational: prior},
-        log_likelihood=log_likelihood)
-    expected_elbo = log_likelihood + prior.log_prob(
-        variational) - variational.distribution.log_prob(variational)
-
-    with self.test_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual(*sess.run([expected_elbo, elbo]))
-
-  def testElboWithLogJoint(self):
-    with self.test_session() as sess:
-      _, prior, variational, _, log_likelihood = mini_vae()
-      log_joint = log_likelihood + prior.log_prob(variational)
-      elbo = vi.elbo_with_log_joint(log_joint)
-      sess.run(variables.global_variables_initializer())
-      elbo.eval()
-
-
-if __name__ == "__main__":
-  test.main()
--- a/tensorflow/contrib/bayesflow/python/ops/entropy.py
+++ b/tensorflow/contrib/bayesflow/python/ops/entropy.py
@ -1,31 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Support for Entropy Ops. See ${python/contrib.bayesflow.entropy}."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.entropy_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [  # Names this wrapper module keeps as its public API.
-    'ELBOForms', 'elbo_ratio', 'entropy_shannon', 'renyi_ratio', 'renyi_alpha'
-]
-
-remove_undocumented(__name__, _allowed_symbols)  # Presumably strips all other names; confirm.
--- a/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/entropy_impl.py
@ -1,386 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Support for Entropy Ops. See ${python/contrib.bayesflow.entropy}.
-
-@@elbo_ratio
-@@entropy_shannon
-@@renyi_ratio
-@@renyi_alpha
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-
-from tensorflow.contrib.bayesflow.python.ops import monte_carlo_impl as monte_carlo
-from tensorflow.contrib.bayesflow.python.ops import variational_inference
-from tensorflow.contrib.bayesflow.python.ops.monte_carlo_impl import _get_samples as get_samples
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import tf_logging as logging
-
-
-# Re-bind private monte_carlo_impl helpers under module-local private names.
-# pylint: disable=protected-access
-_get_samples = get_samples
-_logspace_mean = monte_carlo._logspace_mean
-_sample_mean = monte_carlo._sample_mean
-
-# pylint: enable=protected-access
-
-__all__ = [  # NOTE(review): omits ELBOForms, so `import *` will not export it.
-    'elbo_ratio',
-    'entropy_shannon',
-    'renyi_ratio',
-    'renyi_alpha',
-]
-
-ELBOForms = variational_inference.ELBOForms  # pylint: disable=invalid-name
-
-
-def elbo_ratio(log_p,
-               q,
-               z=None,
-               n=None,
-               seed=None,
-               form=None,
-               name='elbo_ratio'):
-  r"""Estimate of the ratio appearing in the `ELBO` and `KL` divergence.
-
-  With `p(z) := exp{log_p(z)}`, this `Op` returns an approximation of
-
-  ```
-  E_q[ Log[p(Z) / q(Z)] ]
-  ```
-
-  The term `E_q[ Log[p(Z)] ]` is always computed as a sample mean.
-  The term `E_q[ Log[q(z)] ]` can be computed with samples, or an exact formula
-  if `q.entropy()` is defined.  This is controlled with the kwarg `form`.
-
-  This log-ratio appears in different contexts:
-
-  #### `KL[q || p]`
-
-  If `log_p(z) = Log[p(z)]` for distribution `p`, this `Op` approximates
-  the negative Kullback-Leibler divergence.
-
-  ```
-  elbo_ratio(log_p, q, n=100) \approx -1 * KL[q || p],
-  KL[q || p] = E[ Log[q(Z)] - Log[p(Z)] ]
-  ```
-
-  Note that if `p` is a `Distribution`, then
-  `distributions.kl_divergence(q, p)` may be defined and available as an
-  exact result.
-
-  #### ELBO
-
-  If `log_p(z) = Log[p(z, x)]` is the log joint of a distribution `p`, this is
-  the Evidence Lower BOund (ELBO):
-
-  ```
-  ELBO ~= E[ Log[p(Z, x)] - Log[q(Z)] ]
-        = Log[p(x)] - KL[q || p]
-       <= Log[p(x)]
-  ```
-
-  User supplies either `Tensor` of samples `z`, or number of samples to draw `n`.
-
-  Args:
-    log_p:  Callable mapping samples from `q` to `Tensors` with
-      shape broadcastable to `q.batch_shape`.
-      For example, `log_p` works "just like" `q.log_prob`.
-    q:  `tf.contrib.distributions.Distribution`.
-    z:  `Tensor` of samples from `q`, produced by `q.sample(n)` for some `n`.
-    n:  Integer `Tensor`.  Number of samples to generate if `z` is not provided.
-    seed:  Python integer to seed the random number generator.
-    form:  Either `ELBOForms.analytic_entropy` (use formula for entropy of `q`)
-      or `ELBOForms.sample` (sample estimate of entropy), or `ELBOForms.default`
-      (attempt analytic entropy, fallback on sample).
-      Default value is `ELBOForms.default`.
-    name:  A name to give this `Op`.
-
-  Returns:
-    `Tensor` estimating `E_q[ Log[p(Z) / q(Z)] ]` (the entropy of `q` plus the
-      sample mean of `log_p(z)`).  Shape is the batch shape of `q`, `dtype` as `q`.
-
-  Raises:
-    ValueError:  If `form` is not handled by this function.
-  """
-  form = ELBOForms.default if form is None else form
-
-  with ops.name_scope(name, values=[n, z]):
-    z = _get_samples(q, z, n, seed)
-
-    entropy = entropy_shannon(q, z=z, form=form)
-
-    # If log_p(z) = Log[p(z)], cross entropy = -E_q[log(p(Z))]
-    negative_cross_entropy = _sample_mean(log_p(z))
-
-    return entropy + negative_cross_entropy
-
-
-def entropy_shannon(p,
-                    z=None,
-                    n=None,
-                    seed=None,
-                    form=None,
-                    name='entropy_shannon'):
-  r"""Monte Carlo or deterministic computation of Shannon's entropy.
-
-  Depending on the kwarg `form`, this `Op` returns either the analytic entropy
-  of the distribution `p`, or the sampled entropy:
-
-  ```
-  -n^{-1} sum_{i=1}^n p.log_prob(z_i),  where z_i ~ p,
-      \approx - E_p[ Log[p(Z)] ]
-      = Entropy[p]
-  ```
-
-  User supplies either `Tensor` of samples `z`, or number of samples to draw `n`.
-
-  Args:
-    p:  `tf.contrib.distributions.Distribution`
-    z:  `Tensor` of samples from `p`, produced by `p.sample(n)` for some `n`.
-    n:  Integer `Tensor`.  Number of samples to generate if `z` is not provided.
-    seed:  Python integer to seed the random number generator.
-    form:  Either `ELBOForms.analytic_entropy` (use formula for entropy of `p`)
-      or `ELBOForms.sample` (sample estimate of entropy), or `ELBOForms.default`
-      (attempt analytic entropy, fallback on sample).
-      Default value is `ELBOForms.default`.
-    name:  A name to give this `Op`.
-
-  Returns:
-    A `Tensor` with same `dtype` as `p`, and shape equal to `p.batch_shape`.
-
-  Raises:
-    ValueError:  If `form` is not handled, or is `analytic_entropy` with `n` set.
-    NotImplementedError:  If `form` is `analytic_entropy` and `p.entropy()` raises it.
-  """
-  form = ELBOForms.default if form is None else form
-
-  if n is not None and form == ELBOForms.analytic_entropy:
-    raise ValueError('If form == ELBOForms.analytic_entropy, n must be None.')
-
-  with ops.name_scope(name, values=[n, z]):
-    # Entropy: -E_p[log(p(Z))].
-    entropy = None
-
-    # Analytic path; under `default` a NotImplementedError is swallowed.
-    if form in [ELBOForms.default, ELBOForms.analytic_entropy]:
-      try:
-        entropy = p.entropy()
-        logging.info('Using analytic entropy(p:%s)', p)
-      except NotImplementedError as e:
-        if form == ELBOForms.analytic_entropy:
-          raise e
-    elif form != ELBOForms.sample:
-      raise ValueError('ELBOForm not handled by this function: %s' % form)
-
-    # Sample path: form is `sample`, or `default` without analytic entropy.
-    if entropy is None:
-      logging.info('Using sampled entropy(p:%s)', p)
-      if z is None:
-        z = p.sample(n, seed=seed)
-      entropy = -monte_carlo.expectation(p.log_prob, z)
-
-    return entropy
-
-
-def renyi_ratio(log_p, q, alpha, z=None, n=None, seed=None, name='renyi_ratio'):
-  r"""Monte Carlo estimate of the ratio appearing in Renyi divergence.
-
-  This can be used to compute the Renyi (alpha) divergence, or a log evidence
-  approximation based on Renyi divergence.
-
-  #### Definition
-
-  With `z_i` iid samples from `q`, and `exp{log_p(z)} = p(z)`, this `Op` returns
-  the (biased for finite `n`) estimate:
-
-  ```
-  (1 - alpha)^{-1} Log[ n^{-1} sum_{i=1}^n ( p(z_i) / q(z_i) )^{1 - alpha} ],
-  \approx (1 - alpha)^{-1} Log[ E_q[ (p(Z) / q(Z))^{1 - alpha} ]  ]
-  ```
-
-  This ratio appears in different contexts:
-
-  #### Renyi divergence
-
-  If `log_p(z) = Log[p(z)]` is the log prob of a distribution, and
-  `alpha > 0`, `alpha != 1`, this `Op` approximates `-1` times Renyi divergence:
-
-  ```
-  # Choose reasonably high n to limit bias, see below.
-  renyi_ratio(log_p, q, alpha, n=100)
-                  \approx -1 * D_alpha[q || p],  where
-  D_alpha[q || p] := (1 - alpha)^{-1} Log E_q[(p(Z) / q(Z))^{1 - alpha}]
-  ```
-
-  The Renyi (or "alpha") divergence is non-negative and equal to zero iff
-  `q = p`.  Various limits of `alpha` lead to different special case results:
-
-  ```
-  alpha       D_alpha[q || p]
-  -----       ---------------
-  --> 0       Log[ int_{q > 0} p(z) dz ]
-  = 0.5,      -2 Log[1 - Hel^2[q || p]],  (\propto squared Hellinger distance)
-  --> 1       KL[q || p]
-  = 2         Log[ 1 + chi^2[q || p] ],   (\propto squared Chi-2 divergence)
-  --> infty   Log[ max_z{q(z) / p(z)} ],  (min description length principle).
-  ```
-
-  See "Renyi Divergence Variational Inference", by Li and Turner.
-
-  #### Log evidence approximation
-
-  If `log_p(z) = Log[p(z, x)]` is the log of the joint distribution `p`, this is
-  an alternative to the ELBO common in variational inference.
-
-  ```
-  L_alpha(q, p) = Log[p(x)] - D_alpha[q || p]
-  ```
-
-  If `q` and `p` have the same support, and `0 < a <= b < 1`, one can show
-  `ELBO <= L_b <= L_a <= Log[p(x)]`.  Thus, this `Op` allows a smooth
-  interpolation between the ELBO and the true evidence.
-
-  #### Stability notes
-
-  Note that when `1 - alpha` is not small, the ratio `(p(z) / q(z))^{1 - alpha}`
-  is subject to underflow/overflow issues.  For that reason, it is evaluated in
-  log-space after centering.  Nonetheless, infinite/NaN results may occur.  For
-  that reason, one may wish to shrink `alpha` gradually.  See the `Op`
-  `renyi_alpha`.  Using `float64` will also help.
-
-
-  #### Bias for finite sample size
-
-  Due to nonlinearity of the logarithm, for random variables `{X_1,...,X_n}`,
-  `E[ Log[sum_{i=1}^n X_i] ] != Log[ E[sum_{i=1}^n X_i] ]`.  As a result, this
-  estimate is biased for finite `n`.  For `alpha < 1`, it is non-decreasing
-  with `n` (in expectation).  For example, if `n = 1`, this estimator yields the
-  same result as `elbo_ratio`, and as `n` increases the expected value
-  of the estimator increases.
-
-  #### Call signature
-
-  User supplies either `Tensor` of samples `z`, or number of samples to draw `n`.
-
-  Args:
-    log_p:  Callable mapping samples from `q` to `Tensors` with
-      shape broadcastable to `q.batch_shape`.
-      For example, `log_p` works "just like" `q.log_prob`.
-    q: `tf.contrib.distributions.Distribution`.
-       `float64` `dtype` recommended.
-       `log_p` and `q` should be supported on the same set.
-    alpha:  `Tensor` with shape `q.batch_shape` and values not equal to 1.
-    z:  `Tensor` of samples from `q`, produced by `q.sample(n)` for some `n`.
-    n:  Integer `Tensor`.  The number of samples to use if `z` is not provided.
-      Note that this can be highly biased for small `n`, see docstring.
-    seed:  Python integer to seed the random number generator.
-    name:  A name to give this `Op`.
-
-  Returns:
-    renyi_result:  The scaled log of sample mean.  `Tensor` with `shape` equal
-      to batch shape of `q`, and `dtype` = `q.dtype`.
-  """
-  with ops.name_scope(name, values=[alpha, n, z]):
-    z = _get_samples(q, z, n, seed)
-
-    # Evaluate sample mean in logspace.  Note that _logspace_mean will compute
-    # (among other things) the mean of q.log_prob(z), which could also be
-    # obtained with q.entropy().  However, DON'T use analytic entropy, because
-    # that increases variance, and could result in NaN/Inf values of a sensitive
-    # term.
-
-    # log_values
-    # = (1 - alpha) * ( Log p - Log q )
-    log_values = (1. - alpha) * (log_p(z) - q.log_prob(z))
-
-    # log_mean_values
-    # = Log[ E[ values ] ]
-    # = Log[ E[ (p / q)^{1-alpha} ] ]
-    log_mean_values = _logspace_mean(log_values)
-
-    return log_mean_values / (1. - alpha)
-
-
-def renyi_alpha(step,
-                decay_time,
-                alpha_min,
-                alpha_max=0.99999,
-                name='renyi_alpha'):
-  r"""Exponentially decaying `Tensor` appropriate for Renyi ratios.
-
-  When minimizing the Renyi divergence for `0 <= alpha < 1` (or maximizing the
-  Renyi equivalent of elbo) in high dimensions, it is not uncommon to experience
-  `NaN` and `inf` values when `alpha` is far from `1`.
-
-  For that reason, it is often desirable to start the optimization with `alpha`
-  very close to 1, and reduce it to a final `alpha_min` according to some
-  schedule.  The user may even want to optimize using `elbo_ratio` for
-  some fixed time before switching to Renyi based methods.
-
-  This `Op` returns an `alpha` decaying exponentially with step:
-
-  ```
-  s(step) = (exp{step / decay_time} - 1) / (e - 1)
-  t(s) = max(0, min(s, 1)),  (smooth growth from 0 to 1)
-  alpha(t) = (1 - t) alpha_max + t alpha_min
-  ```
-
-  Args:
-    step:  Non-negative scalar `Tensor`.  Typically the global step or an
-      offset version thereof.
-    decay_time:  Positive scalar `Tensor`.
-    alpha_min:  `float` or `double` `Tensor`.
-      The minimal, final value of `alpha`, achieved when `step >= decay_time`.
-    alpha_max:  `Tensor` of same `dtype` as `alpha_min`.
-      The maximal, beginning value of `alpha`, achieved when `step == 0`.
-    name:  A name to give this `Op`.
-
-  Returns:
-    alpha:  A `Tensor` of same `dtype` as `alpha_min`.
-  """
-  with ops.name_scope(name, values=[step, decay_time, alpha_min, alpha_max]):
-    alpha_min = ops.convert_to_tensor(alpha_min, name='alpha_min')
-    dtype = alpha_min.dtype
-
-    alpha_max = ops.convert_to_tensor(alpha_max, dtype=dtype, name='alpha_max')
-    decay_time = math_ops.cast(decay_time, dtype)
-    step = math_ops.cast(step, dtype)
-
-    check_scalars = [
-        check_ops.assert_rank(step, 0, message='step must be scalar'),
-        check_ops.assert_rank(
-            decay_time, 0, message='decay_time must be scalar'),
-        check_ops.assert_rank(alpha_min, 0, message='alpha_min must be scalar'),
-        check_ops.assert_rank(alpha_max, 0, message='alpha_max must be scalar'),
-    ]
-    check_sign = [
-        check_ops.assert_non_negative(
-            step, message='step must be non-negative'),
-        check_ops.assert_positive(
-            decay_time, message='decay_time must be positive'),
-    ]
-
-    with ops.control_dependencies(check_scalars + check_sign):
-      theta = (math_ops.exp(step / decay_time) - 1.) / (math.e - 1.)
-      theta = math_ops.minimum(math_ops.maximum(theta, 0.), 1.)
-      return alpha_max * (1. - theta) + alpha_min * theta
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_gradient_estimators.py
+++ b/tensorflow/contrib/bayesflow/python/ops/stochastic_gradient_estimators.py
@ -1,317 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Stochastic gradient estimators.
-
-These functions are meant to be used in conjunction with `StochasticTensor`
-(`loss_fn` parameter) and `surrogate_loss`.
-
-See Gradient Estimation Using Stochastic Computation Graphs
-(http://arxiv.org/abs/1506.05254) by Schulman et al., eq. 1 and section 4, for
-mathematical details.
-
-## Score function estimator
-
-The score function is an unbiased estimator of the gradient of `E_p(x)[f(x)]`,
-where `f(x)` can be considered to be a "loss" term. It is computed as
-`E_p(x)[f(x) grad(log p(x))]`. A constant `b`, referred to here as the
-"baseline", can be subtracted from `f(x)` without affecting the expectation. The
-term `(f(x) - b)` is referred to here as the "advantage".
-
-Note that the methods defined in this module actually compute the integrand of
-the score function, such that when taking the gradient, the true score function
-is computed.
-
-@@score_function
-@@get_score_function_with_baseline
-@@get_score_function_with_constant_baseline
-@@get_score_function_with_advantage
-
-## Baseline functions
-
-Baselines reduce the variance of Monte Carlo estimate of an expectation. The
-baseline for a stochastic node can be a function of all non-influenced nodes
-(see section 4 of Schulman et al., linked above). Baselines are also known as
-"control variates."
-
-In the context of a MC estimate of `E_p(x)[f(x) - b]`, baseline functions have
-the signature `(st, fx) => Tensor`, where `st` is a `StochasticTensor` backed by
-the distribution `p(x)` and `fx` is the influenced loss.
-
-@@get_mean_baseline
-
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.training import training
-from tensorflow.python.util.all_util import make_all
-
-
-def score_function(stochastic_tensor, value, loss, baseline=None,
-                   name="ScoreFunction"):
-  """Score function estimator.
-
-  Computes the integrand of the score function with a baseline:
-  `p.log_prob(value) * (loss - baseline)`.
-
-  It will add a `stop_gradient` to the advantage `(loss - baseline)`.
-
-  Args:
-    stochastic_tensor: `StochasticTensor` p(x).
-    value: `Tensor` x. Samples from p(x).
-    loss: `Tensor`.
-    baseline: `Tensor` broadcastable to `loss`, or `None` to use `loss` as is.
-    name: name to prepend ops with.
-
-  Returns:
-    `Tensor` `p.log_prob(x) * (loss - b)`. Taking the gradient yields the score
-    function estimator.
-  """
-  with ops.name_scope(name, values=[value, loss, baseline]):
-    value = ops.convert_to_tensor(value)
-    loss = ops.convert_to_tensor(loss)
-    if baseline is not None:
-      baseline = ops.convert_to_tensor(baseline)
-      advantage = loss - baseline
-    else:
-      advantage = loss
-
-    advantage = array_ops.stop_gradient(advantage)
-    return stochastic_tensor.distribution.log_prob(value) * advantage
-
-
-def get_score_function_with_advantage(advantage_fn=None,
-                                      name="ScoreFunctionWithAdvantage"):
-  """Score function estimator with advantage function.
-
-  Args:
-    advantage_fn: callable that takes the `StochasticTensor` and the
-      downstream `loss` and returns a `Tensor` advantage (e.g. `loss -
-      baseline`).  Required in practice: the `None` default is never handled.
-    name: name to prepend ops with.
-
-  Returns:
-    Callable score function estimator that takes the `StochasticTensor`, the
-    sampled `value`, and the downstream `loss`, and uses the provided advantage.
-  """
-
-  def score_function_with_advantage(stochastic_tensor, value, loss):
-    with ops.name_scope(name, values=[value, loss]):
-      advantage = advantage_fn(stochastic_tensor, loss)
-      advantage = array_ops.stop_gradient(advantage)
-      return stochastic_tensor.distribution.log_prob(value) * advantage
-
-  return score_function_with_advantage
-
-
-def get_score_function_with_constant_baseline(baseline, name="ScoreFunction"):
-  """Score function estimator with constant baseline.
-
-  Args:
-    baseline: `Tensor` to be subtracted from loss; captured by the closure.
-    name: name to prepend ops with.
-
-  Returns:
-    Callable score function estimator that takes the `StochasticTensor`, the
-    sampled `value`, and the downstream `loss`, and subtracts the provided
-    `baseline` from the `loss`.
-  """
-
-  def score_function_with_constant_baseline(stochastic_tensor, value, loss):
-    return score_function(stochastic_tensor, value, loss, baseline, name)
-
-  return score_function_with_constant_baseline
-
-
-def get_score_function_with_baseline(baseline_fn=None, name="ScoreFunction"):
-  """Score function estimator with baseline function.
-
-  Args:
-    baseline_fn: callable that takes the `StochasticTensor` and the downstream
-      `loss` and returns a `Tensor` baseline to be subtracted from the `loss`.
-      If None, defaults to `get_mean_baseline()`, which is an EMA of the loss.
-    name: name scope wrapping the baseline and the inner `score_function` ops.
-
-  Returns:
-    Callable score function estimator that takes the `StochasticTensor`, the
-    sampled `value`, and the downstream `loss`, and subtracts the provided
-    `baseline` from the `loss`.
-  """
-  if baseline_fn is None:
-    baseline_fn = get_mean_baseline()
-
-  def score_function_with_baseline(stochastic_tensor, value, loss):
-    with ops.name_scope(name):
-      b = baseline_fn(stochastic_tensor, loss)
-      return score_function(stochastic_tensor, value, loss, b)
-
-  return score_function_with_baseline
-
-
-def get_mean_baseline(ema_decay=0.99, name=None):
-  """ExponentialMovingAverage baseline.
-
-  Args:
-    ema_decay: decay rate for the ExponentialMovingAverage.
-    name: name for variable scope of the EMA; defaults to "MeanBaseline".
-
-  Returns:
-    Callable baseline function that takes the `StochasticTensor` (unused) and
-    the downstream `loss`, and returns an EMA of the mean-reduced loss.
-  """
-
-  def mean_baseline(_, loss):
-    with vs.variable_scope(name, default_name="MeanBaseline"):
-      reduced_loss = math_ops.reduce_mean(loss)
-
-      ema = training.ExponentialMovingAverage(decay=ema_decay, zero_debias=True)
-      update_op = ema.apply([reduced_loss])
-
-      with ops.control_dependencies([update_op]):
-        # Using `identity` causes an op to be added in this context, which
-        # triggers the update. Removing the `identity` means nothing is updated.
-        baseline = array_ops.identity(ema.average(reduced_loss))
-
-      return baseline
-
-  return mean_baseline
-
-
-def get_vimco_advantage_fn(have_log_loss=False):
-  """VIMCO (Variational Inference for Monte Carlo Objectives) advantage.
-
-  Implements VIMCO baseline from the article of the same name:
-
-  https://arxiv.org/pdf/1602.06725v2.pdf
-
-  Given a `loss` tensor (containing non-negative probabilities or ratios),
-  calculates the VIMCO advantage via Eq. 9 of the above paper.
-
-  The tensor `loss` should be shaped `[n, ...]`, with rank at least 1.  Here,
-  the first axis is considered the single sampling dimension and `n` must
-  be at least 2.  Specifically, the `StochasticTensor` is assumed to have
-  used the `SampleValue(n)` value type with `n > 1`.
-
-  Args:
-    have_log_loss: Python `Boolean`.  If `True`, the loss is assumed to be the
-      log loss.  If `False` (the default), it is assumed to be a nonnegative
-      probability or probability ratio.
-
-  Returns:
-    Callable advantage function that takes the `StochasticTensor` (unused) and
-    the downstream `loss`, and returns the VIMCO advantage for the loss.
-  """
-  def vimco_advantage_fn(_, loss, name=None):
-    """Internal VIMCO function.
-
-    Args:
-      _: ignored `StochasticTensor`.
-      loss: The loss `Tensor`.
-      name: Python string, the name scope to use.
-
-    Returns:
-      The advantage `Tensor`.
-    """
-    with ops.name_scope(name, "VIMCOAdvantage", values=[loss]):
-      loss = ops.convert_to_tensor(loss)
-      loss_shape = loss.get_shape()
-      loss_num_elements = loss_shape[0].value
-      n = math_ops.cast(
-          loss_num_elements or array_ops.shape(loss)[0], dtype=loss.dtype)
-
-      if have_log_loss:
-        log_loss = loss
-      else:
-        log_loss = math_ops.log(loss)
-
-      # Calculate L_hat, Eq. (4) -- stably
-      log_mean = math_ops.reduce_logsumexp(log_loss, [0]) - math_ops.log(n)
-
-      # expand_dims: Expand shape [a, b, c] to [a, 1, b, c]
-      log_loss_expanded = array_ops.expand_dims(log_loss, [1])
-
-      # subtract (in log space): log_loss_sub with shape [a, a, b, c], where
-      #
-      #  log_loss_sub[i] = log_loss - log_loss[i]
-      #
-      #       = [ log_loss[j] - log_loss[i] for rows j = 0 ... i - 1     ]
-      #         [ zeros                                                  ]
-      #         [ log_loss[j] - log_loss[i] for rows j = i + 1 ... a - 1 ]
-      #
-      log_loss_sub = log_loss - log_loss_expanded
-
-      # reduce_sum: Sums each row across all the sub[i]'s; result is:
-      #   reduce_sum[j] = (n - 1) * log_loss[j] - (sum_{i != j} log_loss[i])
-      # divide by (n - 1) to get:
-      #   geometric_reduction[j] =
-      #     log_loss[j] - (sum_{i != j} log_loss[i]) / (n - 1)
-      geometric_reduction = math_ops.reduce_sum(log_loss_sub, [0]) / (n - 1)
-
-      # subtract this from the original log_loss to get the baseline:
-      #   geometric_mean[j] = exp((sum_{i != j} log_loss[i]) / (n - 1))
-      log_geometric_mean = log_loss - geometric_reduction
-
-      ## Equation (9)
-
-      # Calculate sum_{i != j} loss[i] -- via exp(reduce_logsumexp(.))
-      # reduce_logsumexp: log-sum-exp each row across all the
-      # -sub[i]'s, result is:
-      #
-      #  exp(reduce_logsumexp[j]) =
-      #    1 + sum_{i != j} exp(log_loss[i] - log_loss[j])
-      log_local_learning_reduction = math_ops.reduce_logsumexp(
-          -log_loss_sub, [0])
-
-      # convert local_learning_reduction to the sum-exp of the log-sum-exp
-      #  (local_learning_reduction[j] - 1) * exp(log_loss[j])
-      #    = sum_{i != j} exp(log_loss[i])
-      local_learning_log_sum = (
-          _logexpm1(log_local_learning_reduction) + log_loss)
-
-      # Add (logaddexp) the local learning signals (Eq. 9)
-      local_learning_signal = (
-          math_ops.reduce_logsumexp(
-              array_ops.stack((local_learning_log_sum, log_geometric_mean)),
-              [0])
-          - math_ops.log(n))
-
-      advantage = log_mean - local_learning_signal
-
-      return advantage
-
-  return vimco_advantage_fn
-
-
-def _logexpm1(x):
-  """Stably calculate log(exp(x)-1)."""
-  with ops.name_scope("logsumexp1"):  # NOTE(review): scope name != function name.
-    eps = np.finfo(x.dtype.as_numpy_dtype).eps
-    # Choose a small offset that makes gradient calculations stable for
-    # float16, float32, and float64.
-    safe_log = lambda y: math_ops.log(y + eps / 1e8)  # For gradient stability
-    return array_ops.where(
-        math_ops.abs(x) < eps,
-        safe_log(x) + x/2 + x*x/24,  # small x approximation to log(expm1(x))
-        safe_log(math_ops.exp(x) - 1))
-
-
-__all__ = make_all(__name__)  # Presumably built from the docstring's `@@` names; confirm.
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_graph.py
+++ b/tensorflow/contrib/bayesflow/python/ops/stochastic_graph.py
@ -1,37 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Support for Stochastic Computation Graphs.
-
-See the @{$python/contrib.bayesflow.stochastic_graph} guide.
-
-@@surrogate_loss
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.stochastic_graph_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-
-_allowed_symbols = [  # Names this wrapper module keeps as its public API.
-    "surrogate_loss"
-]
-
-remove_undocumented(__name__, _allowed_symbols)  # Presumably strips all other names; confirm.
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_graph_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/stochastic_graph_impl.py
@ -1,175 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Classes and helper functions for Stochastic Computation Graphs.
-
-## Stochastic Computation Graph Helper Functions
-
-@@surrogate_loss
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor_impl
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import tf_logging as logging
-
-
-def _upstream_stochastic_nodes(tensors):
-  """Map tensors to the stochastic tensors upstream of them.
-
-  Args:
-    tensors: a list of Tensors.
-
-  Returns:
-    A dict that maps the tensors passed in to the `StochasticTensor` objects
-    upstream of them.
-  """
-  reverse_map = _stochastic_dependencies_map(tensors)
-  upstream = collections.defaultdict(set)
-  for st, ts in reverse_map.items():
-    for t in ts:
-      upstream[t].add(st)
-  return upstream
-
-
-def _stochastic_dependencies_map(fixed_losses, stochastic_tensors=None):
-  """Map stochastic tensors to the fixed losses that depend on them.
-
-  Args:
-    fixed_losses: a list of `Tensor`s.
-    stochastic_tensors: a list of `StochasticTensor`s to map to fixed losses.
-      If `None`, all `StochasticTensor`s in the graph will be used.
-
-  Returns:
-    A dict `dependencies` that maps `StochasticTensor` objects to subsets of
-    `fixed_losses`.
-
-    If `loss in dependencies[st]`, for some `loss` in `fixed_losses` then there
-    is a direct path from `st.value()` to `loss` in the graph.
-  """
-  stoch_value_collection = stochastic_tensors or ops.get_collection(
-      stochastic_tensor_impl.STOCHASTIC_TENSOR_COLLECTION)
-
-  if not stoch_value_collection:
-    return {}
-
-  stoch_value_map = dict(
-      (node.value(), node) for node in stoch_value_collection)
-
-  # Step backwards through the graph to see which surrogate losses correspond
-  # to which fixed_losses.
-  #
-  # TODO(ebrevdo): Ensure that fixed_losses and stochastic values are in the
-  # same frame.
-  stoch_dependencies_map = collections.defaultdict(set)
-  for loss in fixed_losses:
-    boundary = set([loss])
-    while boundary:
-      edge = boundary.pop()
-      edge_stoch_node = stoch_value_map.get(edge, None)
-      if edge_stoch_node:
-        stoch_dependencies_map[edge_stoch_node].add(loss)
-      boundary.update(edge.op.inputs)
-
-  return stoch_dependencies_map
-
-
-def surrogate_loss(sample_losses,
-                   stochastic_tensors=None,
-                   name="SurrogateLoss"):
-  """Surrogate loss for stochastic graphs.
-
-  This function will call `loss` on each `StochasticTensor`
-  upstream of `sample_losses`, passing the losses that it influenced.
-
-  Note that currently `surrogate_loss` does not work with `StochasticTensor`s
-  instantiated in `while_loop`s or other control structures.
-
-  Args:
-    sample_losses: a list or tuple of final losses. Each loss should be per
-      example in the batch (and possibly per sample); that is, it should have
-      dimensionality of 1 or greater. All losses should have the same shape.
-    stochastic_tensors: a list of `StochasticTensor`s to add loss terms for.
-      If None, defaults to all `StochasticTensor`s in the graph upstream of
-      the `Tensor`s in `sample_losses`.
-    name: the name to prepend to created ops.
-
-  Returns:
-    `Tensor` loss, which is the sum of `sample_losses` and the
-    loss terms returned by the `StochasticTensor`s' `loss` methods.
-
-  Raises:
-    TypeError: if `sample_losses` is not a list or tuple, or if its elements
-      are not `Tensor`s.
-    ValueError: if any loss in `sample_losses` does not have dimensionality 1
-      or greater.
-  """
-  with ops.name_scope(name, values=sample_losses):
-    if not isinstance(sample_losses, (list, tuple)):
-      raise TypeError("sample_losses must be a list or tuple")
-    for loss in sample_losses:
-      if not isinstance(loss, ops.Tensor):
-        raise TypeError("loss is not a Tensor: %s" % loss)
-      ndims = loss.get_shape().ndims
-      if not (ndims is not None and ndims >= 1):
-        raise ValueError("loss must have dimensionality 1 or greater: %s" %
-                         loss)
-
-    stoch_dependencies_map = _stochastic_dependencies_map(
-        sample_losses, stochastic_tensors=stochastic_tensors)
-    if not stoch_dependencies_map:
-      logging.warn(
-          "No collection of Stochastic Tensors found for current graph.")
-      return math_ops.add_n(sample_losses)
-
-    # Iterate through all of the stochastic dependencies, adding
-    # surrogate terms where necessary.
-    sample_losses = [ops.convert_to_tensor(loss) for loss in sample_losses]
-    loss_terms = sample_losses
-    for (stoch_node, dependent_losses) in stoch_dependencies_map.items():
-      dependent_losses = list(dependent_losses)
-
-      logging.info("Losses influenced by StochasticTensor %s: [%s]",
-                   stoch_node.name, ", ".join(
-                       [loss.name for loss in dependent_losses]))
-
-      # Sum up the downstream losses for this ST
-      influenced_loss = _add_n_or_sum(dependent_losses)
-
-      # Compute surrogate loss term
-      loss_term = stoch_node.loss(array_ops.stop_gradient(influenced_loss))
-      if loss_term is not None:
-        loss_terms.append(loss_term)
-
-    return _add_n_or_sum(loss_terms)
-
-
-def _add_n_or_sum(terms):
-  # add_n works for Tensors of the same dtype and shape
-  shape = terms[0].get_shape()
-  dtype = terms[0].dtype
-
-  if all(term.get_shape().is_fully_defined() and
-         term.get_shape().is_compatible_with(shape) and term.dtype == dtype
-         for term in terms):
-    return math_ops.add_n(terms)
-  else:
-    return sum(terms)
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor.py
+++ b/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor.py
@ -1,48 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Support for creating Stochastic Tensors.
-
-See the @{$python/contrib.bayesflow.stochastic_tensor} guide.
-
-@@BaseStochasticTensor
-@@StochasticTensor
-@@MeanValue
-@@SampleValue
-@@value_type
-@@get_current_value_type
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.stochastic_tensor_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-
-_allowed_symbols = [
-    "BaseStochasticTensor",
-    "StochasticTensor",
-    "ObservedStochasticTensor",
-    "MeanValue",
-    "SampleValue",
-    "value_type",
-    "get_current_value_type",
-]
-
-remove_undocumented(__name__, _allowed_symbols)
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/stochastic_tensor_impl.py
@ -1,477 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Classes and helper functions for creating Stochastic Tensors.
-
-`StochasticTensor` objects wrap `Distribution` objects.  Their
-values may be samples from the underlying distribution, or the distribution
-mean (as governed by `value_type`).  These objects provide a `loss`
-method for use when sampling from a non-reparameterized distribution.
-The `loss` method is used in conjunction with `stochastic_graph.surrogate_loss`
-to produce a single differentiable loss in stochastic graphs having
-both continuous and discrete stochastic nodes.
-
-## Stochastic Tensor Classes
-
-@@BaseStochasticTensor
-@@StochasticTensor
-
-## Stochastic Tensor Value Types
-
-@@MeanValue
-@@SampleValue
-
-@@value_type
-@@get_current_value_type
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-import collections
-import contextlib
-import threading
-
-import six
-
-from tensorflow.contrib.bayesflow.python.ops import stochastic_gradient_estimators as sge
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.distributions import distribution
-
-STOCHASTIC_TENSOR_COLLECTION = "_stochastic_tensor_collection_"
-
-
-@six.add_metaclass(abc.ABCMeta)
-class BaseStochasticTensor(object):
-  """Base Class for Tensor-like objects that emit stochastic values."""
-
-  def __init__(self):
-    # Add self to this graph's Stochastic Tensor collection for
-    # purposes of later performing correct surrogate loss calculation.
-    ops.add_to_collection(STOCHASTIC_TENSOR_COLLECTION, self)
-
-  @abc.abstractproperty
-  def name(self):
-    pass
-
-  @abc.abstractproperty
-  def dtype(self):
-    pass
-
-  @abc.abstractproperty
-  def graph(self):
-    pass
-
-  @abc.abstractmethod
-  def value(self, name=None):
-    pass
-
-  @abc.abstractmethod
-  def loss(self, sample_loss):
-    """Returns the term to add to the surrogate loss.
-
-    This method is called by `surrogate_loss`.  The input `sample_loss` should
-    have already had `stop_gradient` applied to it.  This is because the
-    surrogate_loss usually provides a Monte Carlo sample term of the form
-    `differentiable_surrogate * sample_loss` where `sample_loss` is considered
-    constant with respect to the input for purposes of the gradient.
-
-    Args:
-      sample_loss: `Tensor`, sample loss downstream of this `StochasticTensor`.
-
-    Returns:
-      Either `None` or a `Tensor`.
-    """
-    raise NotImplementedError("surrogate_loss not implemented")
-
-  @staticmethod
-  def _tensor_conversion_function(v, dtype=None, name=None, as_ref=False):
-    _ = name
-    if dtype and not dtype.is_compatible_with(v.dtype):
-      raise ValueError(
-          "Incompatible type conversion requested to type '%s' for variable "
-          "of type '%s'" % (dtype.name, v.dtype.name))
-    if as_ref:
-      raise ValueError("%s: Ref type is not supported." % v)
-    return v.value()
-
-
-# pylint: disable=protected-access
-ops.register_tensor_conversion_function(
-    BaseStochasticTensor, BaseStochasticTensor._tensor_conversion_function)
-
-# pylint: enable=protected-access
-
-
-class _StochasticValueType(object):
-  """Interface for the ValueType classes.
-
-  This is the base class for MeanValue, SampleValue, and their descendants.
-  """
-
-  def pushed_above(self, unused_value_type):
-    pass
-
-  def popped_above(self, unused_value_type):
-    pass
-
-  def declare_inputs(self, unused_stochastic_tensor, unused_inputs_dict):
-    pass
-
-  @abc.abstractproperty
-  def stop_gradient(self):
-    """Whether the value should be wrapped in stop_gradient.
-
-    StochasticTensors must respect this property.
-    """
-    pass
-
-
-class MeanValue(_StochasticValueType):
-
-  def __init__(self, stop_gradient=False):
-    self._stop_gradient = stop_gradient
-
-  @property
-  def stop_gradient(self):
-    return self._stop_gradient
-
-
-class SampleValue(_StochasticValueType):
-  """Draw samples, possibly adding new outer dimensions along the way.
-
-  This ValueType draws samples from StochasticTensors run within its
-  context, increasing the rank according to the requested shape.
-
-  Examples:
-
-  ```python
-  mu = tf.zeros((2,3))
-  sigma = tf.ones((2, 3))
-  with sg.value_type(sg.SampleValue()):
-    st = sg.StochasticTensor(
-      tf.contrib.distributions.Normal(mu=mu, sigma=sigma))
-  # draws 1 sample and does not reshape
-  assertEqual(st.value().get_shape(), (2, 3))
-  ```
-
-  ```python
-  mu = tf.zeros((2,3))
-  sigma = tf.ones((2, 3))
-  with sg.value_type(sg.SampleValue(4)):
-    st = sg.StochasticTensor(
-      tf.contrib.distributions.Normal(mu=mu, sigma=sigma))
-  # draws 4 samples each with shape (2, 3) and concatenates
-  assertEqual(st.value().get_shape(), (4, 2, 3))
-  ```
-  """
-
-  def __init__(self, shape=(), stop_gradient=False):
-    """Sample according to shape.
-
-    For the given StochasticTensor `st` using this value type,
-    the shape of `st.value()` will match that of
-    `st.distribution.sample(shape)`.
-
-    Args:
-      shape: A shape tuple or int32 tensor.  The sample shape.
-        Default is a scalar: take one sample and do not change the size.
-      stop_gradient: If `True`, StochasticTensors' values are wrapped in
-        `stop_gradient`, to avoid backpropagating through them.
-    """
-    self._shape = shape
-    self._stop_gradient = stop_gradient
-
-  @property
-  def shape(self):
-    return self._shape
-
-  @property
-  def stop_gradient(self):
-    return self._stop_gradient
-
-
-# Keeps track of how a StochasticTensor's value should be accessed.
-# Used by value_type and get_current_value_type below.
-_STOCHASTIC_VALUE_STACK = collections.defaultdict(list)
-
-
-@contextlib.contextmanager
-def value_type(dist_value_type):
-  """Creates a value type context for any StochasticTensor created within.
-
-  Typical usage:
-
-  ```
-  with sg.value_type(sg.MeanValue(stop_gradient=True)):
-    st = sg.StochasticTensor(
-        tf.contrib.distributions.Normal(mu=mu, sigma=sigma))
-  ```
-
-  In the example above, `st.value()` (or equivalently, `tf.identity(st)`) will
-  be the mean value of the Normal distribution, i.e., `mu` (possibly
-  broadcast to the shape of `sigma`).  Furthermore, because the `MeanValue`
-  was marked with `stop_gradient=True`, this value will have been wrapped
-  in a `stop_gradient` call to disable any possible backpropagation.
-
-  Args:
-    dist_value_type: An instance of `MeanValue`, `SampleValue`, or
-      any other stochastic value type.
-
-  Yields:
-    A context for `StochasticTensor` objects that controls the
-    value created when they are initialized.
-
-  Raises:
-    TypeError: if `dist_value_type` is not an instance of a stochastic value
-      type.
-  """
-  if not isinstance(dist_value_type, _StochasticValueType):
-    raise TypeError("dist_value_type must be a Distribution Value Type")
-  thread_id = threading.current_thread().ident
-  stack = _STOCHASTIC_VALUE_STACK[thread_id]
-  if stack:
-    stack[-1].pushed_above(dist_value_type)
-  stack.append(dist_value_type)
-  yield
-  stack.pop()
-  if stack:
-    stack[-1].popped_above(dist_value_type)
-
-
-class NoValueTypeSetError(ValueError):
-  pass
-
-
-def get_current_value_type():
-  thread_id = threading.current_thread().ident
-  if not _STOCHASTIC_VALUE_STACK[thread_id]:
-    raise NoValueTypeSetError(
-        "No value type currently set for this thread (%s).  Did you forget to "
-        "wrap 'with stochastic_graph.value_type(...)'?" % thread_id)
-  return _STOCHASTIC_VALUE_STACK[thread_id][-1]
-
-
-class StochasticTensor(BaseStochasticTensor):
-  """StochasticTensor is a BaseStochasticTensor backed by a distribution."""
-
-  def __init__(self,
-               dist,
-               name="StochasticTensor",
-               dist_value_type=None,
-               loss_fn=sge.score_function):
-    """Construct a `StochasticTensor`.
-
-    `StochasticTensor` is backed by the `dist` distribution and its `value`
-    method will return the same value each time it is called. What `value` is
-    returned is controlled by the `dist_value_type` (defaults to
-    `SampleValue`).
-
-    Some distributions' sample functions are not differentiable (e.g. a sample
-    from a discrete distribution like a Bernoulli) and so to differentiate
-    wrt parameters upstream of the sample requires a gradient estimator like
-    the score function estimator. This is accomplished by passing a
-    differentiable `loss_fn` to the `StochasticTensor`, which
-    defaults to a function whose derivative is the score function estimator.
-    Calling `stochastic_graph.surrogate_loss(final_losses)` will call
-    `loss()` on every `StochasticTensor` upstream of final losses.
-
-    `loss()` will return None for `StochasticTensor`s backed by
-    reparameterized distributions; it will also return None if the value type is
-    `MeanValue` or if `loss_fn=None`.
-
-    Args:
-      dist: an instance of `Distribution`.
-      name: a name for this `StochasticTensor` and its ops.
-      dist_value_type: a `_StochasticValueType`, which will determine what the
-          `value` of this `StochasticTensor` will be. If not provided, the
-          value type set with the `value_type` context manager will be used.
-      loss_fn: callable that takes
-          `(st, st.value(), influenced_loss)`, where
-          `st` is this `StochasticTensor`, and returns a `Tensor` loss. By
-          default, `loss_fn` is the `score_function`, or more precisely, the
-          integral of the score function, such that when the gradient is taken,
-          the score function results. See the `stochastic_gradient_estimators`
-          module for additional loss functions and baselines.
-
-    Raises:
-      TypeError: if `dist` is not an instance of `Distribution`.
-      TypeError: if `loss_fn` is not `callable`.
-    """
-    if not isinstance(dist, distribution.Distribution):
-      raise TypeError("dist must be an instance of Distribution")
-    if dist_value_type is None:
-      try:
-        self._value_type = get_current_value_type()
-      except NoValueTypeSetError:
-        self._value_type = SampleValue()
-    else:
-      # We want to enforce a value type here, but use the value_type()
-      # context manager to enforce some error checking.
-      with value_type(dist_value_type):
-        self._value_type = get_current_value_type()
-
-    if loss_fn is not None and not callable(loss_fn):
-      raise TypeError("loss_fn must be callable")
-    self._loss_fn = loss_fn
-
-    with ops.name_scope(name) as scope:
-      self._name = scope
-      self._dist = dist
-      self._value = self._create_value()
-
-    super(StochasticTensor, self).__init__()
-
-  @property
-  def value_type(self):
-    return self._value_type
-
-  @property
-  def distribution(self):
-    return self._dist
-
-  def _create_value(self):
-    """Create the value Tensor based on the value type, store as self._value."""
-
-    if isinstance(self._value_type, MeanValue):
-      value_tensor = self._dist.mean()
-    elif isinstance(self._value_type, SampleValue):
-      value_tensor = self._dist.sample(self._value_type.shape)
-    else:
-      raise TypeError("Unrecognized Distribution Value Type: %s",
-                      self._value_type)
-
-    if self._value_type.stop_gradient:
-      # stop_gradient is being enforced by the value type
-      return array_ops.stop_gradient(value_tensor)
-
-    if isinstance(self._value_type, MeanValue):
-      return value_tensor  # Using pathwise-derivative for this one.
-    if self._dist.reparameterization_type == distribution.FULLY_REPARAMETERIZED:
-      return value_tensor  # Using pathwise-derivative for this one.
-    else:
-      # Will have to perform some variant of score function
-      # estimation.  Call stop_gradient on the sampler just in case we
-      # may accidentally leak some gradient from it.
-      return array_ops.stop_gradient(value_tensor)
-
-  @property
-  def name(self):
-    return self._name
-
-  @property
-  def graph(self):
-    return self._value.graph
-
-  @property
-  def dtype(self):
-    return self._dist.dtype
-
-  def entropy(self, name="entropy"):
-    return self._dist.entropy(name=name)
-
-  def mean(self, name="mean"):
-    return self._dist.mean(name=name)
-
-  def value(self, name="value"):
-    return self._value
-
-  def loss(self, final_loss, name="Loss"):
-    # Return a loss based on final_loss and the distribution. Returns
-    # None if pathwise derivatives are supported, if the loss_fn
-    # was explicitly set to None, or if the value type is MeanValue.
-    if self._loss_fn is None:
-      return None
-
-    if (self._dist.reparameterization_type == distribution.FULLY_REPARAMETERIZED
-        and not self._value_type.stop_gradient):
-      # Can perform pathwise-derivative on this one; no additional loss needed.
-      return None
-
-    with ops.name_scope(self.name, values=[final_loss]):
-      with ops.name_scope(name):
-        if (self._value_type.stop_gradient or
-            isinstance(self._value_type, SampleValue)):
-          return self._loss_fn(self, self._value, final_loss)
-        elif isinstance(self._value_type, MeanValue):
-          return None  # MeanValue generally provides its own gradient
-        else:
-          raise TypeError("Unrecognized Distribution Value Type: %s",
-                          self._value_type)
-
-
-class ObservedStochasticTensor(StochasticTensor):
-  """A StochasticTensor with an observed value."""
-
-  # pylint: disable=super-init-not-called
-  def __init__(self, dist, value, name=None):
-    """Construct an `ObservedStochasticTensor`.
-
-    `ObservedStochasticTensor` is backed by distribution `dist` and uses the
-    provided value instead of using the current value type to draw a value from
-    the distribution. The provided value argument must be appropriately shaped
-    to have come from the distribution.
-
-    Args:
-      dist: an instance of `Distribution`.
-      value: a Tensor containing the observed value
-      name: a name for this `ObservedStochasticTensor` and its ops.
-
-    Raises:
-      TypeError: if `dist` is not an instance of `Distribution`.
-      ValueError: if `value` is not compatible with the distribution.
-    """
-    if not isinstance(dist, distribution.Distribution):
-      raise TypeError("dist must be an instance of Distribution")
-    with ops.name_scope(name, "ObservedStochasticTensor", [value]) as scope:
-      self._name = scope
-      self._dist = dist
-      dist_shape = self._dist.batch_shape.concatenate(
-          self._dist.event_shape)
-      value = ops.convert_to_tensor(value)
-      value_shape = value.get_shape()
-
-      if not value_shape.is_compatible_with(dist_shape):
-        if value_shape.ndims < dist_shape.ndims:
-          raise ValueError(
-              "Rank of observed value (%d) must be >= rank of a sample from the"
-              " distribution (%d)." % (value_shape.ndims, dist_shape.ndims))
-        sample_shape = value_shape[(value_shape.ndims - dist_shape.ndims):]
-        if not sample_shape.is_compatible_with(dist_shape):
-          raise ValueError(
-              "Shape of observed value %s is incompatible with the shape of a "
-              "sample from the distribution %s." % (value_shape, dist_shape))
-      if value.dtype != self._dist.dtype:
-        raise ValueError("Type of observed value (%s) does not match type of "
-                         "distribution (%s)." % (value.dtype, self._dist.dtype))
-      self._value = array_ops.identity(value)
-    # pylint: disable=non-parent-init-called
-    BaseStochasticTensor.__init__(self)
-
-  def loss(self, final_loss, name=None):
-    return None
-
-
-__all__ = [
-    "BaseStochasticTensor",
-    "StochasticTensor",
-    "ObservedStochasticTensor",
-    "MeanValue",
-    "SampleValue",
-    "value_type",
-    "get_current_value_type",
-]
--- a/tensorflow/contrib/bayesflow/python/ops/stochastic_variables.py
+++ b/tensorflow/contrib/bayesflow/python/ops/stochastic_variables.py
@ -1,151 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Custom `get_variable` for stochastic variables.
-
-@@get_stochastic_variable
-@@make_stochastic_variable_getter
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor as st
-from tensorflow.contrib.bayesflow.python.ops import variational_inference as vi
-
-
-def get_stochastic_variable(getter,
-                            name,
-                            shape=None,
-                            dist_cls=None,
-                            dist_kwargs=None,
-                            param_initializers=None,
-                            prior=None,
-                            **kwargs):
-  """Custom variable getter for stochastic variables.
-
-  `get_stochastic_variable` will create variables backing the parameters of a
-  distribution, defined by `dist_cls`, and return a `StochasticTensor` which
-  represents a sample from the backing distribution.
-
-  Meant to be passed as the `custom_getter` to a `variable_scope`. Use
-  `make_stochastic_variable_getter` to partially apply distribution-related
-  args.
-
-  Usage:
-
-  ```python
-
-  sv = tf.contrib.bayesflow.stochastic_variables
-  dist = tf.contrib.distributions
-
-  with tf.variable_scope('my_scope',
-                         custom_getter=sv.make_stochastic_variable_getter(
-                             dist_cls=dist.NormalWithSoftplusSigma,
-                             param_initializers={
-                               "sigma": lambda shape, dtype, pi: (
-                                   tf.constant(0.5, dtype=dtype, shape=shape))
-                             })):
-    v = tf.get_variable('my_var', (10, 20))
-  ```
-
-  `v` is a `StochasticTensor`, which is a sample from a backing
-  `NormalWithSoftplusSigma` distribution. Underneath, 2 variables have been
-  created: `my_var_mu` and `my_var_sigma`. `my_var_sigma` has been appropriately
-  constrained to be positive by the `NormalWithSoftplusSigma` constructor, and
-  initialized to a value of 0.5, which results in a sigma of ~1 after the
-  softplus. The sample will have shape `(10, 20)`.
-
-  Args:
-    getter: original variable getter.
-    name: prefix for variable(s) backing distribution parameters.
-    shape: shape of the sample from the distribution (i.e. shape of the
-        returned `StochasticTensor`).
-    dist_cls: subclass of `Distribution` that implements `param_shapes`. Should
-        accept unconstrained parameters (e.g. `NormalWithSoftplusSigma` accepts
-        real-valued `sigma` and constrains it to be positive with `softplus`).
-    dist_kwargs: `dict` of kwargs to be forwarded to `dist_cls`.
-    param_initializers: `dict` from parameter name to initializer (see
-        `get_variable` for initializer docs). Will override `initializer` in
-        `kwargs`. `param_initializers` may contain initializers for only some of
-        the parameters. Those parameters that do not contain entries will be
-        initialized by `kwargs['initializer']`, if provided; otherwise, the
-        default initialization of `getter` will be used.
-    prior: instance of `Distribution` or a callable
-        `(TensorShape, dtype) => Distribution`. If provided, will be registered
-        as the prior for the `StochasticTensor` using
-        `variational_inference.register_prior`.
-    **kwargs: kwargs forwarded to `getter`.
-
-  Returns:
-    `StochasticTensor`, which represents a sample from the backing distribution.
-  """
-  param_initializers = param_initializers or {}
-  param_shapes = {}
-
-  if shape is not None:
-    param_shapes = dist_cls.param_static_shapes(shape)
-
-  param_names = set(list(param_shapes.keys()) + list(param_initializers.keys()))
-  params = {}
-  for param_name in param_names:
-    # For each parameter, its param_initializer is used, if provided. Otherwise,
-    # kwargs['initializer'] is used. If neither was provided, the default
-    # variable initialization in getter will be used (i.e. getter will be passed
-    # initializer=None).
-    original_initializer = kwargs.pop('initializer', None)
-    param_initializer = param_initializers.get(param_name, None)
-    if param_initializer is None:
-      param_initializer = original_initializer
-
-    if callable(param_initializer) or param_initializer is None:
-      param_shape = param_shapes.get(param_name, None)
-    else:
-      param_shape = None
-
-    params[param_name] = getter(
-        name + '_' + param_name,
-        shape=param_shape,
-        initializer=param_initializer,
-        **kwargs)
-
-  dist_kwargs = dist_kwargs or {}
-  dist_kwargs.update(params)
-  sample = st.StochasticTensor(dist_cls(**dist_kwargs))
-
-  if prior is not None:
-    if callable(prior):
-      sample_value = sample.value()
-      sample_value.get_shape().assert_is_fully_defined()
-      prior = prior(sample_value.get_shape(), sample_value.dtype)
-
-    vi.register_prior(sample, prior)
-
-  return sample
-
-
-def make_stochastic_variable_getter(dist_cls,
-                                    dist_kwargs=None,
-                                    param_initializers=None,
-                                    prior=None):
-  """`get_stochastic_variable` with args partially applied."""
-  return functools.partial(
-      get_stochastic_variable,
-      dist_cls=dist_cls,
-      dist_kwargs=dist_kwargs,
-      param_initializers=param_initializers,
-      prior=prior)
--- a/tensorflow/contrib/bayesflow/python/ops/variational_inference.py
+++ b/tensorflow/contrib/bayesflow/python/ops/variational_inference.py
@ -1,34 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Variational inference.
-
-See the ${@python/contrib.bayesflow.variational_inference} guide.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.variational_inference_impl import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    "elbo", "elbo_with_log_joint", "ELBOForms", "register_prior"
-]
-
-remove_undocumented(__name__, _allowed_symbols)
--- a/tensorflow/contrib/bayesflow/python/ops/variational_inference_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/variational_inference_impl.py
@ -1,327 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Variational inference.
-
-See the ${@python/contrib.bayesflow.variational_inference} guide.
-
-@@elbo
-@@elbo_with_log_joint
-@@ELBOForms
-@@register_prior
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.bayesflow.python.ops import stochastic_graph_impl as sg
-from tensorflow.contrib.bayesflow.python.ops import stochastic_tensor_impl as st
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.distributions import distribution
-from tensorflow.python.ops.distributions import kullback_leibler
-from tensorflow.python.platform import tf_logging as logging
-
-VI_PRIORS = "__vi_priors__"
-
-
-def register_prior(variational, prior):
-  """Associate a variational `StochasticTensor` with a `Distribution` prior.
-
-  This is a helper function used in conjunction with `elbo` that allows users
-  to specify the mapping between variational distributions and their priors
-  without having to pass in `variational_with_prior` explicitly.
-
-  Args:
-    variational: `StochasticTensor` q(Z). Approximating distribution.
-    prior: `Distribution` p(Z). Prior distribution.
-
-  Returns:
-    None
-
-  Raises:
-    ValueError: if variational is not a `StochasticTensor` or `prior` is not
-      a `Distribution`.
-  """
-  if not isinstance(variational, st.StochasticTensor):
-    raise TypeError("variational must be a StochasticTensor")
-  if not isinstance(prior, distribution.Distribution):
-    raise TypeError("prior must be a Distribution")
-  ops.add_to_collection(VI_PRIORS, (variational, prior))
-
-
-class _ELBOForm(object):
-  pass
-
-
-class ELBOForms(object):
-  """Constants to control the `elbo` calculation.
-
-  `analytic_kl` uses the analytic KL divergence between the
-  variational distribution(s) and the prior(s).
-
-  `analytic_entropy` uses the analytic entropy of the variational
-  distribution(s).
-
-  `sample` uses the sample KL or the sample entropy is the joint is provided.
-
-  See `elbo` for what is used with `default`.
-  """
-  default, analytic_kl, analytic_entropy, sample = (_ELBOForm()
-                                                    for _ in range(4))
-
-  @staticmethod
-  def check_form(form):
-    if form not in {
-        ELBOForms.default, ELBOForms.analytic_kl, ELBOForms.analytic_entropy,
-        ELBOForms.sample
-    }:
-      raise TypeError("form must be an ELBOForms constant")
-
-
-def elbo(log_likelihood,
-         variational_with_prior=None,
-         keep_batch_dim=True,
-         form=None,
-         name="ELBO"):
-  r"""Evidence Lower BOund. `log p(x) >= ELBO`.
-
-  Optimization objective for inference of hidden variables by variational
-  inference.
-
-  This function is meant to be used in conjunction with `StochasticTensor`.
-  The user should build out the inference network, using `StochasticTensor`s
-  as latent variables, and the generative network. `elbo` at minimum needs
-  `p(x|Z)` and assumes that all `StochasticTensor`s upstream of `p(x|Z)` are
-  the variational distributions. Use `register_prior` to register `Distribution`
-  priors for each `StochasticTensor`. Alternatively, pass in
-  `variational_with_prior` specifying all variational distributions and their
-  priors.
-
-  Mathematical details:
-
-  ```
-  log p(x) =  log \int p(x, Z) dZ
-           =  log \int \frac {q(Z)p(x, Z)}{q(Z)} dZ
-           =  log E_q[\frac {p(x, Z)}{q(Z)}]
-           >= E_q[log \frac {p(x, Z)}{q(Z)}] = L[q; p, x]  # ELBO
-
-  L[q; p, x] = E_q[log p(x|Z)p(Z)] - E_q[log q(Z)]
-             = E_q[log p(x|Z)p(Z)] + H[q]           (1)
-             = E_q[log p(x|Z)] - KL(q || p)         (2)
-
-  H - Entropy
-  KL - Kullback-Leibler divergence
-  ```
-
-  See section 2.2 of Stochastic Variational Inference by Hoffman et al. for
-  more, including the ELBO's equivalence to minimizing `KL(q(Z)||p(Z|x))`
-  in the fully Bayesian setting. https://arxiv.org/pdf/1206.7051.pdf.
-
-  `form` specifies which form of the ELBO is used. `form=ELBOForms.default`
-  tries, in order of preference: analytic KL, analytic entropy, sampling.
-
-  Multiple entries in the `variational_with_prior` dict implies a factorization.
-  e.g. `q(Z) = q(z1)q(z2)q(z3)`.
-
-  Args:
-    log_likelihood: `Tensor` log p(x|Z).
-    variational_with_prior: dict from `StochasticTensor` q(Z) to
-      `Distribution` p(Z). If `None`, defaults to all `StochasticTensor`
-      objects upstream of `log_likelihood` with priors registered with
-      `register_prior`.
-    keep_batch_dim: bool. Whether to keep the batch dimension when summing
-      entropy/KL term. When the sample is per data point, this should be True;
-      otherwise (e.g. in a Bayesian NN), this should be False.
-    form: ELBOForms constant. Controls how the ELBO is computed. Defaults to
-      ELBOForms.default.
-    name: name to prefix ops with.
-
-  Returns:
-    `Tensor` ELBO of the same type and shape as `log_likelihood`.
-
-  Raises:
-    TypeError: if variationals in `variational_with_prior` are not
-      `StochasticTensor`s or if priors are not `Distribution`s.
-    TypeError: if form is not a valid ELBOForms constant.
-    ValueError: if `variational_with_prior` is None and there are no
-      `StochasticTensor`s upstream of `log_likelihood`.
-    ValueError: if any variational does not have a prior passed or registered.
-  """
-  if form is None:
-    form = ELBOForms.default
-  with ops.name_scope(name):
-    model = ops.convert_to_tensor(log_likelihood)
-    variational_with_prior = _find_variational_and_priors(
-        model, variational_with_prior)
-    return _elbo(form, log_likelihood, None, variational_with_prior,
-                 keep_batch_dim)
-
-
-def elbo_with_log_joint(log_joint,
-                        variational=None,
-                        keep_batch_dim=True,
-                        form=None,
-                        name="ELBO"):
-  """Evidence Lower BOund. `log p(x) >= ELBO`.
-
-  This method is for models that have computed `p(x,Z)` instead of `p(x|Z)`.
-  See `elbo` for further details.
-
-  Because only the joint is specified, analytic KL is not available.
-
-  Args:
-    log_joint: `Tensor` log p(x, Z).
-    variational: list of `StochasticTensor` q(Z). If `None`, defaults to all
-      `StochasticTensor` objects upstream of `log_joint`.
-    keep_batch_dim: bool. Whether to keep the batch dimension when summing
-      entropy term. When the sample is per data point, this should be True;
-      otherwise (e.g. in a Bayesian NN), this should be False.
-    form: ELBOForms constant. Controls how the ELBO is computed. Defaults to
-      ELBOForms.default.
-    name: name to prefix ops with.
-
-  Returns:
-    `Tensor` ELBO of the same type and shape as `log_joint`.
-
-  Raises:
-    TypeError: if variationals in `variational` are not `StochasticTensor`s.
-    TypeError: if form is not a valid ELBOForms constant.
-    ValueError: if `variational` is None and there are no `StochasticTensor`s
-      upstream of `log_joint`.
-    ValueError: if form is ELBOForms.analytic_kl.
-  """
-  if form is None:
-    form = ELBOForms.default
-  if form == ELBOForms.analytic_kl:
-    raise ValueError("ELBOForms.analytic_kl is not available when using "
-                     "elbo_with_log_joint. Use elbo or a different form.")
-
-  with ops.name_scope(name):
-    model = ops.convert_to_tensor(log_joint)
-
-    variational_with_prior = None
-    if variational is not None:
-      variational_with_prior = dict(zip(variational, [None] * len(variational)))
-    variational_with_prior = _find_variational_and_priors(
-        model, variational_with_prior, require_prior=False)
-    return _elbo(form, None, log_joint, variational_with_prior, keep_batch_dim)
-
-
-def _elbo(form, log_likelihood, log_joint, variational_with_prior,
-          keep_batch_dim):
-  """Internal implementation of ELBO. Users should use `elbo`.
-
-  Args:
-    form: ELBOForms constant. Controls how the ELBO is computed.
-    log_likelihood: `Tensor` log p(x|Z).
-    log_joint: `Tensor` log p(x, Z).
-    variational_with_prior: `dict<StochasticTensor, Distribution>`, varational
-      distributions to prior distributions.
-    keep_batch_dim: bool. Whether to keep the batch dimension when reducing
-      the entropy/KL.
-
-  Returns:
-    ELBO `Tensor` with same shape and dtype as `log_likelihood`/`log_joint`.
-  """
-  ELBOForms.check_form(form)
-
-  # Order of preference
-  # 1. Analytic KL: log_likelihood - KL(q||p)
-  # 2. Analytic entropy: log_likelihood + log p(Z) + H[q], or log_joint + H[q]
-  # 3. Sample: log_likelihood - (log q(Z) - log p(Z)) =
-  #            log_likelihood + log p(Z) - log q(Z), or log_joint - q(Z)
-
-  def _reduce(val):
-    if keep_batch_dim:
-      return val
-    else:
-      return math_ops.reduce_sum(val)
-
-  kl_terms = []
-  entropy_terms = []
-  prior_terms = []
-  for q, z, p in [(qz.distribution, qz.value(), pz)
-                  for qz, pz in variational_with_prior.items()]:
-    # Analytic KL
-    kl = None
-    if log_joint is None and form in {ELBOForms.default, ELBOForms.analytic_kl}:
-      try:
-        kl = kullback_leibler.kl_divergence(q, p)
-        logging.info("Using analytic KL between q:%s, p:%s", q, p)
-      except NotImplementedError as e:
-        if form == ELBOForms.analytic_kl:
-          raise e
-    if kl is not None:
-      kl_terms.append(-1. * _reduce(kl))
-      continue
-
-    # Analytic entropy
-    entropy = None
-    if form in {ELBOForms.default, ELBOForms.analytic_entropy}:
-      try:
-        entropy = q.entropy()
-        logging.info("Using analytic entropy for q:%s", q)
-      except NotImplementedError as e:
-        if form == ELBOForms.analytic_entropy:
-          raise e
-    if entropy is not None:
-      entropy_terms.append(_reduce(entropy))
-      if log_likelihood is not None:
-        prior = p.log_prob(z)
-        prior_terms.append(_reduce(prior))
-      continue
-
-    # Sample
-    if form in {ELBOForms.default, ELBOForms.sample}:
-      entropy = -q.log_prob(z)
-      entropy_terms.append(_reduce(entropy))
-      if log_likelihood is not None:
-        prior = p.log_prob(z)
-        prior_terms.append(_reduce(prior))
-
-  first_term = log_joint if log_joint is not None else log_likelihood
-  return sum([first_term] + kl_terms + entropy_terms + prior_terms)
-
-
-def _find_variational_and_priors(model,
-                                 variational_with_prior,
-                                 require_prior=True):
-  """Find upstream StochasticTensors and match with registered priors."""
-  if variational_with_prior is None:
-    # pylint: disable=protected-access
-    upstreams = sg._upstream_stochastic_nodes([model])
-    # pylint: enable=protected-access
-    upstreams = list(upstreams[model])
-    if not upstreams:
-      raise ValueError("No upstream stochastic nodes found for tensor: %s",
-                       model)
-    prior_map = dict(ops.get_collection(VI_PRIORS))
-    variational_with_prior = {}
-    for q in upstreams:
-      if require_prior and (q not in prior_map or prior_map[q] is None):
-        raise ValueError("No prior specified for StochasticTensor: %s", q)
-      variational_with_prior[q] = prior_map.get(q)
-
-  if not all(
-      [isinstance(q, st.StochasticTensor) for q in variational_with_prior]):
-    raise TypeError("variationals must be StochasticTensors")
-  if not all([
-      p is None or isinstance(p, distribution.Distribution)
-      for p in variational_with_prior.values()
-  ]):
-    raise TypeError("priors must be Distribution objects")
-
-  return variational_with_prior
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@ -2,12 +2,15 @@
 #   Contains ops for statistical distributions (with pdf, cdf, sample, etc...).
 #   APIs here are meant to evolve over time.

+package(default_visibility = [
+    "//learning/brain/contrib/bayesflow:__subpackages__",
+    "//tensorflow:__subpackages__",
+])
+
 licenses(["notice"])  # Apache 2.0

 exports_files(["LICENSE"])

-package(default_visibility = ["//tensorflow:__subpackages__"])
-
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")

 py_library(
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.entropy.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.entropy.md
@ -1,47 +1 @@
 # BayesFlow Entropy (contrib)
-[TOC]
-
-Entropy Ops.
-
-## Background
-
-Common Shannon entropy, the Evidence Lower BOund (ELBO), KL divergence, and more
-all have information theoretic use and interpretations.  They are also often
-used in variational inference.  This library brings together `Ops` for
-estimating them, e.g. using Monte Carlo expectations.
-
-## Examples
-
-Example of fitting a variational posterior with the ELBO.
-
-```python
-# We start by assuming knowledge of the log of a joint density p(z, x) over
-# latent variable z and fixed measurement x.  Since x is fixed, the Python
-# function does not take x as an argument.
-def log_joint(z):
-  theta = tf.Variable(0.)  # Trainable variable that helps define log_joint.
-  ...
-
-# Next, define a Normal distribution with trainable parameters.
-q = distributions.Normal(mu=tf.Variable(0.), sigma=tf.Variable(1.))
-
-# Now, define a loss function (negative ELBO) that, when minimized, will adjust
-# mu, sigma, and theta, increasing the ELBO, which we hope will both reduce the
-# KL divergence between q(z) and p(z | x), and increase p(x).  Note that we
-# cannot guarantee both, but in general we expect both to happen.
-elbo = entropy.elbo_ratio(log_p, q, n=10)
-loss = -elbo
-
-# Minimize the loss
-train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
-tf.global_variables_initializer().run()
-for step in range(100):
-  train_op.run()
-```
-
-## Ops
-
-*   @{tf.contrib.bayesflow.entropy.elbo_ratio}
-*   @{tf.contrib.bayesflow.entropy.entropy_shannon}
-*   @{tf.contrib.bayesflow.entropy.renyi_ratio}
-*   @{tf.contrib.bayesflow.entropy.renyi_alpha}
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_graph.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_graph.md
@ -1,8 +1 @@
 # BayesFlow Stochastic Graph (contrib)
-[TOC]
-
-Classes and helper functions for Stochastic Computation Graphs.
-
-## Stochastic Computation Graph Helper Functions
-
-*   @{tf.contrib.bayesflow.stochastic_graph.surrogate_loss}
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_tensor.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.stochastic_tensor.md
@ -1,24 +1,3 @@
 # BayesFlow Stochastic Tensors (contrib)
 [TOC]

-Classes and helper functions for creating Stochastic Tensors.
-
-`StochasticTensor` objects wrap `Distribution` objects.  Their
-values may be samples from the underlying distribution, or the distribution
-mean (as governed by `value_type`).  These objects provide a `loss`
-method for use when sampling from a non-reparameterized distribution.
-The `loss`method is used in conjunction with `stochastic_graph.surrogate_loss`
-to produce a single differentiable loss in stochastic graphs having
-both continuous and discrete stochastic nodes.
-
-## Stochastic Tensor Classes
-
-*   @{tf.contrib.bayesflow.stochastic_tensor.BaseStochasticTensor}
-*   @{tf.contrib.bayesflow.stochastic_tensor.StochasticTensor}
-
-## Stochastic Tensor Value Types
-
-*   @{tf.contrib.bayesflow.stochastic_tensor.MeanValue}
-*   @{tf.contrib.bayesflow.stochastic_tensor.SampleValue}
-*   @{tf.contrib.bayesflow.stochastic_tensor.value_type}
-*   @{tf.contrib.bayesflow.stochastic_tensor.get_current_value_type}
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.variational_inference.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.variational_inference.md
@ -2,10 +2,3 @@
 [TOC]

 Variational inference.
-
-## Ops
-
-*   @{tf.contrib.bayesflow.variational_inference.elbo}
-*   @{tf.contrib.bayesflow.variational_inference.elbo_with_log_joint}
-*   @{tf.contrib.bayesflow.variational_inference.ELBOForms}
-*   @{tf.contrib.bayesflow.variational_inference.register_prior}