Fix: add GDN to __init__. Also put it in alphabetical order.

PiperOrigin-RevId: 163842410
2025-12-07 00:20:20 +01:00 · 2017-08-01 09:44:30 -07:00 · 2017-08-01 09:44:30 -07:00 · 1a44996072
commit 1a44996072
parent db0e1c6c8e
2 changed files with 314 additions and 294 deletions
--- a/tensorflow/contrib/layers/init.py
+++ b/tensorflow/contrib/layers/init.py
@ -32,6 +32,8 @@ See the @{$python/contrib.layers} guide.
@@embedding_lookup_unique
@@flatten
@@fully_connected
+@@GDN
+@@gdn
@@layer_norm
@@linear
@@max_pool2d
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@ -71,6 +71,8 @@ __all__ = ['avg_pool2d',
           'elu',
           'flatten',
           'fully_connected',
+           'GDN',
+           'gdn',
           'layer_norm',
           'linear',
           'pool',
@ -1682,6 +1684,316 @@ def fully_connected(inputs,
        outputs_collections, sc.original_name_scope, outputs)


+class GDN(base.Layer):
+  """Generalized divisive normalization layer.
+
+  Based on the papers:
+
+    "Density Modeling of Images using a Generalized Normalization
+    Transformation"
+
+    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
+
+    https://arxiv.org/abs/1511.06281
+
+    "End-to-end Optimized Image Compression"
+
+    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
+
+    https://arxiv.org/abs/1611.01704
+
+  Implements an activation function that is essentially a multivariate
+  generalization of a particular sigmoid-type function:
+
+  ```
+  y[i] = x[i] / sqrt(beta[i] + sum_j(gamma[j, i] * x[j]))
+  ```
+
+  where `i` and `j` run over channels. This implementation never sums across
+  spatial dimensions. It is similar to local response normalization, but much
+  more flexible, as `beta` and `gamma` are trainable parameters.
+
+  Arguments:
+    inverse: If `False` (default), compute GDN response. If `True`, compute IGDN
+      response (one step of fixed point iteration to invert GDN; the division
+      is replaced by multiplication).
+    beta_min: Lower bound for beta, to prevent numerical error from causing
+      square root of zero or negative values.
+    gamma_init: The gamma matrix will be initialized as the identity matrix
+      multiplied with this value. If set to zero, the layer is effectively
+      initialized to the identity operation, since beta is initialized as one.
+      A good default setting is somewhere between 0 and 0.5.
+    reparam_offset: Offset added to the reparameterization of beta and gamma.
+      The reparameterization of beta and gamma as their square roots lets the
+      training slow down when their values are close to zero, which is desirable
+      as small values in the denominator can lead to a situation where gradient
+      noise on beta/gamma leads to extreme amounts of noise in the GDN
+      activations. However, without the offset, we would get zero gradients if
+      any elements of beta or gamma were exactly zero, and thus the training
+      could get stuck. To prevent this, we add this small constant. The default
+      value was empirically determined as a good starting point. Making it
+      bigger potentially leads to more gradient noise on the activations, making
+      it too small may lead to numerical precision issues.
+    data_format: Format of input tensor. Currently supports `'channels_first'`
+      and `'channels_last'`.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True`, also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: String, the name of the layer. Layers with the same name will
+      share weights, but to avoid mistakes we require `reuse=True` in such
+      cases.
+
+  Properties:
+    inverse: Boolean, whether GDN is computed (`True`) or IGDN (`False`).
+    data_format: Format of input tensor. Currently supports `'channels_first'`
+      and `'channels_last'`.
+    beta: The beta parameter as defined above (1D `Tensor`).
+    gamma: The gamma parameter as defined above (2D `Tensor`).
+  """
+
+  def __init__(self,
+               inverse=False,
+               beta_min=1e-6,
+               gamma_init=.1,
+               reparam_offset=2 ** -18,
+               data_format='channels_last',
+               activity_regularizer=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(GDN, self).__init__(trainable=trainable, name=name, **kwargs)
+    self.inverse = inverse
+    self._beta_min = beta_min
+    self._gamma_init = gamma_init
+    self._reparam_offset = reparam_offset
+    self.data_format = data_format
+    self.activity_regularizer = activity_regularizer
+    self._channel_axis()  # trigger ValueError early
+    self.input_spec = base.InputSpec(min_ndim=3, max_ndim=5)
+
+  def _channel_axis(self):
+    try:
+      return {'channels_first': 1, 'channels_last': -1}[self.data_format]
+    except KeyError:
+      raise ValueError('Unsupported `data_format` for GDN layer: {}.'.format(
+          self.data_format))
+
+  @staticmethod
+  def _lower_bound(inputs, bound, name=None):
+    """Same as tf.maximum, but with helpful gradient for inputs < bound.
+
+    The gradient is overwritten so that it is passed through if the input is not
+    hitting the bound. If it is, only gradients that push `inputs` higher than
+    the bound are passed through. No gradients are passed through to the bound.
+
+    Args:
+      inputs: input tensor
+      bound: lower bound for the input tensor
+      name: name for this op
+
+    Returns:
+      tf.maximum(inputs, bound)
+    """
+    with ops.name_scope(name, 'GDNLowerBound', [inputs, bound]) as scope:
+      inputs = ops.convert_to_tensor(inputs, name='inputs')
+      bound = ops.convert_to_tensor(bound, name='bound')
+      with ops.get_default_graph().gradient_override_map(
+          {'Maximum': 'GDNLowerBound'}):
+        return math_ops.maximum(inputs, bound, name=scope)
+
+  @staticmethod
+  def _lower_bound_grad(op, grad):
+    """Gradient for `_lower_bound`.
+
+    Args:
+      op: the tensorflow op for which to calculate a gradient
+      grad: gradient with respect to the output of the op
+
+    Returns:
+      gradients with respect to the inputs of the op
+    """
+    inputs = op.inputs[0]
+    bound = op.inputs[1]
+    pass_through_if = math_ops.logical_or(inputs >= bound, grad < 0)
+    return [math_ops.cast(pass_through_if, grad.dtype) * grad, None]
+
+  def build(self, input_shape):
+    channel_axis = self._channel_axis()
+    input_shape = tensor_shape.TensorShape(input_shape)
+    num_channels = input_shape[channel_axis].value
+    if num_channels is None:
+      raise ValueError('The channel dimension of the inputs to `GDN` '
+                       'must be defined.')
+    self._input_rank = input_shape.ndims
+    self.input_spec = base.InputSpec(ndim=input_shape.ndims,
+                                     axes={channel_axis: num_channels})
+
+    pedestal = array_ops.constant(self._reparam_offset ** 2, dtype=self.dtype)
+    beta_bound = array_ops.constant(
+        (self._beta_min + self._reparam_offset ** 2) ** .5, dtype=self.dtype)
+    gamma_bound = array_ops.constant(self._reparam_offset, dtype=self.dtype)
+
+    def beta_initializer(shape, dtype=None, partition_info=None):
+      del partition_info  # unused
+      return math_ops.sqrt(array_ops.ones(shape, dtype=dtype) + pedestal)
+
+    def gamma_initializer(shape, dtype=None, partition_info=None):
+      del partition_info  # unused
+      assert len(shape) == 2
+      assert shape[0] == shape[1]
+      eye = linalg_ops.eye(shape[0], dtype=dtype)
+      return math_ops.sqrt(self._gamma_init * eye + pedestal)
+
+    beta = self.add_variable('reparam_beta',
+                             shape=[num_channels],
+                             initializer=beta_initializer,
+                             dtype=self.dtype,
+                             trainable=True)
+    beta = self._lower_bound(beta, beta_bound)
+    self.beta = math_ops.square(beta) - pedestal
+
+    gamma = self.add_variable('reparam_gamma',
+                              shape=[num_channels, num_channels],
+                              initializer=gamma_initializer,
+                              dtype=self.dtype,
+                              trainable=True)
+    gamma = self._lower_bound(gamma, gamma_bound)
+    self.gamma = math_ops.square(gamma) - pedestal
+
+    self.built = True
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+    ndim = self._input_rank
+
+    shape = self.gamma.get_shape().as_list()
+    gamma = array_ops.reshape(self.gamma, (ndim - 2) * [1] + shape)
+
+    # Compute normalization pool.
+    if self.data_format == 'channels_first':
+      norm_pool = nn.convolution(math_ops.square(inputs), gamma, 'VALID',
+                                 data_format='NC' + 'DHW'[-(ndim - 2):])
+      if ndim == 3:
+        norm_pool = array_ops.expand_dims(norm_pool, 2)
+        norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
+        norm_pool = array_ops.squeeze(norm_pool, [2])
+      elif ndim == 5:
+        shape = array_ops.shape(norm_pool)
+        norm_pool = array_ops.reshape(norm_pool, shape[:3] + [-1])
+        norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
+        norm_pool = array_ops.reshape(norm_pool, shape)
+      else:  # ndim == 4
+        norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
+    else:  # channels_last
+      norm_pool = nn.convolution(math_ops.square(inputs), gamma, 'VALID')
+      norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NHWC')
+    norm_pool = math_ops.sqrt(norm_pool)
+
+    if self.inverse:
+      outputs = inputs * norm_pool
+    else:
+      outputs = inputs / norm_pool
+    outputs.set_shape(inputs.get_shape())
+    return outputs
+
+  def _compute_output_shape(self, input_shape):
+    channel_axis = self._channel_axis()
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if not 3 <= input_shape.ndim <= 5:
+      raise ValueError('`input_shape` must be of rank 3 to 5, inclusive.')
+    if input_shape[channel_axis].value is None:
+      raise ValueError(
+          'The channel dimension of `input_shape` must be defined.')
+    return input_shape
+
+
+ops.RegisterGradient('GDNLowerBound')(GDN._lower_bound_grad)  # pylint:disable=protected-access
+
+
+def gdn(inputs,
+        inverse=False,
+        beta_min=1e-6,
+        gamma_init=.1,
+        reparam_offset=2 ** -18,
+        data_format='channels_last',
+        activity_regularizer=None,
+        trainable=True,
+        name=None,
+        reuse=None):
+  """Functional interface for GDN layer.
+
+  Based on the papers:
+
+    "Density Modeling of Images using a Generalized Normalization
+    Transformation"
+    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
+    https://arxiv.org/abs/1511.06281
+
+    "End-to-end Optimized Image Compression"
+    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
+    https://arxiv.org/abs/1611.01704
+
+  Implements an activation function that is essentially a multivariate
+  generalization of a particular sigmoid-type function:
+
+  ```
+  y[i] = x[i] / sqrt(beta[i] + sum_j(gamma[j, i] * x[j]))
+  ```
+
+  where `i` and `j` run over channels. This implementation never sums across
+  spatial dimensions. It is similar to local response normalization, but much
+  more flexible, as `beta` and `gamma` are trainable parameters.
+
+  Arguments:
+    inputs: Tensor input.
+    inverse: If `False` (default), compute GDN response. If `True`, compute IGDN
+      response (one step of fixed point iteration to invert GDN; the division
+      is replaced by multiplication).
+    beta_min: Lower bound for beta, to prevent numerical error from causing
+      square root of zero or negative values.
+    gamma_init: The gamma matrix will be initialized as the identity matrix
+      multiplied with this value. If set to zero, the layer is effectively
+      initialized to the identity operation, since beta is initialized as one.
+      A good default setting is somewhere between 0 and 0.5.
+    reparam_offset: Offset added to the reparameterization of beta and gamma.
+      The reparameterization of beta and gamma as their square roots lets the
+      training slow down when their values are close to zero, which is desirable
+      as small values in the denominator can lead to a situation where gradient
+      noise on beta/gamma leads to extreme amounts of noise in the GDN
+      activations. However, without the offset, we would get zero gradients if
+      any elements of beta or gamma were exactly zero, and thus the training
+      could get stuck. To prevent this, we add this small constant. The default
+      value was empirically determined as a good starting point. Making it
+      bigger potentially leads to more gradient noise on the activations, making
+      it too small may lead to numerical precision issues.
+    data_format: Format of input tensor. Currently supports `'channels_first'`
+      and `'channels_last'`.
+    activity_regularizer: Regularizer function for the output.
+    trainable: Boolean, if `True`, also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: String, the name of the layer. Layers with the same name will
+      share weights, but to avoid mistakes we require `reuse=True` in such
+      cases.
+    reuse: Boolean, whether to reuse the weights of a previous layer by the same
+      name.
+
+  Returns:
+    Output tensor.
+  """
+  layer = GDN(inverse=inverse,
+              beta_min=beta_min,
+              gamma_init=gamma_init,
+              reparam_offset=reparam_offset,
+              data_format=data_format,
+              activity_regularizer=activity_regularizer,
+              trainable=trainable,
+              name=name,
+              dtype=inputs.dtype.base_dtype,
+              _scope=name,
+              _reuse=reuse)
+  return layer.apply(inputs)
+
+
@add_arg_scope
 def layer_norm(inputs,
               center=True,
@ -1812,300 +2124,6 @@ def layer_norm(inputs,
                                       outputs)


-class GDN(base.Layer):
-  """Generalized divisive normalization layer.
-
-  Based on the papers:
-
-    "Density Modeling of Images using a Generalized Normalization
-    Transformation"
-    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
-    https://arxiv.org/abs/1511.06281
-
-    "End-to-end Optimized Image Compression"
-    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
-    https://arxiv.org/abs/1611.01704
-
-  Implements an activation function that is essentially a multivariate
-  generalization of a particular sigmoid-type function:
-
-  y[i] = x[i] / sqrt(beta[i] + sum_j(gamma[j, i] * x[j]))
-
-  where i and j run over channels. This implementation never sums across spatial
-  dimensions. It is similar to local response normalization, but more powerful,
-  as beta and gamma are trainable parameters.
-
-  Arguments:
-    inverse: If False (default), compute GDN response. If True, compute IGDN
-      response (one step of fixed point iteration to invert GDN; the division
-      is replaced by multiplication).
-    beta_min: Lower bound for beta, to prevent numerical error from causing
-      square root of zero or negative values.
-    gamma_init: The gamma matrix will be initialized as the identity matrix
-      multiplied with this value. If set to zero, the layer is effectively
-      initialized to the identity operation, since beta is initialized as one.
-      A good default setting is somewhere between 0 and 0.5.
-    reparam_offset: Offset added to the reparameterization of beta and gamma.
-      The reparameterization of beta and gamma as their square roots lets the
-      training slow down when their values are close to zero, which is desirable
-      as small values in the denominator can lead to a situation where gradient
-      noise on beta/gamma leads to extreme amounts of noise in the GDN
-      activations. However, without the offset, we would get zero gradients if
-      any elements of beta or gamma were exactly zero, and thus the training
-      could get stuck. To prevent this, we add this small constant. The default
-      value was empirically determined as a good starting point. Making it
-      bigger potentially leads to more gradient noise on the activations, making
-      it too small may lead to numerical precision issues.
-    data_format: Format of input tensor. Currently supports 'channels_first' and
-      'channels_last'.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    name: String, the name of the layer. Layers with the same name will
-      share weights, but to avoid mistakes we require reuse=True in such cases.
-    reuse: Boolean, whether to reuse the weights of a previous layer
-      by the same name.
-
-  Properties:
-    inverse: Boolean, whether GDN is computed (True) or IGDN (False).
-    data_format: Format of input tensor. Currently supports 'channels_first' and
-      'channels_last'.
-    beta: The beta parameter as defined above (1D TensorFlow tensor).
-    gamma: The gamma parameter as defined above (2D TensorFlow tensor).
-  """
-
-  def __init__(self,
-               inverse=False,
-               beta_min=1e-6,
-               gamma_init=.1,
-               reparam_offset=2 ** -18,
-               data_format='channels_last',
-               trainable=True,
-               name=None,
-               **kwargs):
-    super(GDN, self).__init__(trainable=trainable, name=name, **kwargs)
-    self.inverse = inverse
-    self._beta_min = beta_min
-    self._gamma_init = gamma_init
-    self._reparam_offset = reparam_offset
-    self.data_format = data_format
-    self._channel_axis()  # trigger ValueError early
-    self.input_spec = base.InputSpec(min_ndim=3, max_ndim=5)
-
-  def _channel_axis(self):
-    try:
-      return {'channels_first': 1, 'channels_last': -1}[self.data_format]
-    except KeyError:
-      raise ValueError('Unsupported `data_format` for GDN layer: {}.'.format(
-          self.data_format))
-
-  @staticmethod
-  def _lower_bound(inputs, bound, name=None):
-    """Same as tf.maximum, but with helpful gradient for inputs < bound.
-
-    The gradient is overwritten so that it is passed through if the input is not
-    hitting the bound. If it is, only gradients that push `inputs` higher than
-    the bound are passed through. No gradients are passed through to the bound.
-
-    Args:
-      inputs: input tensor
-      bound: lower bound for the input tensor
-      name: name for this op
-
-    Returns:
-      tf.maximum(inputs, bound)
-    """
-    with ops.name_scope(name, 'GDNLowerBound', [inputs, bound]) as scope:
-      inputs = ops.convert_to_tensor(inputs, name='inputs')
-      bound = ops.convert_to_tensor(bound, name='bound')
-      with ops.get_default_graph().gradient_override_map(
-          {'Maximum': 'GDNLowerBound'}):
-        return math_ops.maximum(inputs, bound, name=scope)
-
-  @ops.RegisterGradient('GDNLowerBound')
-  @staticmethod
-  def _lower_bound_grad(op, grad):
-    """Gradient for `_lower_bound`.
-
-    Args:
-      op: the tensorflow op for which to calculate a gradient
-      grad: gradient with respect to the output of the op
-
-    Returns:
-      gradients with respect to the inputs of the op
-    """
-    inputs = op.inputs[0]
-    bound = op.inputs[1]
-    pass_through_if = math_ops.logical_or(inputs >= bound, grad < 0)
-    return [math_ops.cast(pass_through_if, grad.dtype) * grad, None]
-
-  def build(self, input_shape):
-    channel_axis = self._channel_axis()
-    input_shape = tensor_shape.TensorShape(input_shape)
-    num_channels = input_shape[channel_axis].value
-    if num_channels is None:
-      raise ValueError('The channel dimension of the inputs to `GDN` '
-                       'must be defined.')
-    self._input_rank = input_shape.ndims
-    self.input_spec = base.InputSpec(ndim=input_shape.ndims,
-                                     axes={channel_axis: num_channels})
-
-    pedestal = array_ops.constant(self._reparam_offset ** 2, dtype=self.dtype)
-    beta_bound = array_ops.constant(
-        (self._beta_min + self._reparam_offset ** 2) ** .5, dtype=self.dtype)
-    gamma_bound = array_ops.constant(self._reparam_offset, dtype=self.dtype)
-
-    def beta_initializer(shape, dtype=None, partition_info=None):
-      del partition_info  # unused
-      return math_ops.sqrt(array_ops.ones(shape, dtype=dtype) + pedestal)
-
-    def gamma_initializer(shape, dtype=None, partition_info=None):
-      del partition_info  # unused
-      assert len(shape) == 2
-      assert shape[0] == shape[1]
-      eye = linalg_ops.eye(shape[0], dtype=dtype)
-      return math_ops.sqrt(self._gamma_init * eye + pedestal)
-
-    beta = self.add_variable('reparam_beta',
-                             shape=[num_channels],
-                             initializer=beta_initializer,
-                             dtype=self.dtype,
-                             trainable=True)
-    beta = self._lower_bound(beta, beta_bound)
-    self.beta = math_ops.square(beta) - pedestal
-
-    gamma = self.add_variable('reparam_gamma',
-                              shape=[num_channels, num_channels],
-                              initializer=gamma_initializer,
-                              dtype=self.dtype,
-                              trainable=True)
-    gamma = self._lower_bound(gamma, gamma_bound)
-    self.gamma = math_ops.square(gamma) - pedestal
-
-    self.built = True
-
-  def call(self, inputs):
-    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
-    ndim = self._input_rank
-
-    shape = self.gamma.get_shape().as_list()
-    gamma = array_ops.reshape(self.gamma, (ndim - 2) * [1] + shape)
-
-    # Compute normalization pool.
-    if self.data_format == 'channels_first':
-      norm_pool = nn.convolution(math_ops.square(inputs), gamma, 'VALID',
-                                 data_format='NC' + 'DHW'[-(ndim - 2):])
-      if ndim == 3:
-        norm_pool = array_ops.expand_dims(norm_pool, 2)
-        norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
-        norm_pool = array_ops.squeeze(norm_pool, [2])
-      elif ndim == 5:
-        shape = array_ops.shape(norm_pool)
-        norm_pool = array_ops.reshape(norm_pool, shape[:3] + [-1])
-        norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
-        norm_pool = array_ops.reshape(norm_pool, shape)
-      else:  # ndim == 4
-        norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
-    else:  # channels_last
-      norm_pool = nn.convolution(math_ops.square(inputs), gamma, 'VALID')
-      norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NHWC')
-    norm_pool = math_ops.sqrt(norm_pool)
-
-    if self.inverse:
-      outputs = inputs * norm_pool
-    else:
-      outputs = inputs / norm_pool
-    outputs.set_shape(inputs.get_shape())
-    return outputs
-
-  def _compute_output_shape(self, input_shape):
-    channel_axis = self._channel_axis()
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if not 3 <= input_shape.ndim <= 5:
-      raise ValueError('`input_shape` must be of rank 3 to 5, inclusive.')
-    if input_shape[channel_axis].value is None:
-      raise ValueError(
-          'The channel dimension of `input_shape` must be defined.')
-    return input_shape
-
-
-def gdn(inputs,
-        inverse=False,
-        beta_min=1e-6,
-        gamma_init=.1,
-        reparam_offset=2 ** -18,
-        data_format='channels_last',
-        trainable=True,
-        name=None,
-        reuse=None):
-  """Functional interface for GDN layer.
-
-  Based on the papers:
-
-    "Density Modeling of Images using a Generalized Normalization
-    Transformation"
-    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
-    https://arxiv.org/abs/1511.06281
-
-    "End-to-end Optimized Image Compression"
-    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
-    https://arxiv.org/abs/1611.01704
-
-  Implements an activation function that is essentially a multivariate
-  generalization of a particular sigmoid-type function:
-
-  y[i] = x[i] / sqrt(beta[i] + sum_j(gamma[j, i] * x[j]))
-
-  where i and j run over channels. This implementation never sums across spatial
-  dimensions. It is similar to local response normalization, but more powerful,
-  as beta and gamma are trainable parameters.
-
-  Arguments:
-    inputs: Tensor input.
-    inverse: If False (default), compute GDN response. If True, compute IGDN
-      response (one step of fixed point iteration to invert GDN; the division
-      is replaced by multiplication).
-    beta_min: Lower bound for beta, to prevent numerical error from causing
-      square root of zero or negative values.
-    gamma_init: The gamma matrix will be initialized as the identity matrix
-      multiplied with this value. If set to zero, the layer is effectively
-      initialized to the identity operation, since beta is initialized as one.
-      A good default setting is somewhere between 0 and 0.5.
-    reparam_offset: Offset added to the reparameterization of beta and gamma.
-      The reparameterization of beta and gamma as their square roots lets the
-      training slow down when their values are close to zero, which is desirable
-      as small values in the denominator can lead to a situation where gradient
-      noise on beta/gamma leads to extreme amounts of noise in the GDN
-      activations. However, without the offset, we would get zero gradients if
-      any elements of beta or gamma were exactly zero, and thus the training
-      could get stuck. To prevent this, we add this small constant. The default
-      value was empirically determined as a good starting point. Making it
-      bigger potentially leads to more gradient noise on the activations, making
-      it too small may lead to numerical precision issues.
-    data_format: Format of input tensor. Currently supports 'channels_first' and
-      'channels_last'.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    name: String, the name of the layer. Layers with the same name will
-      share weights, but to avoid mistakes we require reuse=True in such cases.
-    reuse: Boolean, whether to reuse the weights of a previous layer
-      by the same name.
-
-  Returns:
-    Output tensor.
-  """
-  layer = GDN(inverse=inverse,
-              beta_min=beta_min,
-              gamma_init=gamma_init,
-              reparam_offset=reparam_offset,
-              data_format=data_format,
-              trainable=trainable,
-              name=name,
-              dtype=inputs.dtype.base_dtype,
-              _scope=name,
-              _reuse=reuse)
-  return layer.apply(inputs)
-
-
@add_arg_scope
 def max_pool2d(inputs,
               kernel_size,