1# -*- coding: utf-8 -*-
2# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
3#
4# Licensed under the Apache License, Version 2.0 (the "License");
5# you may not use this file except in compliance with the License.
6# You may obtain a copy of the License at
7#
8#     http://www.apache.org/licenses/LICENSE-2.0
9#
10# Unless required by applicable law or agreed to in writing, software
11# distributed under the License is distributed on an "AS IS" BASIS,
12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13# See the License for the specific language governing permissions and
14# limitations under the License.
15# ==============================================================================
16
17# pylint: disable=g-short-docstring-punctuation
18"""Higher level ops for building layers."""
19
20from __future__ import absolute_import
21from __future__ import division
22from __future__ import print_function
23
24import functools
25import six
26
27from tensorflow.contrib.framework.python.ops import add_arg_scope
28from tensorflow.contrib.framework.python.ops import variables
29from tensorflow.contrib.layers.python.layers import initializers
30from tensorflow.contrib.layers.python.layers import utils
31from tensorflow.python.eager import context
32from tensorflow.python.framework import constant_op
33from tensorflow.python.framework import dtypes
34from tensorflow.python.framework import function
35from tensorflow.python.framework import ops
36from tensorflow.python.framework import sparse_tensor
37from tensorflow.python.framework import tensor_shape
38from tensorflow.python.keras.engine import input_spec
39from tensorflow.python.layers import base
40from tensorflow.python.layers import convolutional as convolutional_layers
41from tensorflow.python.layers import core as core_layers
42from tensorflow.python.layers import normalization as normalization_layers
43from tensorflow.python.layers import pooling as pooling_layers
44from tensorflow.python.ops import array_ops
45from tensorflow.python.ops import check_ops
46from tensorflow.python.ops import init_ops
47from tensorflow.python.ops import linalg_ops
48from tensorflow.python.ops import math_ops
49from tensorflow.python.ops import nn
50from tensorflow.python.ops import sparse_ops
51from tensorflow.python.ops import standard_ops
52from tensorflow.python.ops import variable_scope
53from tensorflow.python.ops import variables as tf_variables
54from tensorflow.python.training import moving_averages
55
56# TODO(b/28426988): Replace legacy_* fns migrated from slim.
57# TODO(b/28426988): Remove legacy_* when all uses have migrated to new API.
# Names exported by `from <this module> import *`. Listed one per line so
# that additions and removals show up as single-line diffs.
__all__ = [
    'avg_pool2d',
    'avg_pool3d',
    'batch_norm',
    'bias_add',
    'conv1d',
    'conv2d',
    'conv3d',
    'conv2d_in_plane',
    'conv2d_transpose',
    'conv3d_transpose',
    'convolution',
    'convolution1d',
    'convolution2d',
    'convolution2d_in_plane',
    'convolution2d_transpose',
    'convolution3d',
    'convolution3d_transpose',
    'dense_to_sparse',
    'dropout',
    'elu',
    'flatten',
    'fully_connected',
    'GDN',
    'gdn',
    'images_to_sequence',
    'layer_norm',
    'linear',
    'pool',
    'max_pool2d',
    'max_pool3d',
    'one_hot_encoding',
    'relu',
    'relu6',
    'repeat',
    'scale_gradient',
    'separable_conv2d',
    'separable_convolution2d',
    'sequence_to_images',
    'softmax',
    'spatial_softmax',
    'stack',
    'unit_norm',
    'legacy_fully_connected',
    'legacy_linear',
    'legacy_relu',
    'maxout',
]

# `data_format` strings accepted by the 4-D (2D spatial) layers.
DATA_FORMAT_NCHW = 'NCHW'
DATA_FORMAT_NHWC = 'NHWC'
# `data_format` strings accepted by the 5-D (3D spatial) layers.
DATA_FORMAT_NCDHW = 'NCDHW'
DATA_FORMAT_NDHWC = 'NDHWC'
75
76
@add_arg_scope
def avg_pool2d(inputs,
               kernel_size,
               stride=2,
               padding='VALID',
               data_format=DATA_FORMAT_NHWC,
               outputs_collections=None,
               scope=None):
  """Builds a 2D average pooling op on top of `AveragePooling2D`.

  Pooling is assumed to be applied per image, never across the batch or the
  channel dimension.

  Args:
    inputs: A 4-D tensor. Its layout is `[batch_size, height, width, channels]`
      when `data_format` is `NHWC` and `[batch_size, channels, height, width]`
      when `data_format` is `NCHW`.
    kernel_size: The pooling window as a length-2 list
      `[kernel_height, kernel_width]`, or a single int used for both.
    stride: The stride as a length-2 list `[stride_height, stride_width]`, or
      a single int used for both. Note that presently both strides must have
      the same value.
    padding: The padding method, either 'VALID' or 'SAME'.
    data_format: A string, either `NHWC` (default) or `NCHW`.
    outputs_collections: The collections to which the outputs are added.
    scope: Optional scope for name_scope.

  Returns:
    A `Tensor` holding the result of the pooling operation.

  Raises:
    ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
  """
  supported_formats = (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC)
  if data_format not in supported_formats:
    raise ValueError('data_format has to be either NCHW or NHWC.')

  with ops.name_scope(scope, 'AvgPool2D', [inputs]) as name_scope:
    input_tensor = ops.convert_to_tensor(inputs)
    # Translate the TF-style format string into the naming the layer class
    # expects.
    if data_format and data_format.startswith('NC'):
      channel_order = 'channels_first'
    else:
      channel_order = 'channels_last'
    pooling = pooling_layers.AveragePooling2D(
        pool_size=kernel_size,
        strides=stride,
        padding=padding,
        data_format=channel_order,
        _scope=name_scope)
    pooled = pooling.apply(input_tensor)
    return utils.collect_named_outputs(outputs_collections, name_scope, pooled)
124
125
@add_arg_scope
def avg_pool3d(inputs,
               kernel_size,
               stride=2,
               padding='VALID',
               data_format=DATA_FORMAT_NDHWC,
               outputs_collections=None,
               scope=None):
  """Builds a 3D average pooling op on top of `AveragePooling3D`.

  Pooling is assumed to be applied per image, never across the batch or the
  channel dimension.

  Args:
    inputs: A 5-D tensor. Its layout is
      `[batch_size, depth, height, width, channels]` when `data_format` is
      `NDHWC` and `[batch_size, channels, depth, height, width]` when
      `data_format` is `NCDHW`.
    kernel_size: The pooling window as a length-3 list
      `[kernel_depth, kernel_height, kernel_width]`, or a single int used for
      all three.
    stride: The stride as a length-3 list
      `[stride_depth, stride_height, stride_width]`, or a single int used for
      all three. Note that presently all strides must have the same value.
    padding: The padding method, either 'VALID' or 'SAME'.
    data_format: A string, either `NDHWC` (default) or `NCDHW`.
    outputs_collections: The collections to which the outputs are added.
    scope: Optional scope for name_scope.

  Returns:
    A `Tensor` holding the result of the pooling operation.

  Raises:
    ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
  """
  supported_formats = (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC)
  if data_format not in supported_formats:
    raise ValueError('data_format has to be either NCDHW or NDHWC.')

  with ops.name_scope(scope, 'AvgPool3D', [inputs]) as name_scope:
    input_tensor = ops.convert_to_tensor(inputs)
    # Translate the TF-style format string into the naming the layer class
    # expects.
    if data_format and data_format.startswith('NC'):
      channel_order = 'channels_first'
    else:
      channel_order = 'channels_last'
    pooling = pooling_layers.AveragePooling3D(
        pool_size=kernel_size,
        strides=stride,
        padding=padding,
        data_format=channel_order,
        _scope=name_scope)
    pooled = pooling.apply(input_tensor)
    return utils.collect_named_outputs(outputs_collections, name_scope, pooled)
173
174
def _fused_batch_norm(inputs,
                      decay=0.999,
                      center=True,
                      scale=False,
                      epsilon=0.001,
                      activation_fn=None,
                      param_initializers=None,
                      param_regularizers=None,
                      updates_collections=ops.GraphKeys.UPDATE_OPS,
                      is_training=True,
                      reuse=None,
                      variables_collections=None,
                      outputs_collections=None,
                      trainable=True,
                      data_format=DATA_FORMAT_NHWC,
                      zero_debias_moving_mean=False,
                      scope=None):
  """Batch Normalization layer built on `nn.fused_batch_norm`.

  Reference: "Batch Normalization: Accelerating Deep Network Training by
  Reducing Internal Covariate Shift", Sergey Ioffe, Christian Szegedy
  (http://arxiv.org/abs/1502.03167).

  Can be used as a normalizer function for conv2d and fully_connected.

  Note: when training, the moving_mean and moving_variance need to be updated.
  By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
  need to be added as a dependency to the `train_op`. For example:

  ```python
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss)
  ```

  Passing `updates_collections=None` forces the updates to run in place, at a
  possible speed cost, especially in distributed settings.

  Args:
    inputs: A tensor of rank 2 or 4 whose first dimension is `batch_size`.
      Rank-2 inputs are reshaped to rank 4 internally and reshaped back before
      returning. The channel dimension is the last one if `data_format` is
      `NHWC` and the second one if `data_format` is `NCHW`; it must be
      statically known.
    decay: Decay for the moving average. Reasonable values for `decay` are close
      to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc.
      Lower `decay` value (recommend trying `decay`=0.9) if model experiences
      reasonably good training performance but poor validation and/or test
      performance.
    center: If True, a `beta` offset variable is created and added to the
      normalized tensor. If False, a constant zero offset is used.
    scale: If True, a `gamma` scale variable is created and multiplied in. If
      False, a constant scale of one is used. When the next layer is linear
      (also e.g. `nn.relu`), this can be disabled since the scaling can be done
      by the next layer.
    epsilon: Small float added to variance to avoid dividing by zero.
    activation_fn: Activation function applied to the result; None (default)
      skips it and keeps a linear activation.
    param_initializers: Optional dict of initializers keyed by 'beta', 'gamma',
      'moving_mean' and 'moving_variance'.
    param_regularizers: Optional dict of regularizers keyed by 'beta' and
      'gamma'.
    updates_collections: Collections to collect the update ops for computation.
      The updates_ops need to be executed with the train_op.
      If None, a control dependency is added so the updates are computed in
      place.
    is_training: Whether or not the layer is in training mode. In training mode
      it accumulates the statistics of the moments into `moving_mean` and
      `moving_variance` using an exponential moving average with the given
      `decay`. Otherwise the values of `moving_mean` and `moving_variance` are
      used for normalization.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional collections for the variables.
    outputs_collections: Collections to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    data_format: A string. `NHWC` (default) and `NCHW` are supported.
    zero_debias_moving_mean: Use zero_debias for moving_mean.
    scope: Optional scope for `variable_scope`.

  Returns:
    A `Tensor` representing the output of the operation.

  Raises:
    ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
    ValueError: If the rank of `inputs` is undefined.
    ValueError: If the rank of `inputs` is neither 2 or 4.
    ValueError: If rank or `C` dimension of `inputs` is undefined.
  """
  if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
    raise ValueError('data_format has to be either NCHW or NHWC.')
  channels_first = data_format == DATA_FORMAT_NCHW

  with variable_scope.variable_scope(
      scope, 'BatchNorm', [inputs], reuse=reuse) as bn_scope:
    inputs = ops.convert_to_tensor(inputs)
    # Remember the tensor as it was passed in so that a rank-2 input can be
    # restored to its own (dynamic) shape at the end.
    unreshaped_inputs = inputs
    unreshaped_shape = inputs.get_shape()
    input_rank = unreshaped_shape.ndims
    if input_rank is None:
      raise ValueError('Inputs %s has undefined rank' % inputs.name)
    if input_rank not in [2, 4]:
      raise ValueError('Inputs %s has unsupported rank.'
                       ' Expected 2 or 4 but got %d' % (inputs.name,
                                                        input_rank))

    if input_rank == 2:
      # Expand [batch, C] to rank 4 with singleton spatial dimensions, placing
      # the channels where `data_format` expects them.
      num_channels = unreshaped_shape.dims[-1].value
      if num_channels is None:
        raise ValueError('`C` dimension must be known but is None')
      if channels_first:
        rank4_shape = [-1, num_channels, 1, 1]
      else:
        rank4_shape = [-1, 1, 1, num_channels]
      inputs = array_ops.reshape(inputs, rank4_shape)

    inputs_shape = inputs.get_shape()
    params_shape = inputs_shape[1:2] if channels_first else inputs_shape[-1:]
    if not params_shape.is_fully_defined():
      raise ValueError('Inputs %s has undefined `C` dimension %s.' %
                       (inputs.name, params_shape))

    # Allocate parameters for the beta and gamma of the normalization.
    beta_collections = utils.get_variable_collections(variables_collections,
                                                      'beta')
    # Float32 required to avoid precision-loss when using fp16 input/output
    param_dtype = dtypes.float32
    param_initializers = param_initializers or {}
    param_regularizers = param_regularizers or {}
    beta_regularizer = param_regularizers.get('beta')
    gamma_regularizer = param_regularizers.get('gamma')

    if not center:
      beta = array_ops.constant(0.0, dtype=param_dtype, shape=params_shape)
    else:
      beta = variables.model_variable(
          'beta',
          shape=params_shape,
          dtype=param_dtype,
          initializer=param_initializers.get('beta',
                                             init_ops.zeros_initializer()),
          regularizer=beta_regularizer,
          collections=beta_collections,
          trainable=trainable)

    if not scale:
      gamma = array_ops.constant(1.0, dtype=param_dtype, shape=params_shape)
    else:
      gamma_collections = utils.get_variable_collections(
          variables_collections, 'gamma')
      gamma = variables.model_variable(
          'gamma',
          shape=params_shape,
          dtype=param_dtype,
          initializer=param_initializers.get('gamma',
                                             init_ops.ones_initializer()),
          regularizer=gamma_regularizer,
          collections=gamma_collections,
          trainable=trainable)

    def _moving_statistic(name, default_initializer_factory):
      """Creates a non-trainable moving-statistic variable called `name`."""
      stat_collections = utils.get_variable_collections(
          variables_collections, name)
      stat_initializer = param_initializers.get(
          name, default_initializer_factory())
      return variables.model_variable(
          name,
          shape=params_shape,
          dtype=param_dtype,
          initializer=stat_initializer,
          trainable=False,
          collections=stat_collections)

    # Create moving_mean and moving_variance variables and add them to the
    # appropriate collections. We disable variable partitioning while creating
    # them, because assign_moving_average is not yet supported for partitioned
    # variables (this needs to be handled carefully, as it may break
    # the checkpoint backward compatibility).
    with variable_scope.variable_scope(
        variable_scope.get_variable_scope()) as unpartitioned_scope:
      unpartitioned_scope.set_partitioner(None)
      moving_mean = _moving_statistic('moving_mean',
                                      init_ops.zeros_initializer)
      moving_variance = _moving_statistic('moving_variance',
                                          init_ops.ones_initializer)

    def _normalize_with_batch_stats():
      """Training branch: normalizes with the statistics of this batch."""
      return nn.fused_batch_norm(
          inputs, gamma, beta, epsilon=epsilon, data_format=data_format)

    def _normalize_with_moving_stats():
      """Inference branch: normalizes with the stored moving statistics."""
      return nn.fused_batch_norm(
          inputs,
          gamma,
          beta,
          mean=moving_mean,
          variance=moving_variance,
          epsilon=epsilon,
          is_training=False,
          data_format=data_format)

    normalized, batch_mean, batch_variance = utils.smart_cond(
        is_training, _normalize_with_batch_stats, _normalize_with_moving_stats)

    def _moving_average_updates():
      """Builds the ops that fold the batch moments into the moving ones."""
      mean_update = moving_averages.assign_moving_average(
          moving_mean, batch_mean, decay, zero_debias=zero_debias_moving_mean)
      variance_update = moving_averages.assign_moving_average(
          moving_variance, batch_variance, decay, zero_debias=False)
      return mean_update, variance_update

    outputs = normalized
    # If `is_training` doesn't have a constant value, because it is a `Tensor`,
    # a `Variable` or `Placeholder`, the static value is None and the update
    # ops must still be built.
    static_is_training = utils.constant_value(is_training)
    if static_is_training is None or static_is_training:
      if updates_collections is None:

        def _apply_updates_in_place():
          """Returns the output gated on the moving-average updates."""
          with ops.control_dependencies(list(_moving_average_updates())):
            return array_ops.identity(normalized)

        outputs = utils.smart_cond(is_training, _apply_updates_in_place,
                                   lambda: normalized)
      else:
        # Defer the updates: hand them to the requested collections. When not
        # training, the untouched moving variables are collected instead.
        update_mean, update_variance = utils.smart_cond(
            is_training, _moving_average_updates,
            lambda: (moving_mean, moving_variance))
        ops.add_to_collections(updates_collections, update_mean)
        ops.add_to_collections(updates_collections, update_variance)

    outputs.set_shape(inputs_shape)
    if input_rank == 2:
      # Undo the rank-4 expansion performed on entry.
      outputs = array_ops.reshape(outputs,
                                  array_ops.shape(unreshaped_inputs))
    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, bn_scope.name,
                                       outputs)
427
428
429@add_arg_scope
430def batch_norm(inputs,
431               decay=0.999,
432               center=True,
433               scale=False,
434               epsilon=0.001,
435               activation_fn=None,
436               param_initializers=None,
437               param_regularizers=None,
438               updates_collections=ops.GraphKeys.UPDATE_OPS,
439               is_training=True,
440               reuse=None,
441               variables_collections=None,
442               outputs_collections=None,
443               trainable=True,
444               batch_weights=None,
445               fused=None,
446               data_format=DATA_FORMAT_NHWC,
447               zero_debias_moving_mean=False,
448               scope=None,
449               renorm=False,
450               renorm_clipping=None,
451               renorm_decay=0.99,
452               adjustment=None):
453  """Adds a Batch Normalization layer from http://arxiv.org/abs/1502.03167.
454
455    "Batch Normalization: Accelerating Deep Network Training by Reducing
456    Internal Covariate Shift"
457
458    Sergey Ioffe, Christian Szegedy
459
460  Can be used as a normalizer function for conv2d and fully_connected. The
461  normalization is over all but the last dimension if `data_format` is `NHWC`
462  and all but the second dimension if `data_format` is `NCHW`.  In case of a 2D
463  tensor this corresponds to the batch dimension, while in case of a 4D tensor
464  this
465  corresponds to the batch and space dimensions.
466
467  Note: when training, the moving_mean and moving_variance need to be updated.
468  By default the update ops are placed in `tf.GraphKeys.UPDATE_OPS`, so they
469  need to be added as a dependency to the `train_op`. For example:
470
471  ```python
472    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
473    with tf.control_dependencies(update_ops):
474      train_op = optimizer.minimize(loss)
475  ```
476
477  One can set updates_collections=None to force the updates in place, but that
478  can have a speed penalty, especially in distributed settings.
479
480  Args:
481    inputs: A tensor with 2 or more dimensions, where the first dimension has
482      `batch_size`. The normalization is over all but the last dimension if
483      `data_format` is `NHWC` and the second dimension if `data_format` is
484      `NCHW`.
485    decay: Decay for the moving average. Reasonable values for `decay` are close
486      to 1.0, typically in the multiple-nines range: 0.999, 0.99, 0.9, etc.
487      Lower `decay` value (recommend trying `decay`=0.9) if model experiences
488      reasonably good training performance but poor validation and/or test
489      performance. Try zero_debias_moving_mean=True for improved stability.
490    center: If True, add offset of `beta` to normalized tensor. If False, `beta`
491      is ignored.
492    scale: If True, multiply by `gamma`. If False, `gamma` is
493      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
494      disabled since the scaling can be done by the next layer.
495    epsilon: Small float added to variance to avoid dividing by zero.
496    activation_fn: Activation function, default set to None to skip it and
497      maintain a linear activation.
498    param_initializers: Optional initializers for beta, gamma, moving mean and
499      moving variance.
500    param_regularizers: Optional regularizer for beta and gamma.
501    updates_collections: Collections to collect the update ops for computation.
502      The updates_ops need to be executed with the train_op.
503      If None, a control dependency would be added to make sure the updates are
504      computed in place.
505    is_training: Whether or not the layer is in training mode. In training mode
506      it would accumulate the statistics of the moments into `moving_mean` and
507      `moving_variance` using an exponential moving average with the given
508      `decay`. When it is not in training mode then it would use the values of
509      the `moving_mean` and the `moving_variance`.
510    reuse: Whether or not the layer and its variables should be reused. To be
511      able to reuse the layer scope must be given.
512    variables_collections: Optional collections for the variables.
513    outputs_collections: Collections to add the outputs.
514    trainable: If `True` also add variables to the graph collection
515      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
516    batch_weights: An optional tensor of shape `[batch_size]`,
517      containing a frequency weight for each batch item. If present,
518      then the batch normalization uses weighted mean and
519      variance. (This can be used to correct for bias in training
520      example selection.)
521    fused: if `None` or `True`, use a faster, fused implementation if possible.
522      If `False`, use the system recommended implementation.
523    data_format: A string. `NHWC` (default) and `NCHW` are supported.
524    zero_debias_moving_mean: Use zero_debias for moving_mean. It creates a new
525      pair of variables 'moving_mean/biased' and 'moving_mean/local_step'.
526    scope: Optional scope for `variable_scope`.
527    renorm: Whether to use Batch Renormalization
528      (https://arxiv.org/abs/1702.03275). This adds extra variables during
529      training. The inference is the same for either value of this parameter.
530    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
531      scalar `Tensors` used to clip the renorm correction. The correction
532      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
533      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
534      dmax are set to inf, 0, inf, respectively.
535    renorm_decay: Momentum used to update the moving means and standard
536      deviations with renorm. Unlike `momentum`, this affects training
537      and should be neither too small (which would add noise) nor too large
538      (which would give stale estimates). Note that `decay` is still applied
539      to get the means and variances for inference.
540    adjustment: A function taking the `Tensor` containing the (dynamic) shape of
541      the input tensor and returning a pair (scale, bias) to apply to the
542      normalized values (before gamma and beta), only during training. For
543      example,
544        `adjustment = lambda shape: (
545          tf.random_uniform(shape[-1:], 0.93, 1.07),
546          tf.random_uniform(shape[-1:], -0.1, 0.1))`
547      will scale the normalized value by up to 7% up or down, then shift the
548      result by up to 0.1 (with independent scaling and bias for each feature
549      but shared across all examples), and finally apply gamma and/or beta. If
550      `None`, no adjustment is applied.
551
552  Returns:
553    A `Tensor` representing the output of the operation.
554
555  Raises:
556    ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
557    ValueError: If the rank of `inputs` is undefined.
558    ValueError: If rank or channels dimension of `inputs` is undefined.
559  """
560  if fused is None:
561    fused = True
562
563  # Only use _fused_batch_norm if all of the following three
564  # conditions are true:
565  # (1) fused is set True;
566  # (2) it is possible to use (currently it doesn't support batch weights,
567  #   renorm, and the case when rank is neither 2 nor 4);
568  # (3) it is used with zero_debias_moving_mean, or an input shape of rank 2,
569  #   or non-default updates_collections (not implemented in
570  #   normalization_layers.BatchNormalization yet); otherwise use the fused
571  #   implementation in normalization_layers.BatchNormalization.
572  inputs = ops.convert_to_tensor(inputs)
573  rank = inputs.get_shape().ndims
574  possible_to_fuse = (
575      batch_weights is None and not renorm and rank in [2, 4] and
576      adjustment is None)
577  if fused and possible_to_fuse and (
578      zero_debias_moving_mean or rank == 2 or
579      updates_collections is not ops.GraphKeys.UPDATE_OPS):
580    return _fused_batch_norm(
581        inputs,
582        decay=decay,
583        center=center,
584        scale=scale,
585        epsilon=epsilon,
586        activation_fn=activation_fn,
587        param_initializers=param_initializers,
588        param_regularizers=param_regularizers,
589        updates_collections=updates_collections,
590        is_training=is_training,
591        reuse=reuse,
592        variables_collections=variables_collections,
593        outputs_collections=outputs_collections,
594        trainable=trainable,
595        data_format=data_format,
596        zero_debias_moving_mean=zero_debias_moving_mean,
597        scope=scope)
598
599  if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
600    raise ValueError('data_format has to be either NCHW or NHWC.')
601
602  layer_variable_getter = _build_variable_getter()
603  with variable_scope.variable_scope(
604      scope,
605      'BatchNorm', [inputs],
606      reuse=reuse,
607      custom_getter=layer_variable_getter) as sc:
608    inputs = ops.convert_to_tensor(inputs)
609
610    # Determine whether we can use the core layer class.
611    if (batch_weights is None and
612        updates_collections is ops.GraphKeys.UPDATE_OPS and
613        not zero_debias_moving_mean):
614      # Use the core layer class.
615      axis = 1 if data_format == DATA_FORMAT_NCHW else -1
616      if not param_initializers:
617        param_initializers = {}
618      beta_initializer = param_initializers.get('beta',
619                                                init_ops.zeros_initializer())
620      gamma_initializer = param_initializers.get('gamma',
621                                                 init_ops.ones_initializer())
622      moving_mean_initializer = param_initializers.get(
623          'moving_mean', init_ops.zeros_initializer())
624      moving_variance_initializer = param_initializers.get(
625          'moving_variance', init_ops.ones_initializer())
626      if not param_regularizers:
627        param_regularizers = {}
628      beta_regularizer = param_regularizers.get('beta')
629      gamma_regularizer = param_regularizers.get('gamma')
630      layer = normalization_layers.BatchNormalization(
631          axis=axis,
632          momentum=decay,
633          epsilon=epsilon,
634          center=center,
635          scale=scale,
636          beta_initializer=beta_initializer,
637          gamma_initializer=gamma_initializer,
638          moving_mean_initializer=moving_mean_initializer,
639          moving_variance_initializer=moving_variance_initializer,
640          beta_regularizer=beta_regularizer,
641          gamma_regularizer=gamma_regularizer,
642          trainable=trainable,
643          renorm=renorm,
644          renorm_clipping=renorm_clipping,
645          renorm_momentum=renorm_decay,
646          adjustment=adjustment,
647          name=sc.name,
648          _scope=sc,
649          _reuse=reuse,
650          fused=fused)
651      outputs = layer.apply(inputs, training=is_training)
652
653      # Add variables to collections.
654      _add_variable_to_collections(layer.moving_mean, variables_collections,
655                                   'moving_mean')
656      _add_variable_to_collections(layer.moving_variance, variables_collections,
657                                   'moving_variance')
658      if layer.beta is not None:
659        _add_variable_to_collections(layer.beta, variables_collections, 'beta')
660      if layer.gamma is not None:
661        _add_variable_to_collections(layer.gamma, variables_collections,
662                                     'gamma')
663
664      if activation_fn is not None:
665        outputs = activation_fn(outputs)
666      return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
667
668    # Not supported by layer class: batch_weights argument,
669    # and custom updates_collections. In that case, use the legacy BN
670    # implementation.
671    # Custom updates collections are not supported because the update logic
672    # is different in this case, in particular w.r.t. "forced updates" and
673    # update op reuse.
674    if renorm:
675      raise ValueError('renorm is not supported with batch_weights, '
676                       'updates_collections or zero_debias_moving_mean')
677    inputs_shape = inputs.get_shape()
678    inputs_rank = inputs_shape.ndims
679    if inputs_rank is None:
680      raise ValueError('Inputs %s has undefined rank.' % inputs.name)
681    dtype = inputs.dtype.base_dtype
682    if batch_weights is not None:
683      batch_weights = ops.convert_to_tensor(batch_weights)
684      inputs_shape[0:1].assert_is_compatible_with(batch_weights.get_shape())
685      # Reshape batch weight values so they broadcast across inputs.
686      nshape = [-1] + [1 for _ in range(inputs_rank - 1)]
687      batch_weights = array_ops.reshape(batch_weights, nshape)
688
689    if data_format == DATA_FORMAT_NCHW:
690      moments_axes = [0] + list(range(2, inputs_rank))
691      params_shape = inputs_shape[1:2]
692      # For NCHW format, rather than relying on implicit broadcasting, we
693      # explicitly reshape the params to params_shape_broadcast when computing
694      # the moments and the batch normalization.
695      params_shape_broadcast = list(
696          [1, inputs_shape.dims[1].value] + [1 for _ in range(2, inputs_rank)])
697    else:
698      moments_axes = list(range(inputs_rank - 1))
699      params_shape = inputs_shape[-1:]
700      params_shape_broadcast = None
701    if not params_shape.is_fully_defined():
702      raise ValueError('Inputs %s has undefined channels dimension %s.' %
703                       (inputs.name, params_shape))
704
705    # Allocate parameters for the beta and gamma of the normalization.
706    beta, gamma = None, None
707    if not param_initializers:
708      param_initializers = {}
709    if center:
710      beta_collections = utils.get_variable_collections(variables_collections,
711                                                        'beta')
712      beta_initializer = param_initializers.get('beta',
713                                                init_ops.zeros_initializer())
714      beta = variables.model_variable(
715          'beta',
716          shape=params_shape,
717          dtype=dtype,
718          initializer=beta_initializer,
719          collections=beta_collections,
720          trainable=trainable)
721    if scale:
722      gamma_collections = utils.get_variable_collections(
723          variables_collections, 'gamma')
724      gamma_initializer = param_initializers.get('gamma',
725                                                 init_ops.ones_initializer())
726      gamma = variables.model_variable(
727          'gamma',
728          shape=params_shape,
729          dtype=dtype,
730          initializer=gamma_initializer,
731          collections=gamma_collections,
732          trainable=trainable)
733
734    # Create moving_mean and moving_variance variables and add them to the
735    # appropriate collections. We disable variable partitioning while creating
736    # them, because assign_moving_average is not yet supported for partitioned
737    # variables (this needs to be handled carefully, as it may break
738    # the checkpoint backward compatibility).
739    with variable_scope.variable_scope(
740        variable_scope.get_variable_scope()) as local_scope:
741      local_scope.set_partitioner(None)
742      moving_mean_collections = utils.get_variable_collections(
743          variables_collections, 'moving_mean')
744      moving_mean_initializer = param_initializers.get(
745          'moving_mean', init_ops.zeros_initializer())
746      moving_mean = variables.model_variable(
747          'moving_mean',
748          shape=params_shape,
749          dtype=dtype,
750          initializer=moving_mean_initializer,
751          trainable=False,
752          collections=moving_mean_collections)
753      moving_variance_collections = utils.get_variable_collections(
754          variables_collections, 'moving_variance')
755      moving_variance_initializer = param_initializers.get(
756          'moving_variance', init_ops.ones_initializer())
757      moving_variance = variables.model_variable(
758          'moving_variance',
759          shape=params_shape,
760          dtype=dtype,
761          initializer=moving_variance_initializer,
762          trainable=False,
763          collections=moving_variance_collections)
764
765    # If `is_training` doesn't have a constant value, because it is a `Tensor`,
766    # a `Variable` or `Placeholder` then is_training_value will be None and
767    # `needs_moments` will be true.
768    is_training_value = utils.constant_value(is_training)
769    need_moments = is_training_value is None or is_training_value
770    if need_moments:
771      # Calculate the moments based on the individual batch.
772      if batch_weights is None:
773        if data_format == DATA_FORMAT_NCHW:
774          mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)
775          mean = array_ops.reshape(mean, [-1])
776          variance = array_ops.reshape(variance, [-1])
777        else:
778          mean, variance = nn.moments(inputs, moments_axes)
779      else:
780        if data_format == DATA_FORMAT_NCHW:
781          mean, variance = nn.weighted_moments(
782              inputs, moments_axes, batch_weights, keepdims=True)
783          mean = array_ops.reshape(mean, [-1])
784          variance = array_ops.reshape(variance, [-1])
785        else:
786          mean, variance = nn.weighted_moments(inputs, moments_axes,
787                                               batch_weights)
788
789      moving_vars_fn = lambda: (moving_mean, moving_variance)
790      if updates_collections is None:
791
792        def _force_updates():
793          """Internal function forces updates moving_vars if is_training."""
794          update_moving_mean = moving_averages.assign_moving_average(
795              moving_mean, mean, decay, zero_debias=zero_debias_moving_mean)
796          update_moving_variance = moving_averages.assign_moving_average(
797              moving_variance, variance, decay, zero_debias=False)
798          with ops.control_dependencies(
799              [update_moving_mean, update_moving_variance]):
800            return array_ops.identity(mean), array_ops.identity(variance)
801
802        mean, variance = utils.smart_cond(is_training, _force_updates,
803                                          moving_vars_fn)
804      else:
805
806        def _delay_updates():
807          """Internal function that delay updates moving_vars if is_training."""
808          update_moving_mean = moving_averages.assign_moving_average(
809              moving_mean, mean, decay, zero_debias=zero_debias_moving_mean)
810          update_moving_variance = moving_averages.assign_moving_average(
811              moving_variance, variance, decay, zero_debias=False)
812          return update_moving_mean, update_moving_variance
813
814        update_mean, update_variance = utils.smart_cond(
815            is_training, _delay_updates, moving_vars_fn)
816        ops.add_to_collections(updates_collections, update_mean)
817        ops.add_to_collections(updates_collections, update_variance)
818        # Use computed moments during training and moving_vars otherwise.
819        vars_fn = lambda: (mean, variance)
820        mean, variance = utils.smart_cond(is_training, vars_fn, moving_vars_fn)
821    else:
822      mean, variance = moving_mean, moving_variance
823    if data_format == DATA_FORMAT_NCHW:
824      mean = array_ops.reshape(mean, params_shape_broadcast)
825      variance = array_ops.reshape(variance, params_shape_broadcast)
826      if beta is not None:
827        beta = array_ops.reshape(beta, params_shape_broadcast)
828      if gamma is not None:
829        gamma = array_ops.reshape(gamma, params_shape_broadcast)
830
831    # Compute batch_normalization.
832    outputs = nn.batch_normalization(inputs, mean, variance, beta, gamma,
833                                     epsilon)
834    outputs.set_shape(inputs_shape)
835    if activation_fn is not None:
836      outputs = activation_fn(outputs)
837    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
838
839
@add_arg_scope
def bias_add(
    inputs,
    activation_fn=None,
    initializer=init_ops.zeros_initializer(),
    regularizer=None,
    reuse=None,
    variables_collections=None,
    outputs_collections=None,
    trainable=True,
    data_format=DATA_FORMAT_NHWC,
    scope=None):
  """Adds a per-channel bias to `inputs`.

  A 1-D `biases` variable with one entry per channel is created (or reused)
  and added to `inputs` via `nn.bias_add`. When `activation_fn` is given it is
  applied to the sum.

  Can be used as a normalizer function for conv2d and fully_connected.

  Args:
    inputs: A tensor with at least rank 2 and a statically known channel
      dimension, e.g. `[batch_size, depth]`, `[None, None, None, depth]`. The
      channel dimension is the last one for `NHWC` and dimension 1 for `NCHW`.
    activation_fn: Activation function. Defaults to None, which skips it and
      keeps the output linear.
    initializer: An initializer for the bias, defaults to 0.
    regularizer: A regularizer like the result of
      `l1_regularizer` or `l2_regularizer`.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer, `scope` must be given.
    variables_collections: Optional collections for the variables.
    outputs_collections: Collections to add the outputs to.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    data_format: A string. 'NHWC' and 'NCHW' are supported.
    scope: Optional scope for variable_scope.

  Returns:
    A tensor representing the result of adding biases to the inputs.

  Raises:
    ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
    ValueError: If the rank of `inputs` is undefined.
    ValueError: If `data_format` is `NCHW` and the rank of `inputs` is not 4.
    ValueError: If the `C` dimension of `inputs` is undefined.
  """
  if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
    raise ValueError('data_format has to be either NCHW or NHWC.')

  with variable_scope.variable_scope(
      scope, 'BiasAdd', [inputs], reuse=reuse) as bias_scope:
    inputs = ops.convert_to_tensor(inputs)
    static_shape = inputs.get_shape()
    rank = static_shape.ndims

    # Validate the static shape before touching any dimension.
    if rank is None:
      raise ValueError('Dims of shape must be known but is None')
    if data_format == DATA_FORMAT_NCHW and rank != 4:
      raise ValueError('Data format NCHW only supports 4D Tensor')

    # Channels sit right after the batch dimension for NCHW, last otherwise.
    if data_format == DATA_FORMAT_NCHW:
      channel_axis = 1
    else:
      channel_axis = -1
    depth = static_shape.dims[channel_axis].value
    if depth is None:
      raise ValueError('`C` dimension must be known but is None')

    bias_dtype = inputs.dtype.base_dtype
    bias_collections = utils.get_variable_collections(
        variables_collections, 'biases')
    bias_var = variables.model_variable(
        'biases',
        shape=[depth],
        dtype=bias_dtype,
        initializer=initializer,
        regularizer=regularizer,
        collections=bias_collections,
        trainable=trainable)

    result = nn.bias_add(inputs, bias_var, data_format=data_format)
    if activation_fn is not None:
      result = activation_fn(result)
    return utils.collect_named_outputs(
        outputs_collections, bias_scope.name, result)
913
914
915# TODO(jbms): change `rate` parameter to `dilation_rate` for consistency with
916# underlying op.
@add_arg_scope
def convolution(inputs,
                num_outputs,
                kernel_size,
                stride=1,
                padding='SAME',
                data_format=None,
                rate=1,
                activation_fn=nn.relu,
                normalizer_fn=None,
                normalizer_params=None,
                weights_initializer=initializers.xavier_initializer(),
                weights_regularizer=None,
                biases_initializer=init_ops.zeros_initializer(),
                biases_regularizer=None,
                reuse=None,
                variables_collections=None,
                outputs_collections=None,
                trainable=True,
                scope=None,
                conv_dims=None):
  """Adds an N-D convolution followed by an optional batch_norm layer.

  It is required that 1 <= N <= 3.

  `convolution` creates a variable called `weights`, representing the
  convolutional kernel, that is convolved (actually cross-correlated) with the
  `inputs` to produce a `Tensor` of activations. If a `normalizer_fn` is
  provided (such as `batch_norm`), it is then applied. Otherwise, if
  `normalizer_fn` is None and a `biases_initializer` is provided then a `biases`
  variable would be created and added to the activations. Finally, if
  `activation_fn` is not `None`, it is applied to the activations as well.

  Performs atrous convolution with input stride/dilation rate equal to `rate`
  if a value > 1 for any dimension of `rate` is specified.  In this case
  `stride` values != 1 are not supported.

  Args:
    inputs: A Tensor of rank N+2 of shape
      `[batch_size] + input_spatial_shape + [in_channels]` if data_format does
      not start with "NC" (default), or
      `[batch_size, in_channels] + input_spatial_shape` if data_format starts
      with "NC".
    num_outputs: Integer, the number of output filters.
    kernel_size: A sequence of N positive integers specifying the spatial
      dimensions of the filters.  Can be a single integer to specify the same
      value for all spatial dimensions.
    stride: A sequence of N positive integers specifying the stride at which to
      compute output.  Can be a single integer to specify the same value for all
      spatial dimensions.  Specifying any `stride` value != 1 is incompatible
      with specifying any `rate` value != 1.
    padding: One of `"VALID"` or `"SAME"`.
    data_format: A string or None.  Specifies whether the channel dimension of
      the `input` and output is the last dimension (default, or if `data_format`
      does not start with "NC"), or the second dimension (if `data_format`
      starts with "NC").  For N=1, the valid values are "NWC" (default) and
      "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
      For N=3, the valid values are "NDHWC" (default) and "NCDHW".
    rate: A sequence of N positive integers specifying the dilation rate to use
      for atrous convolution.  Can be a single integer to specify the same
      value for all spatial dimensions.  Specifying any `rate` value != 1 is
      incompatible with specifying any `stride` value != 1.
    activation_fn: Activation function. The default value is a ReLU function.
      Explicitly set it to None to skip it and maintain a linear activation.
    normalizer_fn: Normalization function to use instead of `biases`. If
      `normalizer_fn` is provided then `biases_initializer` and
      `biases_regularizer` are ignored and `biases` are not created nor added.
      Defaults to None for no normalizer function.
    normalizer_params: Normalization function parameters.
    weights_initializer: An initializer for the weights.
    weights_regularizer: Optional regularizer for the weights.
    biases_initializer: An initializer for the biases. If None skip biases.
    biases_regularizer: Optional regularizer for the biases.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional list of collections for all the variables or
      a dictionary containing a different list of collection per variable.
    outputs_collections: Collection to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for `variable_scope`.
    conv_dims: Optional convolution dimensionality. When set, the input rank
      must equal `conv_dims + 2` (e.g. 2 for Conv 2D, 3 for Conv 3D, ..). When
      left as None the convolution dimensionality is selected from the input
      rank (i.e. Conv ND, with N = input_rank - 2).

  Returns:
    A tensor representing the output of the operation.

  Raises:
    ValueError: If `data_format` is invalid.
    ValueError: If `conv_dims` is given and the rank of `inputs` is not
      `conv_dims + 2`.
    ValueError: If the rank of `inputs` is not 3, 4 or 5 (including unknown).
    ValueError: Both 'rate' and `stride` are not uniformly 1.
  """
  if data_format not in [None, 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC', 'NCDHW']:
    raise ValueError('Invalid data_format: %r' % (data_format,))

  # The layer classes name their variables `kernel`/`bias`; expose them under
  # the names `weights`/`biases` used by this module.
  layer_variable_getter = _build_variable_getter({
      'bias': 'biases',
      'kernel': 'weights'
  })

  with variable_scope.variable_scope(
      scope, 'Conv', [inputs], reuse=reuse,
      custom_getter=layer_variable_getter) as sc:
    inputs = ops.convert_to_tensor(inputs)
    input_rank = inputs.get_shape().ndims

    if conv_dims is not None and conv_dims + 2 != input_rank:
      # `input_rank` is None for inputs of unknown rank, so it is formatted
      # with %s: %d would raise TypeError instead of the intended ValueError.
      raise ValueError('Convolution expects input with rank %d, got %s' %
                       (conv_dims + 2, input_rank))
    if input_rank == 3:
      layer_class = convolutional_layers.Convolution1D
    elif input_rank == 4:
      layer_class = convolutional_layers.Convolution2D
    elif input_rank == 5:
      layer_class = convolutional_layers.Convolution3D
    else:
      # Format the rank into the message rather than passing it as a second
      # exception argument, which would render as a tuple.
      raise ValueError(
          'Convolution not supported for input with rank %s' % input_rank)

    df = ('channels_first'
          if data_format and data_format.startswith('NC') else 'channels_last')
    layer = layer_class(
        filters=num_outputs,
        kernel_size=kernel_size,
        strides=stride,
        padding=padding,
        data_format=df,
        dilation_rate=rate,
        activation=None,
        # Biases are only used when no normalizer replaces them and an
        # initializer was supplied; hand the layer a real boolean rather than
        # the initializer object itself.
        use_bias=bool(not normalizer_fn and biases_initializer),
        kernel_initializer=weights_initializer,
        bias_initializer=biases_initializer,
        kernel_regularizer=weights_regularizer,
        bias_regularizer=biases_regularizer,
        activity_regularizer=None,
        trainable=trainable,
        name=sc.name,
        dtype=inputs.dtype.base_dtype,
        _scope=sc,
        _reuse=reuse)
    outputs = layer.apply(inputs)

    # Add variables to collections.
    _add_variable_to_collections(layer.kernel, variables_collections, 'weights')
    if layer.use_bias:
      _add_variable_to_collections(layer.bias, variables_collections, 'biases')

    if normalizer_fn is not None:
      normalizer_params = normalizer_params or {}
      outputs = normalizer_fn(outputs, **normalizer_params)

    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
1072
@add_arg_scope
def convolution1d(
    inputs,
    num_outputs,
    kernel_size,
    stride=1,
    padding='SAME',
    data_format=None,
    rate=1,
    activation_fn=nn.relu,
    normalizer_fn=None,
    normalizer_params=None,
    weights_initializer=initializers.xavier_initializer(),
    weights_regularizer=None,
    biases_initializer=init_ops.zeros_initializer(),
    biases_regularizer=None,
    reuse=None,
    variables_collections=None,
    outputs_collections=None,
    trainable=True,
    scope=None):
  # 1-D entry point: every argument is forwarded positionally, in declaration
  # order, to `convolution`, with `conv_dims` pinned to 1 so that inputs whose
  # rank is not 3 are rejected there.
  return convolution(
      inputs, num_outputs, kernel_size, stride, padding, data_format, rate,
      activation_fn, normalizer_fn, normalizer_params, weights_initializer,
      weights_regularizer, biases_initializer, biases_regularizer, reuse,
      variables_collections, outputs_collections, trainable, scope,
      conv_dims=1)

# The wrapper shares the documentation of the generic N-D implementation.
convolution1d.__doc__ = convolution.__doc__
1115
@add_arg_scope
def convolution2d(
    inputs,
    num_outputs,
    kernel_size,
    stride=1,
    padding='SAME',
    data_format=None,
    rate=1,
    activation_fn=nn.relu,
    normalizer_fn=None,
    normalizer_params=None,
    weights_initializer=initializers.xavier_initializer(),
    weights_regularizer=None,
    biases_initializer=init_ops.zeros_initializer(),
    biases_regularizer=None,
    reuse=None,
    variables_collections=None,
    outputs_collections=None,
    trainable=True,
    scope=None):
  # 2-D entry point: every argument is forwarded positionally, in declaration
  # order, to `convolution`, with `conv_dims` pinned to 2 so that inputs whose
  # rank is not 4 are rejected there.
  return convolution(
      inputs, num_outputs, kernel_size, stride, padding, data_format, rate,
      activation_fn, normalizer_fn, normalizer_params, weights_initializer,
      weights_regularizer, biases_initializer, biases_regularizer, reuse,
      variables_collections, outputs_collections, trainable, scope,
      conv_dims=2)

# The wrapper shares the documentation of the generic N-D implementation.
convolution2d.__doc__ = convolution.__doc__
1158
@add_arg_scope
def convolution3d(
    inputs,
    num_outputs,
    kernel_size,
    stride=1,
    padding='SAME',
    data_format=None,
    rate=1,
    activation_fn=nn.relu,
    normalizer_fn=None,
    normalizer_params=None,
    weights_initializer=initializers.xavier_initializer(),
    weights_regularizer=None,
    biases_initializer=init_ops.zeros_initializer(),
    biases_regularizer=None,
    reuse=None,
    variables_collections=None,
    outputs_collections=None,
    trainable=True,
    scope=None):
  # 3-D entry point: every argument is forwarded positionally, in declaration
  # order, to `convolution`, with `conv_dims` pinned to 3 so that inputs whose
  # rank is not 5 are rejected there.
  return convolution(
      inputs, num_outputs, kernel_size, stride, padding, data_format, rate,
      activation_fn, normalizer_fn, normalizer_params, weights_initializer,
      weights_regularizer, biases_initializer, biases_regularizer, reuse,
      variables_collections, outputs_collections, trainable, scope,
      conv_dims=3)

# The wrapper shares the documentation of the generic N-D implementation.
convolution3d.__doc__ = convolution.__doc__
1201
@add_arg_scope
def convolution2d_in_plane(
    inputs,
    kernel_size,
    stride=1,
    padding='SAME',
    activation_fn=nn.relu,
    normalizer_fn=None,
    normalizer_params=None,
    weights_initializer=initializers.xavier_initializer(),
    weights_regularizer=None,
    biases_initializer=init_ops.zeros_initializer(),
    biases_regularizer=None,
    reuse=None,
    variables_collections=None,
    outputs_collections=None,
    trainable=True,
    scope=None):
  """Performs the same in-plane convolution to each channel independently.

  A single `weights` variable of shape `[kernel_height, kernel_width, 1, 1]` is
  created and tiled across the input channels, so every channel is filtered
  with the same kernel through a depthwise convolution.

  This is useful for performing various simple channel-independent convolution
  operations such as image gradients. Note that there is no `kernel` argument:
  the filter values come from `weights_initializer`, e.g.:

    image = tf.constant(..., shape=(16, 240, 320, 3))
    vert_gradients = layers.conv2d_in_plane(
        image,
        kernel_size=[2, 1],
        weights_initializer=tf.constant_initializer([1, -1]))
    horz_gradients = layers.conv2d_in_plane(
        image,
        kernel_size=[1, 2],
        weights_initializer=tf.constant_initializer([1, -1]))

  Args:
    inputs: A 4-D tensor with dimensions [batch_size, height, width, channels].
    kernel_size: A list of length 2 holding the [kernel_height, kernel_width]
      of the filter. Can be an int if both values are the same.
    stride: A list of length 2 `[stride_height, stride_width]`.
      Can be an int if both strides are the same. Note that presently
      both strides must have the same value.
    padding: The padding type to use, either 'SAME' or 'VALID'.
    activation_fn: Activation function. The default value is a ReLU function.
      Explicitly set it to None to skip it and maintain a linear activation.
    normalizer_fn: Normalization function to use instead of `biases`. If
      `normalizer_fn` is provided then `biases_initializer` and
      `biases_regularizer` are ignored and `biases` are not created nor added.
      Defaults to None for no normalizer function.
    normalizer_params: Normalization function parameters.
    weights_initializer: An initializer for the weights.
    weights_regularizer: Optional regularizer for the weights.
    biases_initializer: An initializer for the biases. If None skip biases.
    biases_regularizer: Optional regularizer for the biases.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional list of collections for all the variables or
      a dictionary containing a different list of collection per variable.
    outputs_collections: Collection to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for `variable_scope`.

  Returns:
    A `Tensor` representing the output of the operation.
  """
  with variable_scope.variable_scope(
      scope, 'ConvInPlane', [inputs], reuse=reuse) as sc:
    # Accept anything convertible to a tensor, as the sibling layers in this
    # module do, before reading `dtype` and the static shape.
    inputs = ops.convert_to_tensor(inputs)
    dtype = inputs.dtype.base_dtype
    kernel_h, kernel_w = utils.two_element_tuple(kernel_size)
    stride_h, stride_w = utils.two_element_tuple(stride)
    num_filters_in = utils.last_dimension(inputs.get_shape(), min_rank=4)
    # One shared single-channel kernel; it is replicated per channel below.
    weights_shape = [kernel_h, kernel_w, 1, 1]
    weights_collections = utils.get_variable_collections(
        variables_collections, 'weights')
    weights = variables.model_variable(
        'weights',
        shape=weights_shape,
        dtype=dtype,
        initializer=weights_initializer,
        regularizer=weights_regularizer,
        collections=weights_collections,
        trainable=trainable)
    # Tile along the input-channel axis so each channel sees the same filter.
    depthwise_weights = array_ops.tile(weights, [1, 1, num_filters_in, 1])
    outputs = nn.depthwise_conv2d(inputs, depthwise_weights,
                                  [1, stride_h, stride_w, 1], padding)
    if normalizer_fn is not None:
      normalizer_params = normalizer_params or {}
      outputs = normalizer_fn(outputs, **normalizer_params)
    elif biases_initializer is not None:
      # Biases are only added when no normalizer takes their place.
      biases_collections = utils.get_variable_collections(
          variables_collections, 'biases')
      biases = variables.model_variable(
          'biases',
          shape=[
              num_filters_in,
          ],
          dtype=dtype,
          initializer=biases_initializer,
          regularizer=biases_regularizer,
          collections=biases_collections,
          trainable=trainable)
      outputs = nn.bias_add(outputs, biases)

    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
1306
1307
1308@add_arg_scope
1309def convolution2d_transpose(
1310    inputs,
1311    num_outputs,
1312    kernel_size,
1313    stride=1,
1314    padding='SAME',
1315    data_format=DATA_FORMAT_NHWC,
1316    activation_fn=nn.relu,
1317    normalizer_fn=None,
1318    normalizer_params=None,
1319    weights_initializer=initializers.xavier_initializer(),
1320    weights_regularizer=None,
1321    biases_initializer=init_ops.zeros_initializer(),
1322    biases_regularizer=None,
1323    reuse=None,
1324    variables_collections=None,
1325    outputs_collections=None,
1326    trainable=True,
1327    scope=None):
1328  """Adds a convolution2d_transpose with an optional batch normalization layer.
1329
1330  The function creates a variable called `weights`, representing the
1331  kernel, that is convolved with the input. If `normalizer_fn` is `None`, a
1332  second variable called 'biases' is added to the result of the operation.
1333
1334  Args:
1335    inputs: A 4-D `Tensor` of type `float` and shape
1336      `[batch, height, width, in_channels]` for `NHWC` data format or
1337      `[batch, in_channels, height, width]` for `NCHW` data format.
1338    num_outputs: Integer, the number of output filters.
1339    kernel_size: A list of length 2 holding the [kernel_height, kernel_width] of
1340      of the filters. Can be an int if both values are the same.
1341    stride: A list of length 2: [stride_height, stride_width].
1342      Can be an int if both strides are the same.  Note that presently
1343      both strides must have the same value.
1344    padding: One of 'VALID' or 'SAME'.
1345    data_format: A string. `NHWC` (default) and `NCHW` are supported.
1346    activation_fn: Activation function. The default value is a ReLU function.
1347      Explicitly set it to None to skip it and maintain a linear activation.
1348    normalizer_fn: Normalization function to use instead of `biases`. If
1349      `normalizer_fn` is provided then `biases_initializer` and
1350      `biases_regularizer` are ignored and `biases` are not created nor added.
1351      default set to None for no normalizer function
1352    normalizer_params: Normalization function parameters.
1353    weights_initializer: An initializer for the weights.
1354    weights_regularizer: Optional regularizer for the weights.
1355    biases_initializer: An initializer for the biases. If None skip biases.
1356    biases_regularizer: Optional regularizer for the biases.
1357    reuse: Whether or not the layer and its variables should be reused. To be
1358      able to reuse the layer scope must be given.
1359    variables_collections: Optional list of collections for all the variables or
1360      a dictionary containing a different list of collection per variable.
1361    outputs_collections: Collection to add the outputs.
1362    trainable: Whether or not the variables should be trainable or not.
1363    scope: Optional scope for variable_scope.
1364
1365  Returns:
1366    A tensor representing the output of the operation.
1367
1368  Raises:
1369    ValueError: If 'kernel_size' is not a list of length 2.
1370    ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
1371    ValueError: If `C` dimension of `inputs` is None.
1372  """
1373  layer_variable_getter = _build_variable_getter({
1374      'bias': 'biases',
1375      'kernel': 'weights'
1376  })
1377
1378  with variable_scope.variable_scope(
1379      scope,
1380      'Conv2d_transpose', [inputs],
1381      reuse=reuse,
1382      custom_getter=layer_variable_getter) as sc:
1383    if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
1384      raise ValueError('data_format has to be either NCHW or NHWC.')
1385
1386    inputs = ops.convert_to_tensor(inputs)
1387
1388    df = ('channels_first'
1389          if data_format and data_format.startswith('NC') else 'channels_last')
1390    layer = convolutional_layers.Convolution2DTranspose(
1391        filters=num_outputs,
1392        kernel_size=kernel_size,
1393        strides=stride,
1394        padding=padding,
1395        data_format=df,
1396        activation=None,
1397        use_bias=not normalizer_fn and biases_initializer,
1398        kernel_initializer=weights_initializer,
1399        bias_initializer=biases_initializer,
1400        kernel_regularizer=weights_regularizer,
1401        bias_regularizer=biases_regularizer,
1402        activity_regularizer=None,
1403        trainable=trainable,
1404        name=sc.name,
1405        dtype=inputs.dtype.base_dtype,
1406        _scope=sc,
1407        _reuse=reuse)
1408    outputs = layer.apply(inputs)
1409
1410    # Add variables to collections.
1411    _add_variable_to_collections(layer.kernel, variables_collections, 'weights')
1412    if layer.bias is not None:
1413      _add_variable_to_collections(layer.bias, variables_collections, 'biases')
1414
1415    if normalizer_fn is not None:
1416      normalizer_params = normalizer_params or {}
1417      outputs = normalizer_fn(outputs, **normalizer_params)
1418
1419    if activation_fn is not None:
1420      outputs = activation_fn(outputs)
1421    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
1422
1423
@add_arg_scope
def convolution3d_transpose(
    inputs,
    num_outputs,
    kernel_size,
    stride=1,
    padding='SAME',
    data_format=DATA_FORMAT_NDHWC,
    activation_fn=nn.relu,
    normalizer_fn=None,
    normalizer_params=None,
    weights_initializer=initializers.xavier_initializer(),
    weights_regularizer=None,
    biases_initializer=init_ops.zeros_initializer(),
    biases_regularizer=None,
    reuse=None,
    variables_collections=None,
    outputs_collections=None,
    trainable=True,
    scope=None):
  """Adds a 3-D transposed convolution with an optional normalization layer.

  A `Convolution3DTranspose` layer is created inside the variable scope. Its
  `kernel` variable is exposed under the name `weights` and, when the layer has
  one, its `bias` variable under the name `biases`. A bias is only requested
  when `normalizer_fn` is not given and `biases_initializer` is not `None`.
  The normalizer (if any) and then the activation (if any) are applied to the
  layer output, in that order.

  Args:
    inputs: A 5-D `Tensor` of type `float` and shape
      `[batch, depth, height, width, in_channels]` for `NDHWC` data format or
      `[batch, in_channels, depth, height, width]` for `NCDHW` data format.
    num_outputs: Integer, the number of output filters.
    kernel_size: A list of length 3 holding the [kernel_depth, kernel_height,
      kernel_width] of the filters. Can be an int if all values are the same.
    stride: A list of length 3: [stride_depth, stride_height, stride_width].
      Can be an int if all strides are the same.  Note that presently
      all strides must have the same value.
    padding: One of 'VALID' or 'SAME'.
    data_format: A string. `NDHWC` (default) and `NCDHW` are supported.
    activation_fn: Activation function. The default value is a ReLU function.
      Explicitly set it to None to skip it and maintain a linear activation.
    normalizer_fn: Normalization function to use instead of `biases`. If
      `normalizer_fn` is provided then no bias is requested from the layer.
      Defaults to None for no normalizer function.
    normalizer_params: Normalization function parameters, passed to
      `normalizer_fn` as keyword arguments.
    weights_initializer: An initializer for the weights.
    weights_regularizer: Optional regularizer for the weights.
    biases_initializer: An initializer for the biases. If None skip biases.
    biases_regularizer: Optional regularizer for the biases.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional list of collections for all the variables or
      a dictionary containing a different list of collection per variable.
    outputs_collections: Collection to add the outputs.
    trainable: Whether or not the variables should be trainable or not.
    scope: Optional scope for variable_scope.

  Returns:
    A tensor representing the output of the operation.

  Raises:
    ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
    ValueError: Per the original documentation, also if 'kernel_size' is not a
      list of length 3 or if the `C` dimension of `inputs` is None; those
      checks are not performed in this function itself.
  """
  renaming_getter = _build_variable_getter({
      'kernel': 'weights',
      'bias': 'biases'
  })

  with variable_scope.variable_scope(
      scope,
      'Conv3d_transpose', [inputs],
      reuse=reuse,
      custom_getter=renaming_getter) as sc:
    if data_format not in (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC):
      raise ValueError('data_format has to be either NCDHW or NDHWC.')

    inputs = ops.convert_to_tensor(inputs)

    # Translate the contrib-style format string into the layer's vocabulary.
    if data_format and data_format.startswith('NC'):
      layer_data_format = 'channels_first'
    else:
      layer_data_format = 'channels_last'

    # Truthy only when no normalizer is given and a bias initializer exists.
    wants_bias = not normalizer_fn and biases_initializer

    transpose_layer = convolutional_layers.Convolution3DTranspose(
        filters=num_outputs,
        kernel_size=kernel_size,
        strides=stride,
        padding=padding,
        data_format=layer_data_format,
        activation=None,
        use_bias=wants_bias,
        kernel_initializer=weights_initializer,
        bias_initializer=biases_initializer,
        kernel_regularizer=weights_regularizer,
        bias_regularizer=biases_regularizer,
        activity_regularizer=None,
        trainable=trainable,
        name=sc.name,
        dtype=inputs.dtype.base_dtype,
        _scope=sc,
        _reuse=reuse)
    outputs = transpose_layer.apply(inputs)

    # Register the layer's variables in the requested collections.
    _add_variable_to_collections(transpose_layer.kernel,
                                 variables_collections, 'weights')
    bias_variable = transpose_layer.bias
    if bias_variable is not None:
      _add_variable_to_collections(bias_variable, variables_collections,
                                   'biases')

    if normalizer_fn is not None:
      outputs = normalizer_fn(outputs, **(normalizer_params or {}))

    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
1535
1536
@add_arg_scope
def dense_to_sparse(tensor, eos_token=0, outputs_collections=None, scope=None):
  """Converts a dense tensor into a sparse tensor.

  Every element of `tensor` that differs from `eos_token` becomes an entry of
  the resulting `SparseTensor`; elements equal to `eos_token` are left out.

  An example use would be to convert dense labels to sparse ones
  so that they can be fed to the ctc_loss.

  Args:
     tensor: An `int` `Tensor` to be converted to a `Sparse`.
     eos_token: An integer.
       It is part of the target label that signifies the end of a sentence.
     outputs_collections: Collection to add the outputs.
     scope: Optional scope for variable_scope.

  Returns:
    A `SparseTensor` whose dense shape is the (int64) shape of `tensor`,
    passed through `utils.collect_named_outputs`.
  """
  with variable_scope.variable_scope(scope, 'dense_to_sparse', [tensor]) as sc:
    tensor = ops.convert_to_tensor(tensor)
    eos = constant_op.constant(eos_token, tensor.dtype)
    # Coordinates of every element that is not the end-of-sentence marker.
    kept_indices = array_ops.where(math_ops.not_equal(tensor, eos))
    kept_values = array_ops.gather_nd(tensor, kept_indices)
    dense_shape = array_ops.shape(tensor, out_type=dtypes.int64)
    sparse_outputs = sparse_tensor.SparseTensor(kept_indices, kept_values,
                                                dense_shape)
    return utils.collect_named_outputs(outputs_collections, sc.name,
                                       sparse_outputs)
1560
1561
@add_arg_scope
def dropout(inputs,
            keep_prob=0.5,
            noise_shape=None,
            is_training=True,
            outputs_collections=None,
            scope=None,
            seed=None):
  """Returns a dropout op applied to the input.

  With probability `keep_prob`, outputs the input element scaled up by
  `1 / keep_prob`, otherwise outputs `0`.  The scaling is so that the expected
  sum is unchanged. Internally a `core_layers.Dropout` layer is applied with
  `rate = 1 - keep_prob`.

  Args:
    inputs: The tensor to apply dropout to.
    keep_prob: A scalar `Tensor` with the same type as `inputs`. The
      probability that each element is kept.
    noise_shape: A 1-D `Tensor` of type `int32`, representing the
      shape for randomly generated keep/drop flags.
    is_training: A bool `Tensor` indicating whether or not the model
      is in training mode. If so, dropout is applied and values scaled.
      Otherwise, inputs is returned.
    outputs_collections: Collection to add the outputs.
    scope: Optional scope for variable_scope.
    seed: A Python integer. Used to create random seeds. See
      `tf.set_random_seed` for behavior.

  Returns:
    A tensor representing the output of the operation.
  """
  with variable_scope.variable_scope(
      scope, 'Dropout', [inputs], custom_getter=_model_variable_getter) as sc:
    inputs = ops.convert_to_tensor(inputs)
    # The layer is parameterized by the drop rate, not the keep probability.
    drop_rate = 1 - keep_prob
    dropout_layer = core_layers.Dropout(
        rate=drop_rate,
        noise_shape=noise_shape,
        seed=seed,
        name=sc.name,
        _scope=sc)
    dropped = dropout_layer.apply(inputs, training=is_training)
    return utils.collect_named_outputs(outputs_collections, sc.name, dropped)
1604
1605
@add_arg_scope
def flatten(inputs, outputs_collections=None, scope=None):
  """Flattens the input while maintaining the batch_size.

  The first dimension is assumed to be the batch dimension. The actual
  flattening is delegated to `core_layers.flatten`.

  Args:
    inputs: A tensor of size [batch_size, ...].
    outputs_collections: Collection to add the outputs.
    scope: Optional scope for name_scope.

  Returns:
    A flattened tensor with shape [batch_size, k].

  Raises:
    ValueError: Per the original documentation, if inputs rank is unknown or
      less than 2 (the check, if any, lives in `core_layers.flatten`).
  """
  with ops.name_scope(scope, 'Flatten', [inputs]) as sc:
    flattened = core_layers.flatten(ops.convert_to_tensor(inputs))
    return utils.collect_named_outputs(outputs_collections, sc, flattened)
1626
1627
def _sparse_inner_flatten(inputs, new_rank):
  """Helper function for `inner_flatten` handling `SparseTensor` inputs.

  Args:
    inputs: A `SparseTensor`.
    new_rank: The rank of the returned `SparseTensor`.

  Returns:
    `inputs` reshaped so that the first `new_rank - 1` dimensions are kept and
    all remaining dimensions are merged into a single one.

  Raises:
    ValueError: If the statically known rank of `inputs` is below `new_rank`.
  """
  dense_shape = inputs.dense_shape
  # The rank is the static length of the 1-D `dense_shape` tensor.
  inputs_rank = dense_shape.get_shape().as_list()[0]
  if inputs_rank < new_rank:
    raise ValueError(
        'Inputs has rank less than new_rank. {} must have rank at least'
        ' {}. Received rank {}, shape {}'.format(inputs, new_rank, inputs_rank,
                                                 inputs.get_shape()))

  split_at = new_rank - 1
  leading_dims = dense_shape[:split_at]
  trailing_dims = dense_shape[split_at:]
  # Collapse every trailing dimension into one by taking their product.
  merged_dim = math_ops.reduce_prod(trailing_dims)
  target_shape = array_ops.concat((leading_dims, [merged_dim]), 0)
  return sparse_ops.sparse_reshape(inputs, target_shape)
1643
1644
def _dense_inner_flatten(inputs, new_rank):
  """Helper function for `inner_flatten` handling dense `Tensor` inputs.

  Args:
    inputs: A `Tensor`.
    new_rank: The rank of the returned `Tensor`. When it is a Python integer
      the static shape of the result is filled in as far as it is known.

  Returns:
    `inputs` reshaped so that the first `new_rank - 1` dimensions are kept and
    all remaining dimensions are merged into a single one. The reshape depends
    on an assertion that `inputs` has rank at least `new_rank`.
  """
  rank_check = check_ops.assert_rank_at_least(
      inputs, new_rank, message='inputs has rank less than new_rank')
  with ops.control_dependencies([rank_check]):
    dynamic_shape = array_ops.shape(inputs)
    leading_dims = array_ops.strided_slice(dynamic_shape, [0], [new_rank - 1])
    # `-1` lets reshape infer the size of the merged trailing dimension.
    target_shape = array_ops.concat((leading_dims, [-1]), 0)
    flattened = array_ops.reshape(inputs, target_shape)

  # Static shape inference is only possible for a Python-integer `new_rank`.
  if not isinstance(new_rank, six.integer_types):
    return flattened

  known_shape = inputs.get_shape()
  if known_shape is None or known_shape.dims is None:
    return flattened

  dims = known_shape.as_list()
  split_at = new_rank - 1
  static_leading = dims[:split_at]
  static_trailing = dims[split_at:]
  if any(dim is None for dim in static_trailing):
    # One unknown trailing dimension makes the merged size unknown too.
    merged_dim = None
  else:
    merged_dim = 1
    for dim in static_trailing:
      merged_dim *= dim
  flattened.set_shape(static_leading + [merged_dim])
  return flattened
1670
1671
@add_arg_scope
def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
  """Flattens inner dimensions of `inputs`, returns a Tensor with `new_rank`.

  For example:
  ```
      x = tf.random_uniform(shape=[1, 2, 3, 4, 5, 6])
      y = _inner_flatten(x, 4)
      assert y.get_shape().as_list() == [1, 2, 3, (4 * 5 * 6)]
  ```
  This layer will fail at run time if `new_rank` is greater than the current
  rank of `inputs`.

  Args:
    inputs: A `Tensor` or `SparseTensor`. Anything that is not a
      `SparseTensor` is first passed through `ops.convert_to_tensor`.
    new_rank: The desired rank of the returned `Tensor` or `SparseTensor`.
    output_collections: Collection to which the outputs will be added.
    scope: Optional scope for `name_scope`.

  Returns:
    A `Tensor` or `SparseTensor` containing the same values as `inputs`, but
    with innermost dimensions flattened to obtain rank `new_rank`.

  Raises:
    TypeError: Per the original documentation, if `inputs` is not a `Tensor`
      or `SparseTensor` (presumably surfaced by the tensor conversion).
  """
  with ops.name_scope(scope, 'InnerFlatten', [inputs, new_rank]) as sc:
    if not isinstance(inputs, sparse_tensor.SparseTensor):
      dense_inputs = ops.convert_to_tensor(inputs)
      flattened = _dense_inner_flatten(dense_inputs, new_rank)
    else:
      flattened = _sparse_inner_flatten(inputs, new_rank)
  # The result is registered after the name scope has been exited.
  return utils.collect_named_outputs(output_collections, sc, flattened)
1704
1705
def _model_variable_getter(
    getter,
    name,
    shape=None,
    dtype=None,
    initializer=None,
    regularizer=None,
    trainable=True,
    collections=None,
    caching_device=None,
    partitioner=None,
    rename=None,
    use_resource=None,
    synchronization=tf_variables.VariableSynchronization.AUTO,
    aggregation=tf_variables.VariableAggregation.NONE,
    **_):
  """Getter that uses model_variable for compatibility with core layers.

  Args:
    getter: The wrapped getter; forwarded to `model_variable` as
      `custom_getter`.
    name: Full (possibly '/'-separated) variable name.
    shape: Forwarded to `model_variable`.
    dtype: Forwarded to `model_variable`.
    initializer: Forwarded to `model_variable`.
    regularizer: Forwarded to `model_variable`.
    trainable: Forwarded to `model_variable`.
    collections: Forwarded to `model_variable`.
    caching_device: Forwarded to `model_variable`.
    partitioner: Forwarded to `model_variable`.
    rename: Optional mapping from the last name component to its replacement.
      When the last component of `name` is a key, it is swapped for the mapped
      value before the variable is requested.
    use_resource: Forwarded to `model_variable`.
    synchronization: Forwarded to `model_variable`.
    aggregation: Forwarded to `model_variable`.
    **_: Any further keyword arguments are accepted and ignored.

  Returns:
    The result of `variables.model_variable` for the (possibly renamed) name.
  """
  # Only the component after the last '/' is subject to renaming.
  scope_prefix, separator, short_name = name.rpartition('/')
  if rename and short_name in rename:
    name = scope_prefix + separator + rename[short_name]
  return variables.model_variable(
      name,
      shape=shape,
      dtype=dtype,
      initializer=initializer,
      regularizer=regularizer,
      collections=collections,
      trainable=trainable,
      caching_device=caching_device,
      partitioner=partitioner,
      custom_getter=getter,
      use_resource=use_resource,
      synchronization=synchronization,
      aggregation=aggregation)
1742
1743
def _build_variable_getter(rename=None):
  """Build a model variable getter that respects scope getter and renames.

  Args:
    rename: Optional mapping handed to `_model_variable_getter` as its
      `rename` argument on every call, replacing any `rename` keyword the
      caller may have supplied.

  Returns:
    A function `(getter, *args, **kwargs)` that delegates to
    `_model_variable_getter`.
  """

  # VariableScope will nest the getters
  def layer_variable_getter(getter, *args, **kwargs):
    forwarded_kwargs = dict(kwargs, rename=rename)
    return _model_variable_getter(getter, *args, **forwarded_kwargs)

  return layer_variable_getter
1753
1754
def _add_variable_to_collections(variable, collections_set, collections_name):
  """Adds variable (or all its parts) to all collections with that name.

  Args:
    variable: A variable, or a `PartitionedVariable` whose individual parts
      are added instead of the container itself.
    collections_set: Passed to `utils.get_variable_collections` to look up the
      target collections.
    collections_name: The key (e.g. 'weights') used for that lookup.
  """
  target_collections = utils.get_variable_collections(
      collections_set, collections_name) or []
  if isinstance(variable, tf_variables.PartitionedVariable):
    # Plain iteration over the partitioned variable yields its parts.
    parts = [part for part in variable]
  else:
    parts = [variable]
  for collection_key in target_collections:
    for part in parts:
      # Skip parts that the collection already holds.
      if part in ops.get_collection(collection_key):
        continue
      ops.add_to_collection(collection_key, part)
1766
1767
@add_arg_scope
def fully_connected(inputs,
                    num_outputs,
                    activation_fn=nn.relu,
                    normalizer_fn=None,
                    normalizer_params=None,
                    weights_initializer=initializers.xavier_initializer(),
                    weights_regularizer=None,
                    biases_initializer=init_ops.zeros_initializer(),
                    biases_regularizer=None,
                    reuse=None,
                    variables_collections=None,
                    outputs_collections=None,
                    trainable=True,
                    scope=None):
  """Adds a fully connected layer.

  `fully_connected` builds a `core_layers.Dense` layer whose `kernel` variable
  is exposed as `weights` and applies it to `inputs` to produce a `Tensor` of
  hidden units. If a `normalizer_fn` is provided (such as `batch_norm`), it is
  then applied. Otherwise, if `normalizer_fn` is None and a
  `biases_initializer` is provided then a `biases` variable would be created
  and added to the hidden units. Finally, if `activation_fn` is not `None`, it
  is applied to the hidden units as well.

  Args:
    inputs: A tensor of at least rank 2 and static value for the last dimension;
      i.e. `[batch_size, depth]`, `[None, None, None, channels]`.
    num_outputs: Integer or long, the number of output units in the layer.
    activation_fn: Activation function. The default value is a ReLU function.
      Explicitly set it to None to skip it and maintain a linear activation.
    normalizer_fn: Normalization function to use instead of `biases`. If
      `normalizer_fn` is provided then no bias is requested from the layer.
      Defaults to None for no normalizer function.
    normalizer_params: Normalization function parameters, passed to
      `normalizer_fn` as keyword arguments.
    weights_initializer: An initializer for the weights.
    weights_regularizer: Optional regularizer for the weights.
    biases_initializer: An initializer for the biases. If None skip biases.
    biases_regularizer: Optional regularizer for the biases.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional list of collections for all the variables or
      a dictionary containing a different list of collections per variable.
    outputs_collections: Collection to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for variable_scope.

  Returns:
     The tensor variable representing the result of the series of operations.

  Raises:
    ValueError: If `num_outputs` is not an integer type.
    ValueError: Per the original documentation, also if `inputs` has rank less
      than 2 or if its last dimension is not set (not checked in this function
      itself).
  """
  if not isinstance(num_outputs, six.integer_types):
    raise ValueError('num_outputs type should be one of %s, got %s.' %
                     (list(six.integer_types), type(num_outputs)))

  renaming_getter = _build_variable_getter({
      'kernel': 'weights',
      'bias': 'biases'
  })

  with variable_scope.variable_scope(
      scope,
      'fully_connected', [inputs],
      reuse=reuse,
      custom_getter=renaming_getter) as sc:
    inputs = ops.convert_to_tensor(inputs)

    # Truthy only when no normalizer is given and a bias initializer exists.
    wants_bias = not normalizer_fn and biases_initializer

    dense_layer = core_layers.Dense(
        units=num_outputs,
        activation=None,
        use_bias=wants_bias,
        kernel_initializer=weights_initializer,
        bias_initializer=biases_initializer,
        kernel_regularizer=weights_regularizer,
        bias_regularizer=biases_regularizer,
        activity_regularizer=None,
        trainable=trainable,
        name=sc.name,
        dtype=inputs.dtype.base_dtype,
        _scope=sc,
        _reuse=reuse)
    outputs = dense_layer.apply(inputs)

    # Register the layer's variables in the requested collections.
    _add_variable_to_collections(dense_layer.kernel, variables_collections,
                                 'weights')
    bias_variable = dense_layer.bias
    if bias_variable is not None:
      _add_variable_to_collections(bias_variable, variables_collections,
                                   'biases')

    # Normalization, when requested, comes before the activation.
    if normalizer_fn is not None:
      outputs = normalizer_fn(outputs, **(normalizer_params or {}))

    if activation_fn is not None:
      outputs = activation_fn(outputs)

    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
1872
1873
class GDN(base.Layer):
  """Generalized divisive normalization layer.

  Based on the papers:

    "Density Modeling of Images using a Generalized Normalization
    Transformation"

    Johannes Ballé, Valero Laparra, Eero P. Simoncelli

    https://arxiv.org/abs/1511.06281

    "End-to-end Optimized Image Compression"

    Johannes Ballé, Valero Laparra, Eero P. Simoncelli

    https://arxiv.org/abs/1611.01704

  Implements an activation function that is essentially a multivariate
  generalization of a particular sigmoid-type function:

  ```
  y[i] = x[i] / sqrt(beta[i] + sum_j(gamma[j, i] * x[j]^2))
  ```

  where `i` and `j` run over channels. This implementation never sums across
  spatial dimensions. It is similar to local response normalization, but much
  more flexible, as `beta` and `gamma` are trainable parameters.

  Arguments:
    inverse: If `False` (default), compute GDN response. If `True`, compute IGDN
      response (one step of fixed point iteration to invert GDN; the division
      is replaced by multiplication).
    beta_min: Lower bound for beta, to prevent numerical error from causing
      square root of zero or negative values.
    gamma_init: The gamma matrix will be initialized as the identity matrix
      multiplied with this value. If set to zero, the layer is effectively
      initialized to the identity operation, since beta is initialized as one.
      A good default setting is somewhere between 0 and 0.5.
    reparam_offset: Offset added to the reparameterization of beta and gamma.
      The reparameterization of beta and gamma as their square roots lets the
      training slow down when their values are close to zero, which is desirable
      as small values in the denominator can lead to a situation where gradient
      noise on beta/gamma leads to extreme amounts of noise in the GDN
      activations. However, without the offset, we would get zero gradients if
      any elements of beta or gamma were exactly zero, and thus the training
      could get stuck. To prevent this, we add this small constant. The default
      value was empirically determined as a good starting point. Making it
      bigger potentially leads to more gradient noise on the activations, making
      it too small may lead to numerical precision issues.
    data_format: Format of input tensor. Currently supports `'channels_first'`
      and `'channels_last'`.
    activity_regularizer: Regularizer function for the output.
    trainable: Boolean, if `True`, also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    name: String, the name of the layer. Layers with the same name will
      share weights, but to avoid mistakes we require `reuse=True` in such
      cases.

  Properties:
    inverse: Boolean, whether IGDN is computed (`True`) or GDN (`False`).
    data_format: Format of input tensor. Currently supports `'channels_first'`
      and `'channels_last'`.
    beta: The beta parameter as defined above (1D `Tensor`).
    gamma: The gamma parameter as defined above (2D `Tensor`).
  """

  def __init__(self,
               inverse=False,
               beta_min=1e-6,
               gamma_init=.1,
               reparam_offset=2**-18,
               data_format='channels_last',
               activity_regularizer=None,
               trainable=True,
               name=None,
               **kwargs):
    super(GDN, self).__init__(
        trainable=trainable,
        name=name,
        activity_regularizer=activity_regularizer,
        **kwargs)
    self.inverse = inverse
    self._beta_min = beta_min
    self._gamma_init = gamma_init
    self._reparam_offset = reparam_offset
    self.data_format = data_format
    self._channel_axis()  # trigger ValueError early
    self.input_spec = input_spec.InputSpec(min_ndim=3, max_ndim=5)

  def _channel_axis(self):
    """Returns the channel axis index implied by `self.data_format`.

    Returns:
      `1` for `'channels_first'`, `-1` for `'channels_last'`.

    Raises:
      ValueError: If `self.data_format` is neither of the two.
    """
    try:
      return {'channels_first': 1, 'channels_last': -1}[self.data_format]
    except KeyError:
      raise ValueError('Unsupported `data_format` for GDN layer: {}.'.format(
          self.data_format))

  @staticmethod
  def _lower_bound(inputs, bound, name=None):
    """Same as tf.maximum, but with helpful gradient for inputs < bound.

    The gradient is overwritten so that it is passed through if the input is not
    hitting the bound. If it is, only gradients that push `inputs` higher than
    the bound are passed through. No gradients are passed through to the bound.

    Args:
      inputs: input tensor
      bound: lower bound for the input tensor
      name: name for this op

    Returns:
      tf.maximum(inputs, bound)
    """
    with ops.name_scope(name, 'GDNLowerBound', [inputs, bound]) as scope:
      inputs = ops.convert_to_tensor(inputs, name='inputs')
      bound = ops.convert_to_tensor(bound, name='bound')
      # Route the gradient of this particular `Maximum` op to the function
      # registered under the name 'GDNLowerBound'.
      with ops.get_default_graph().gradient_override_map({
          'Maximum': 'GDNLowerBound'
      }):
        return math_ops.maximum(inputs, bound, name=scope)

  @staticmethod
  def _lower_bound_grad(op, grad):
    """Gradient for `_lower_bound`.

    Args:
      op: the tensorflow op for which to calculate a gradient
      grad: gradient with respect to the output of the op

    Returns:
      gradients with respect to the inputs of the op
    """
    inputs = op.inputs[0]
    bound = op.inputs[1]
    # Keep the gradient where the input is at/above the bound, or where the
    # incoming gradient is negative; zero it elsewhere.
    pass_through_if = math_ops.logical_or(inputs >= bound, grad < 0)
    # The bound itself never receives a gradient.
    return [math_ops.cast(pass_through_if, grad.dtype) * grad, None]

  def build(self, input_shape):
    """Creates the reparameterized `beta` and `gamma` for `input_shape`.

    Args:
      input_shape: Shape of the inputs; its channel dimension must be defined.

    Raises:
      ValueError: If the channel dimension of `input_shape` is undefined.
    """
    channel_axis = self._channel_axis()
    input_shape = tensor_shape.TensorShape(input_shape)
    num_channels = input_shape.dims[channel_axis].value
    if num_channels is None:
      raise ValueError('The channel dimension of the inputs to `GDN` '
                       'must be defined.')
    self._input_rank = input_shape.ndims
    self.input_spec = input_spec.InputSpec(
        ndim=input_shape.ndims, axes={
            channel_axis: num_channels
        })

    # Stored variables hold sqrt(value + pedestal); the bounds below are the
    # matching square-root-domain limits.
    pedestal = array_ops.constant(self._reparam_offset**2, dtype=self.dtype)
    beta_bound = array_ops.constant(
        (self._beta_min + self._reparam_offset**2)**.5, dtype=self.dtype)
    gamma_bound = array_ops.constant(self._reparam_offset, dtype=self.dtype)

    def beta_initializer(shape, dtype=None, partition_info=None):
      """Initializes reparameterized beta so that beta itself starts at one."""
      del partition_info  # unused
      pedestal = array_ops.constant(self._reparam_offset**2, dtype=self.dtype)
      return math_ops.sqrt(array_ops.ones(shape, dtype=dtype) + pedestal)

    def gamma_initializer(shape, dtype=None, partition_info=None):
      """Initializes reparameterized gamma as a scaled identity matrix."""
      del partition_info  # unused
      assert len(shape) == 2
      assert shape[0] == shape[1]
      eye = linalg_ops.eye(shape[0], dtype=dtype)
      pedestal = array_ops.constant(self._reparam_offset**2, dtype=self.dtype)
      return math_ops.sqrt(self._gamma_init * eye + pedestal)

    beta = self.add_variable(
        'reparam_beta',
        shape=[num_channels],
        initializer=beta_initializer,
        dtype=self.dtype,
        trainable=True)
    beta = self._lower_bound(beta, beta_bound)
    # Undo the reparameterization: square and remove the pedestal.
    self.beta = math_ops.square(beta) - pedestal

    gamma = self.add_variable(
        'reparam_gamma',
        shape=[num_channels, num_channels],
        initializer=gamma_initializer,
        dtype=self.dtype,
        trainable=True)
    gamma = self._lower_bound(gamma, gamma_bound)
    self.gamma = math_ops.square(gamma) - pedestal

    self.built = True

  def call(self, inputs):
    """Applies GDN (or IGDN when `self.inverse` is set) to `inputs`.

    Args:
      inputs: Input tensor of the rank recorded in `build`.

    Returns:
      A tensor with the same static shape as `inputs`.
    """
    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
    ndim = self._input_rank

    # Turn the [C, C] gamma matrix into a convolution kernel whose spatial
    # extent is 1 in every spatial dimension.
    gamma_shape = self.gamma.get_shape().as_list()
    gamma = array_ops.reshape(self.gamma, (ndim - 2) * [1] + gamma_shape)

    # Compute normalization pool.
    if self.data_format == 'channels_first':
      norm_pool = nn.convolution(
          math_ops.square(inputs),
          gamma,
          'VALID',
          data_format='NC' + 'DHW' [-(ndim - 2):])
      # `nn.bias_add` is called with 'NCHW' below, so other ranks are
      # temporarily brought to 4-D around the bias addition.
      if ndim == 3:
        norm_pool = array_ops.expand_dims(norm_pool, 2)
        norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
        norm_pool = array_ops.squeeze(norm_pool, [2])
      elif ndim == 5:
        pool_shape = array_ops.shape(norm_pool)
        # Keep the first three dimensions and merge the remaining two. The
        # pieces must be concatenated: `pool_shape[:3] + [-1]` would be an
        # elementwise addition on a tensor, not a list append.
        merged_shape = array_ops.concat((pool_shape[:3], [-1]), 0)
        norm_pool = array_ops.reshape(norm_pool, merged_shape)
        norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
        norm_pool = array_ops.reshape(norm_pool, pool_shape)
      else:  # ndim == 4
        norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NCHW')
    else:  # channels_last
      norm_pool = nn.convolution(math_ops.square(inputs), gamma, 'VALID')
      norm_pool = nn.bias_add(norm_pool, self.beta, data_format='NHWC')
    norm_pool = math_ops.sqrt(norm_pool)

    if self.inverse:
      outputs = inputs * norm_pool
    else:
      outputs = inputs / norm_pool
    outputs.set_shape(inputs.get_shape())
    return outputs

  def compute_output_shape(self, input_shape):
    """Validates `input_shape` and returns it unchanged.

    Args:
      input_shape: Shape of the inputs (anything `TensorShape` accepts).

    Returns:
      `input_shape` as a `TensorShape`.

    Raises:
      ValueError: If the rank is unknown or outside 3..5, or if the channel
        dimension is undefined.
    """
    channel_axis = self._channel_axis()
    input_shape = tensor_shape.TensorShape(input_shape)
    # Use `ndims`, as `build` does; it is `None` for an unknown rank, which
    # cannot satisfy the rank constraint either.
    rank = input_shape.ndims
    if rank is None or not 3 <= rank <= 5:
      raise ValueError('`input_shape` must be of rank 3 to 5, inclusive.')
    if input_shape.dims[channel_axis].value is None:
      raise ValueError(
          'The channel dimension of `input_shape` must be defined.')
    return input_shape
2108
2109
# Register `GDN._lower_bound_grad` as the gradient function for ops of type
# 'GDNLowerBound'.
ops.RegisterGradient('GDNLowerBound')(GDN._lower_bound_grad)  # pylint:disable=protected-access
2111
2112
def gdn(inputs,
        inverse=False,
        beta_min=1e-6,
        gamma_init=.1,
        reparam_offset=2**-18,
        data_format='channels_last',
        activity_regularizer=None,
        trainable=True,
        name=None,
        reuse=None):
  """Functional interface for the GDN layer.

  Based on the papers:

    "Density Modeling of Images using a Generalized Normalization
    Transformation"
    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
    https://arxiv.org/abs/1511.06281

    "End-to-end Optimized Image Compression"
    Johannes Ballé, Valero Laparra, Eero P. Simoncelli
    https://arxiv.org/abs/1611.01704

  Implements an activation function that is essentially a multivariate
  generalization of a particular sigmoid-type function:

  ```
  y[i] = x[i] / sqrt(beta[i] + sum_j(gamma[j, i] * x[j]^2))
  ```

  where `i` and `j` run over channels. This implementation never sums across
  spatial dimensions. It is similar to local response normalization, but much
  more flexible, as `beta` and `gamma` are trainable parameters.

  Args:
    inputs: Tensor input. Its `dtype.base_dtype` is used as the layer dtype.
    inverse: If `False` (default), compute the GDN response. If `True`, compute
      the IGDN response (one step of fixed point iteration to invert GDN; the
      division is replaced by multiplication).
    beta_min: Lower bound for beta, to prevent numerical error from causing
      square root of zero or negative values.
    gamma_init: The gamma matrix will be initialized as the identity matrix
      multiplied with this value. If set to zero, the layer is effectively
      initialized to the identity operation, since beta is initialized as one.
      A good default setting is somewhere between 0 and 0.5.
    reparam_offset: Offset added to the reparameterization of beta and gamma.
      The reparameterization of beta and gamma as their square roots lets the
      training slow down when their values are close to zero, which is desirable
      as small values in the denominator can lead to a situation where gradient
      noise on beta/gamma leads to extreme amounts of noise in the GDN
      activations. However, without the offset, we would get zero gradients if
      any elements of beta or gamma were exactly zero, and thus the training
      could get stuck. To prevent this, we add this small constant. The default
      value was empirically determined as a good starting point. Making it
      bigger potentially leads to more gradient noise on the activations, making
      it too small may lead to numerical precision issues.
    data_format: Format of input tensor. Currently supports `'channels_first'`
      and `'channels_last'`.
    activity_regularizer: Regularizer function for the output.
    trainable: Boolean, if `True`, also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    name: String, the name of the layer. Layers with the same name will
      share weights, but to avoid mistakes we require `reuse=True` in such
      cases.
    reuse: Boolean, whether to reuse the weights of a previous layer by the same
      name.

  Returns:
    Output tensor.
  """
  # Collect the constructor arguments first, then build and apply the layer.
  layer_kwargs = dict(
      inverse=inverse,
      beta_min=beta_min,
      gamma_init=gamma_init,
      reparam_offset=reparam_offset,
      data_format=data_format,
      activity_regularizer=activity_regularizer,
      trainable=trainable,
      name=name,
      dtype=inputs.dtype.base_dtype,
      _scope=name,
      _reuse=reuse)
  return GDN(**layer_kwargs).apply(inputs)
2196
2197
@add_arg_scope
def layer_norm(inputs,
               center=True,
               scale=True,
               activation_fn=None,
               reuse=None,
               variables_collections=None,
               outputs_collections=None,
               trainable=True,
               begin_norm_axis=1,
               begin_params_axis=-1,
               scope=None):
  """Adds a Layer Normalization layer.

  Based on the paper:

    "Layer Normalization"

    Jimmy Lei Ba, Jamie Ryan Kiros, Geoffrey E. Hinton

    https://arxiv.org/abs/1607.06450.

  Can be used as a normalizer function for conv2d and fully_connected.

  For an input of rank `R`, the moments are computed and the normalization is
  applied over axes `begin_norm_axis ... R - 1`. The optional `beta` (offset)
  and `gamma` (scale) parameters have shape `inputs.shape[begin_params_axis:]`
  and are broadcast against the normalized tensor; that trailing part of the
  input shape must therefore be fully defined.

  With the defaults (`begin_norm_axis = 1`, `begin_params_axis = -1`) an `NHWC`
  input is normalized over `HWC` while `beta` and `gamma` are per-channel
  (`C`) parameters.

  Args:
    inputs: A tensor having rank `R`. The normalization is performed over
      axes `begin_norm_axis ... R - 1` and centering and scaling parameters
      are calculated over `begin_params_axis ... R - 1`.
    center: If True, add offset of `beta` to normalized tensor. If False, `beta`
      is ignored.
    scale: If True, multiply by `gamma`. If False, `gamma` is
      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
      disabled since the scaling can be done by the next layer.
    activation_fn: Activation function, default set to None to skip it and
      maintain a linear activation.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional collections for the variables.
    outputs_collections: Collections to add the outputs.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    begin_norm_axis: The first normalization dimension: normalization will be
      performed along dimensions `begin_norm_axis : rank(inputs)`
    begin_params_axis: The first parameter (beta, gamma) dimension: scale
      and centering parameters will have dimensions
      `begin_params_axis : rank(inputs)` and will be broadcast with the
      normalized inputs accordingly.
    scope: Optional scope for `variable_scope`.

  Returns:
    A `Tensor` representing the output of the operation, having the same
    shape and dtype as `inputs`.

  Raises:
    ValueError: If the rank of `inputs` is not known at graph build time,
      if either axis argument is not smaller than the rank, or if
      `inputs.shape[begin_params_axis:]` is not fully defined at graph build
      time.
  """
  with variable_scope.variable_scope(
      scope, 'LayerNorm', [inputs], reuse=reuse) as layer_scope:
    inputs = ops.convert_to_tensor(inputs)
    static_shape = inputs.shape
    rank = static_shape.ndims
    if rank is None:
      raise ValueError('Inputs %s has undefined rank.' % inputs.name)
    param_dtype = inputs.dtype.base_dtype

    # A negative normalization axis counts from the end of the shape.
    if begin_norm_axis < 0:
      begin_norm_axis += rank
    if not (begin_params_axis < rank and begin_norm_axis < rank):
      raise ValueError('begin_params_axis (%d) and begin_norm_axis (%d) '
                       'must be < rank(inputs) (%d)' %
                       (begin_params_axis, begin_norm_axis, rank))

    params_shape = static_shape[begin_params_axis:]
    if not params_shape.is_fully_defined():
      raise ValueError(
          'Inputs %s: shape(inputs)[%s:] is not fully defined: %s' %
          (inputs.name, begin_params_axis, static_shape))

    def _make_param(param_name, initializer):
      """Creates one normalization parameter as a model variable."""
      return variables.model_variable(
          param_name,
          shape=params_shape,
          dtype=param_dtype,
          initializer=initializer,
          collections=utils.get_variable_collections(variables_collections,
                                                     param_name),
          trainable=trainable)

    # `beta` is created before `gamma`; either may be left out entirely.
    beta = _make_param('beta', init_ops.zeros_initializer()) if center else None
    gamma = _make_param('gamma', init_ops.ones_initializer()) if scale else None

    # Moments are taken over every axis from `begin_norm_axis` onwards.
    reduction_axes = list(range(begin_norm_axis, rank))
    mean, variance = nn.moments(inputs, reduction_axes, keep_dims=True)

    # float16 needs a larger epsilon because of its limited representable
    # range.
    epsilon = 1e-3 if param_dtype == dtypes.float16 else 1e-12
    normalized = nn.batch_normalization(
        inputs,
        mean,
        variance,
        offset=beta,
        scale=gamma,
        variance_epsilon=epsilon)
    normalized.set_shape(static_shape)
    if activation_fn is not None:
      normalized = activation_fn(normalized)
    return utils.collect_named_outputs(outputs_collections, layer_scope.name,
                                       normalized)
2329
2330
@add_arg_scope
def images_to_sequence(inputs,
                       data_format=DATA_FORMAT_NHWC,
                       outputs_collections=None,
                       scope=None):
  """Convert a batch of images into a batch of sequences.

  The image width becomes the leading (sequence) axis, while the batch and
  height axes are merged into a single axis.

  Args:
    inputs: a (num_images, height, width, depth) tensor
    data_format: A string. `NHWC` (default) and `NCHW` are supported.
    outputs_collections: The collections to which the outputs are added.
    scope: Optional scope for name_scope.

  Raises:
     ValueError: If `data_format` is not either NCHW or NHWC.

  Returns:
    (width, num_images*height, depth) sequence tensor
  """
  if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
    raise ValueError('data_format has to be either NCHW or NHWC.')
  with ops.name_scope(scope, 'ImagesToSequence', [inputs]) as sc:
    images = ops.convert_to_tensor(inputs)
    # Bring channels-first input into NHWC layout before reshuffling axes.
    if data_format and data_format.startswith('NC'):
      images = array_ops.transpose(images, [0, 2, 3, 1])
    # Width and depth come from the static shape; batch size and height are
    # read from the dynamic shape.
    _, _, width, depth = images.get_shape().as_list()
    dynamic_shape = array_ops.shape(images)
    batch_size = dynamic_shape[0]
    height = dynamic_shape[1]
    width_major = array_ops.transpose(images, [2, 0, 1, 3])
    sequence_shape = [width, batch_size * height, depth]
    outputs = array_ops.reshape(width_major, sequence_shape)
    return utils.collect_named_outputs(outputs_collections, sc, outputs)
2364
2365
@add_arg_scope
def max_pool2d(inputs,
               kernel_size,
               stride=2,
               padding='VALID',
               data_format=DATA_FORMAT_NHWC,
               outputs_collections=None,
               scope=None):
  """Adds a 2D Max Pooling op.

  Pooling is applied per image over the spatial dimensions only, never across
  the batch or channel dimensions.

  Args:
    inputs: A 4-D tensor of shape `[batch_size, height, width, channels]` if
      `data_format` is `NHWC`, and `[batch_size, channels, height, width]` if
      `data_format` is `NCHW`.
    kernel_size: A list of length 2: [kernel_height, kernel_width] of the
      pooling kernel over which the op is computed. Can be an int if both
      values are the same.
    stride: A list of length 2: [stride_height, stride_width].
      Can be an int if both strides are the same. Note that presently
      both strides must have the same value.
    padding: The padding method, either 'VALID' or 'SAME'.
    data_format: A string. `NHWC` (default) and `NCHW` are supported.
    outputs_collections: The collections to which the outputs are added.
    scope: Optional scope for name_scope.

  Returns:
    A `Tensor` representing the results of the pooling operation.

  Raises:
    ValueError: If `data_format` is neither `NHWC` nor `NCHW`.
    ValueError: If 'kernel_size' is not a 2-D list
  """
  if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
    raise ValueError('data_format has to be either NCHW or NHWC.')
  with ops.name_scope(scope, 'MaxPool2D', [inputs]) as sc:
    inputs = ops.convert_to_tensor(inputs)
    # Translate the "N..." format string into the layer's channel convention.
    if data_format and data_format.startswith('NC'):
      channel_layout = 'channels_first'
    else:
      channel_layout = 'channels_last'
    pooled = pooling_layers.MaxPooling2D(
        pool_size=kernel_size,
        strides=stride,
        padding=padding,
        data_format=channel_layout,
        _scope=sc).apply(inputs)
    return utils.collect_named_outputs(outputs_collections, sc, pooled)
2414
2415
@add_arg_scope
def max_pool3d(inputs,
               kernel_size,
               stride=2,
               padding='VALID',
               data_format=DATA_FORMAT_NDHWC,
               outputs_collections=None,
               scope=None):
  """Adds a 3D Max Pooling op.

  Pooling is applied per volume over the spatial dimensions only, never across
  the batch or channel dimensions.

  Args:
    inputs: A 5-D tensor of shape `[batch_size, depth, height, width, channels]`
      if `data_format` is `NDHWC`, and `[batch_size, channels, depth, height,
      width]` if `data_format` is `NCDHW`.
    kernel_size: A list of length 3: [kernel_depth, kernel_height, kernel_width]
      of the pooling kernel over which the op is computed. Can be an int if all
      three values are the same.
    stride: A list of length 3: [stride_depth, stride_height, stride_width].
      Can be an int if all three strides are the same. Note that presently
      all strides must have the same value.
    padding: The padding method, either 'VALID' or 'SAME'.
    data_format: A string. `NDHWC` (default) and `NCDHW` are supported.
    outputs_collections: The collections to which the outputs are added.
    scope: Optional scope for name_scope.

  Returns:
    A `Tensor` representing the results of the pooling operation.

  Raises:
    ValueError: If `data_format` is neither `NDHWC` nor `NCDHW`.
    ValueError: If 'kernel_size' is not a 3-D list
  """
  if data_format not in (DATA_FORMAT_NCDHW, DATA_FORMAT_NDHWC):
    raise ValueError('data_format has to be either NCDHW or NDHWC.')
  with ops.name_scope(scope, 'MaxPool3D', [inputs]) as sc:
    inputs = ops.convert_to_tensor(inputs)
    # Translate the "N..." format string into the layer's channel convention.
    if data_format and data_format.startswith('NC'):
      channel_layout = 'channels_first'
    else:
      channel_layout = 'channels_last'
    pooled = pooling_layers.MaxPooling3D(
        pool_size=kernel_size,
        strides=stride,
        padding=padding,
        data_format=channel_layout,
        _scope=sc).apply(inputs)
    return utils.collect_named_outputs(outputs_collections, sc, pooled)
2464
2465
@add_arg_scope
def pool(inputs,
         kernel_size,
         pooling_type,
         padding='VALID',
         data_format=None,
         dilation_rate=1,
         stride=1,
         outputs_collections=None,
         scope=None):
  # pylint: disable=line-too-long
  """Adds a pooling op.

  The number of spatial dimensions N is derived from the input rank
  (`rank - 2`); scalar `kernel_size`, `dilation_rate` and `stride` values are
  expanded to N entries.

  Args:
    inputs: Tensor of rank N+2, of shape
      `[batch_size] + input_spatial_shape + [num_channels]` if data_format does
      not start with "NC" (default), or
      `[batch_size, num_channels] + input_spatial_shape` if data_format starts
      with "NC".  Pooling happens over the spatial dimensions only.
    kernel_size: Sequence of N ints >= 1.  Can also be a single integer to
      specify the same value for all spatial dimensions.
    pooling_type: Specifies pooling operation, must be "AVG" or "MAX".
    padding: The padding algorithm, must be "SAME" or "VALID".
    data_format: A string or None.  Specifies whether the channel dimension of
      the `input` and output is the last dimension (default, or if `data_format`
      does not start with "NC"), or the second dimension (if `data_format`
      starts with "NC").  For N=1, the valid values are "NWC" (default) and
      "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW".
      For N=3, the valid values are "NDHWC" (default) and "NCDHW".
    dilation_rate: Optional.  Dilation rate.  Sequence of N ints >= 1.  Defaults
      to [1]*N.  Can also be a single integer to specify the same value for all
      spatial dimensions.  If any value of dilation_rate is > 1, then all values
      of stride must be 1.
    stride: Optional.  Sequence of N ints >= 1.  Defaults to [1]*N.  Can also be
      a single integer to specify the same value for all spatial dimensions.  If
      any value of stride is > 1, then all values of dilation_rate must be 1.
    outputs_collections: The collections to which the outputs are added.
    scope: Optional scope for name_scope.

  Returns:
    A `Tensor` representing the results of the pooling operation.

  Raises:
    ValueError: If arguments are invalid.

  """
  # pylint: enable=line-too-long
  default_scope_name = '%s_pool' % (pooling_type.lower())
  with ops.name_scope(scope, default_scope_name, [inputs]) as sc:
    inputs = ops.convert_to_tensor(inputs)
    rank = inputs.get_shape().ndims
    if rank is None:
      raise ValueError('Rank of inputs must be known')
    if rank < 3:
      raise ValueError('Rank of inputs must be >= 3')
    # Everything except the batch and channel axes is spatial.
    spatial_rank = rank - 2
    window = utils.n_positive_integers(spatial_rank, kernel_size)
    dilations = utils.n_positive_integers(spatial_rank, dilation_rate)
    strides = utils.n_positive_integers(spatial_rank, stride)
    pooled = nn.pool(
        input=inputs,
        window_shape=window,
        pooling_type=pooling_type,
        padding=padding,
        data_format=data_format,
        dilation_rate=dilations,
        strides=strides,
        name=sc)
    return utils.collect_named_outputs(outputs_collections, sc, pooled)
2534
2535
@add_arg_scope
def one_hot_encoding(labels,
                     num_classes,
                     on_value=1.0,
                     off_value=0.0,
                     outputs_collections=None,
                     scope=None):
  """Transform numeric labels into onehot_labels using `tf.one_hot`.

  Labels of dtype int32 are cast to int64 before being encoded.

  Args:
    labels: [batch_size] target labels.
    num_classes: Total number of classes.
    on_value: A scalar defining the on-value.
    off_value: A scalar defining the off-value.
    outputs_collections: Collection to add the outputs.
    scope: Optional scope for name_scope.

  Returns:
    One-hot encoding of the labels.
  """
  with ops.name_scope(scope, 'OneHotEncoding', [labels, num_classes]) as sc:
    label_ids = ops.convert_to_tensor(labels)
    is_int32 = label_ids.dtype == dtypes.int32
    if is_int32:
      label_ids = standard_ops.to_int64(label_ids)
    encoded = standard_ops.one_hot(
        label_ids, num_classes, on_value=on_value, off_value=off_value)
    return utils.collect_named_outputs(outputs_collections, sc, encoded)
2563
2564
def _apply_activation(y, activation_fn, output_collections):
  """Applies an optional activation and records the result in collections.

  Args:
    y: Value to activate.
    activation_fn: Callable applied to `y`, or None to leave `y` unchanged.
    output_collections: Optional iterable of collection names; the result is
      added to each of them and to `ops.GraphKeys.ACTIVATIONS`.

  Returns:
    The (possibly activated) value.
  """
  activated = y if activation_fn is None else activation_fn(y)
  collection_names = list(output_collections or [])
  collection_names.append(ops.GraphKeys.ACTIVATIONS)
  ops.add_to_collections(collection_names, activated)
  return activated
2571
2572
def repeat(inputs, repetitions, layer, *args, **kwargs):
  """Applies the same layer with the same arguments repeatedly.

  ```python
    y = repeat(x, 3, conv2d, 64, [3, 3], scope='conv1')
    # It is equivalent to:

    x = conv2d(x, 64, [3, 3], scope='conv1/conv1_1')
    x = conv2d(x, 64, [3, 3], scope='conv1/conv1_2')
    y = conv2d(x, 64, [3, 3], scope='conv1/conv1_3')
  ```

  If the `scope` argument is not given in `kwargs`, the per-repetition scopes
  are derived from `layer.__name__`, or `layer.func.__name__` (for
  `functools.partial` objects). If neither `__name__` nor `func.__name__` is
  available, the base name `'repeat'` is used.

  Args:
    inputs: A `Tensor` suitable for layer.
    repetitions: Int, number of repetitions.
    layer: A layer with arguments `(inputs, *args, **kwargs)`
    *args: Extra args for the layer.
    **kwargs: Extra kwargs for the layer. A `scope` entry, if present, is
      consumed here and replaced by a numbered scope for every repetition.

  Returns:
    A tensor result of applying the layer, repetitions times.
  Raises:
    ValueError: If the op is unknown or wrong.
  """
  scope = kwargs.pop('scope', None)
  with variable_scope.variable_scope(scope, 'Repeat', [inputs]):
    outputs = ops.convert_to_tensor(inputs)
    if scope is None:
      # Fall back to a name derived from the layer callable itself.
      if hasattr(layer, '__name__'):
        scope = layer.__name__
      elif hasattr(layer, 'func') and hasattr(layer.func, '__name__'):
        scope = layer.func.__name__  # In case layer is a functools.partial.
      else:
        scope = 'repeat'
    # Repetitions are numbered starting from 1.
    for index in range(1, repetitions + 1):
      kwargs['scope'] = scope + '_' + str(index)
      outputs = layer(outputs, *args, **kwargs)
    return outputs
2617
2618
def _scale_gradient_shape(op):
  """Shape function for `scale_gradient`: output shape equals input 0's shape.

  Args:
    op: The op whose first input determines the output shape.

  Returns:
    A single-element list holding the shape of `op.inputs[0]`.
  """
  forwarded_input = op.inputs[0]
  return [forwarded_input.shape]
2622
2623
def _scale_gradient_grad(op, grad):
  """Python gradient function for `scale_gradient`.

  Args:
    op: The op being differentiated; its second input is the multiplier.
    grad: Gradient with respect to the op's output.

  Returns:
    A two-element list: `grad` scaled by the multiplier for the first input,
    and None for the multiplier input.
  """
  multiplier = op.inputs[1]
  scaled_grad = grad * multiplier
  return [scaled_grad, None]
2627
2628
@function.Defun(
    python_grad_func=_scale_gradient_grad, shape_func=_scale_gradient_shape)
def scale_gradient(inputs, gradient_multiplier):
  """Identity operation, but with the gradient multiplied by a tensor.

  In the forward direction `inputs` is returned as is. When gradients are
  computed, the gradient with respect to `inputs` is the gradient with respect
  to the output multiplied by `gradient_multiplier`. A multiplier of 1
  therefore yields the true gradient, and any other value a scaled one.

  This is handy for adjusting the relative learning rate of different
  parameter tensors under gradient descent: the rescaling can be placed at
  arbitrary locations within a graph, which is often more convenient than
  rescaling the final computed gradients.

  Args:
    inputs: Tensor to be output.
    gradient_multiplier: Tensor by which to multiply the gradient with respect
      to `output` to compute the gradient with respect to `inputs`.  Its shape
      must be broadcastable to the shape of `inputs`.

  Returns:
    output Tensor, equal to `inputs`.
  """
  # The multiplier is not needed in the forward pass; the gradient function
  # registered through the decorator reads it from the op's inputs.
  del gradient_multiplier
  output = inputs
  return output
2659
2660
@add_arg_scope
def separable_convolution2d(
    inputs,
    num_outputs,
    kernel_size,
    depth_multiplier=1,
    stride=1,
    padding='SAME',
    data_format=DATA_FORMAT_NHWC,
    rate=1,
    activation_fn=nn.relu,
    normalizer_fn=None,
    normalizer_params=None,
    weights_initializer=initializers.xavier_initializer(),
    pointwise_initializer=None,
    weights_regularizer=None,
    biases_initializer=init_ops.zeros_initializer(),
    biases_regularizer=None,
    reuse=None,
    variables_collections=None,
    outputs_collections=None,
    trainable=True,
    scope=None):
  """Adds a depth-separable 2D convolution with optional batch_norm layer.

  This op first performs a depthwise convolution that acts separately on
  channels, creating a variable called `depthwise_weights`. If `num_outputs`
  is not None, it adds a pointwise convolution that mixes channels, creating a
  variable called `pointwise_weights`. Then, if `normalizer_fn` is None,
  it adds bias to the result, creating a variable called 'biases', otherwise,
  the `normalizer_fn` is applied. It finally applies an activation function
  to produce the end result.

  Args:
    inputs: A tensor of size [batch_size, height, width, channels].
    num_outputs: The number of pointwise convolution output filters. If is
      None, then we skip the pointwise convolution stage.
    kernel_size: A list of length 2: [kernel_height, kernel_width] of
      the filters. Can be an int if both values are the same.
    depth_multiplier: The number of depthwise convolution output channels for
      each input channel. The total number of depthwise convolution output
      channels will be equal to `num_filters_in * depth_multiplier`.
    stride: A list of length 2: [stride_height, stride_width], specifying the
      depthwise convolution stride. Can be an int if both strides are the same.
    padding: One of 'VALID' or 'SAME'.
    data_format: A string. `NHWC` (default) and `NCHW` are supported.
    rate: A list of length 2: [rate_height, rate_width], specifying the dilation
      rates for atrous convolution. Can be an int if both rates are the same.
      If any value is larger than one, then both stride values need to be one.
    activation_fn: Activation function. The default value is a ReLU function.
      Explicitly set it to None to skip it and maintain a linear activation.
    normalizer_fn: Normalization function to use instead of `biases`. If
      `normalizer_fn` is provided then `biases_initializer` and
      `biases_regularizer` are ignored and `biases` are not created nor added.
      default set to None for no normalizer function
    normalizer_params: Normalization function parameters.
    weights_initializer: An initializer for the depthwise weights.
    pointwise_initializer: An initializer for the pointwise weights.
      default set to None, means use weights_initializer.
    weights_regularizer: Optional regularizer for the weights.
    biases_initializer: An initializer for the biases. If None skip biases.
    biases_regularizer: Optional regularizer for the biases.
    reuse: Whether or not the layer and its variables should be reused. To be
      able to reuse the layer scope must be given.
    variables_collections: Optional list of collections for all the variables or
      a dictionary containing a different list of collection per variable.
    outputs_collections: Collection to add the outputs.
    trainable: Whether or not the variables should be trainable or not.
    scope: Optional scope for variable_scope.

  Returns:
    A `Tensor` representing the output of the operation.
  Raises:
    ValueError: If `data_format` is invalid.
  """
  if data_format not in (DATA_FORMAT_NCHW, DATA_FORMAT_NHWC):
    raise ValueError('data_format has to be either NCHW or NHWC.')
  # Map the variable names used by the underlying layer class to the names
  # this function documents ('biases', 'depthwise_weights',
  # 'pointwise_weights').
  layer_variable_getter = _build_variable_getter({
      'bias': 'biases',
      'depthwise_kernel': 'depthwise_weights',
      'pointwise_kernel': 'pointwise_weights'
  })

  with variable_scope.variable_scope(
      scope,
      'SeparableConv2d', [inputs],
      reuse=reuse,
      custom_getter=layer_variable_getter) as sc:
    inputs = ops.convert_to_tensor(inputs)

    # The pointwise kernel falls back to the depthwise initializer.
    if pointwise_initializer is None:
      pointwise_initializer = weights_initializer

    # Translate the "N..." format string into the layer's channel convention.
    df = ('channels_first'
          if data_format and data_format.startswith('NC') else 'channels_last')
    if num_outputs is not None:
      # Apply separable conv using the SeparableConvolution2D layer.
      # The layer's own activation is disabled because normalization (if any)
      # and `activation_fn` are applied afterwards in this function. A bias is
      # requested only when no normalizer is given and a bias initializer is
      # provided; `use_bias` receives that expression's value as is.
      layer = convolutional_layers.SeparableConvolution2D(
          filters=num_outputs,
          kernel_size=kernel_size,
          strides=stride,
          padding=padding,
          data_format=df,
          dilation_rate=utils.two_element_tuple(rate),
          activation=None,
          depth_multiplier=depth_multiplier,
          use_bias=not normalizer_fn and biases_initializer,
          depthwise_initializer=weights_initializer,
          pointwise_initializer=pointwise_initializer,
          bias_initializer=biases_initializer,
          depthwise_regularizer=weights_regularizer,
          pointwise_regularizer=weights_regularizer,
          bias_regularizer=biases_regularizer,
          activity_regularizer=None,
          trainable=trainable,
          name=sc.name,
          dtype=inputs.dtype.base_dtype,
          _scope=sc,
          _reuse=reuse)
      outputs = layer.apply(inputs)

      # Add variables to collections.
      _add_variable_to_collections(layer.depthwise_kernel,
                                   variables_collections, 'weights')
      _add_variable_to_collections(layer.pointwise_kernel,
                                   variables_collections, 'weights')
      if layer.bias is not None:
        _add_variable_to_collections(layer.bias, variables_collections,
                                     'biases')

      if normalizer_fn is not None:
        normalizer_params = normalizer_params or {}
        outputs = normalizer_fn(outputs, **normalizer_params)
    else:
      # Actually apply depthwise conv instead of separable conv.
      dtype = inputs.dtype.base_dtype
      kernel_h, kernel_w = utils.two_element_tuple(kernel_size)
      stride_h, stride_w = utils.two_element_tuple(stride)
      num_filters_in = utils.channel_dimension(
          inputs.get_shape(), df, min_rank=4)
      weights_collections = utils.get_variable_collections(
          variables_collections, 'weights')

      depthwise_shape = [kernel_h, kernel_w, num_filters_in, depth_multiplier]
      depthwise_weights = variables.model_variable(
          'depthwise_weights',
          shape=depthwise_shape,
          dtype=dtype,
          initializer=weights_initializer,
          regularizer=weights_regularizer,
          trainable=trainable,
          collections=weights_collections)
      # The strides list follows the data layout: the spatial strides sit in
      # the last two positions for "NC..." formats, in the middle otherwise.
      strides = [1, 1, stride_h,
                 stride_w] if data_format.startswith('NC') else [
                     1, stride_h, stride_w, 1
                 ]

      outputs = nn.depthwise_conv2d(
          inputs,
          depthwise_weights,
          strides,
          padding,
          rate=utils.two_element_tuple(rate),
          data_format=data_format)
      # Without a pointwise stage the output depth is fixed by the depthwise
      # convolution; it is used below to size the bias vector.
      num_outputs = depth_multiplier * num_filters_in

      if normalizer_fn is not None:
        normalizer_params = normalizer_params or {}
        outputs = normalizer_fn(outputs, **normalizer_params)
      else:
        # Biases are only created when no normalizer is used and an
        # initializer is supplied.
        if biases_initializer is not None:
          biases_collections = utils.get_variable_collections(
              variables_collections, 'biases')
          biases = variables.model_variable(
              'biases',
              shape=[
                  num_outputs,
              ],
              dtype=dtype,
              initializer=biases_initializer,
              regularizer=biases_regularizer,
              trainable=trainable,
              collections=biases_collections)
          outputs = nn.bias_add(outputs, biases, data_format=data_format)

    if activation_fn is not None:
      outputs = activation_fn(outputs)
    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
2849
2850
@add_arg_scope
def sequence_to_images(inputs,
                       height,
                       output_data_format='channels_last',
                       outputs_collections=None,
                       scope=None):
  """Folds a batch of sequences into a batch of images.

  The middle dimension of `inputs` is split into
  `(num_batches // height, height)` and the result is transposed so that the
  sequence steps become the image width.

  Args:
    inputs: (num_steps, num_batches, depth) sequence tensor.
    height: the height of the images.
    output_data_format: Format of output tensor. `'channels_first'` yields
      (batch, depth, height, width); any other value is treated as
      `'channels_last'` and yields (batch, height, width, depth).
    outputs_collections: The collections to which the outputs are added.
    scope: Optional scope for name_scope.

  Returns:
    A 4-D tensor holding the images in the requested layout.
  """
  with ops.name_scope(scope, 'SequenceToImages', [inputs]) as sc:
    inputs = ops.convert_to_tensor(inputs)
    width, num_batches, depth = inputs.get_shape().as_list()
    # With an unknown static batch size the reshape infers it (-1); otherwise
    # every `height` consecutive sequences form one image.
    image_batch = -1 if num_batches is None else num_batches // height
    folded = array_ops.reshape(inputs, [width, image_batch, height, depth])
    channels_first = output_data_format == 'channels_first'
    permutation = [1, 3, 2, 0] if channels_first else [1, 2, 0, 3]
    outputs = array_ops.transpose(folded, permutation)
    return utils.collect_named_outputs(outputs_collections, sc, outputs)
2884
2885
@add_arg_scope
def softmax(logits, scope=None):
  """Applies softmax over the last dimension of an N-dimensional tensor.

  The logits are flattened to 2-D, passed through `nn.softmax`, and reshaped
  back, so for two-dimensional logits this reduces to tf.nn.softmax. The last
  dimension needs to have a specified number of elements (number of classes).

  Args:
    logits: N-dimensional `Tensor` with logits, where N > 1.
    scope: Optional scope for variable_scope.

  Returns:
    A `Tensor` with same shape and type as logits.
  """
  # TODO(jrru): Add axis argument which defaults to last dimension.
  with variable_scope.variable_scope(scope, 'softmax', [logits]):
    static_shape = logits.get_shape()
    num_classes = utils.last_dimension(static_shape, min_rank=2)
    flat_logits = array_ops.reshape(logits, [-1, num_classes])
    flat_predictions = nn.softmax(flat_logits)
    predictions = array_ops.reshape(flat_predictions, array_ops.shape(logits))
    if context.executing_eagerly():
      return predictions
    # In graph mode, re-attach the static shape after the dynamic reshape.
    predictions.set_shape(static_shape)
    return predictions
2909
2910
@add_arg_scope
def spatial_softmax(features,
                    temperature=None,
                    name=None,
                    variables_collections=None,
                    trainable=True,
                    data_format='NHWC'):
  """Computes the spatial softmax of a convolutional feature map.

  For every channel, a softmax is taken over all spatial locations of the
  feature map. The resulting distribution is used to compute the expected 2D
  position of the activation, giving a set of feature keypoints
  [x1, y1, ... xN, yN] for all N channels. `x` is the expected coordinate
  along the first spatial dimension and `y` along the second one.

  Read more here:
  "Learning visual feature spaces for robotic manipulation with
  deep spatial autoencoders." Finn et al., http://arxiv.org/abs/1509.06113.

  Args:
    features: A 4-D `Tensor`, the convolutional feature map. With `NHWC` the
      channels are the last dimension, with `NCHW` they are dimension 1; the
      remaining two non-batch dimensions are the spatial ones.
    temperature: Softmax temperature (optional). A scalar `temperature`
      variable is always created; it is initialized to this value, or to one
      when `None` is given.
    name: A name for this operation (optional).
    variables_collections: Collections for the temperature variable. Only
      consulted when `trainable` is `True`.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    data_format: A string. `NHWC` (default) and `NCHW` are supported.
  Returns:
    feature_keypoints: A `Tensor` with size [batch_size, num_channels * 2];
      the expected 2D locations of each channel's feature keypoint (normalized
      to the range (-1,1)). The inner dimension is arranged as
      [x1, y1, ... xN, yN].
  Raises:
    ValueError: If unexpected data_format specified.
    ValueError: If num_channels dimension is unspecified.
  """
  with variable_scope.variable_scope(name, 'spatial_softmax'):
    dynamic_shape = array_ops.shape(features)
    static_shape = features.shape
    # Spatial sizes are read dynamically; the channel count must be static.
    if data_format == DATA_FORMAT_NHWC:
      height = dynamic_shape[1]
      width = dynamic_shape[2]
      num_channels = static_shape[3]
    elif data_format == DATA_FORMAT_NCHW:
      num_channels = static_shape[1]
      height = dynamic_shape[2]
      width = dynamic_shape[3]
    else:
      raise ValueError('data_format has to be either NCHW or NHWC.')
    channel_count = tensor_shape.dimension_value(num_channels)
    if channel_count is None:
      raise ValueError('The num_channels dimension of the inputs to '
                       '`spatial_softmax` should be defined. Found `None`.')

    with ops.name_scope('spatial_softmax_op', 'spatial_softmax_op', [features]):
      # Coordinate grids scaled to [-1, 1], flattened to one entry per
      # spatial location.
      grid_x, grid_y = array_ops.meshgrid(
          math_ops.lin_space(-1., 1., num=height),
          math_ops.lin_space(-1., 1., num=width),
          indexing='ij')
      pos_x = array_ops.reshape(grid_x, [height * width])
      pos_y = array_ops.reshape(grid_y, [height * width])

      temp_initializer = (
          init_ops.ones_initializer() if temperature is None else
          init_ops.constant_initializer(temperature))
      temp_collections = (
          utils.get_variable_collections(variables_collections, 'temperature')
          if trainable else None)

      temperature = variables.model_variable(
          'temperature',
          shape=(),
          dtype=dtypes.float32,
          initializer=temp_initializer,
          collections=temp_collections,
          trainable=trainable)

      # Flatten to one row per (batch, channel) pair; NHWC inputs are first
      # transposed to channels-first.
      if data_format == 'NCHW':
        features = array_ops.reshape(features, [-1, height * width])
      else:
        channels_first = array_ops.transpose(features, [0, 3, 1, 2])
        features = array_ops.reshape(channels_first, [-1, height * width])

      attention = nn.softmax(features / temperature)
      expected_x = math_ops.reduce_sum(
          pos_x * attention, [1], keepdims=True)
      expected_y = math_ops.reduce_sum(
          pos_y * attention, [1], keepdims=True)
      expected_xy = array_ops.concat([expected_x, expected_y], 1)
      keypoint_size = channel_count * 2
      feature_keypoints = array_ops.reshape(expected_xy, [-1, keypoint_size])
      feature_keypoints.set_shape([None, keypoint_size])
  return feature_keypoints
3006
3007
def stack(inputs, layer, stack_args, **kwargs):
  """Chains repeated applications of `layer`, one per entry of `stack_args`.

  `stack` allows you to repeatedly apply the same operation with different
  arguments `stack_args[i]`. Each application receives its own scope, formed
  from the base scope name and an increasing 1-based number. For example:

  ```python
    y = stack(x, fully_connected, [32, 64, 128], scope='fc')
    # It is equivalent to:

    x = fully_connected(x, 32, scope='fc/fc_1')
    x = fully_connected(x, 64, scope='fc/fc_2')
    y = fully_connected(x, 128, scope='fc/fc_3')
  ```

  If the `scope` argument is not given in `kwargs`, it is set to
  `layer.__name__`, or `layer.func.__name__` (for `functools.partial`
  objects). If neither `__name__` nor `func.__name__` is available, the
  layers are called with `scope='stack'`.

  Args:
    inputs: A `Tensor` suitable for layer.
    layer: A layer with arguments `(inputs, *args, **kwargs)`
    stack_args: A list/tuple of parameters for each call of layer. An entry
      that is itself a list/tuple is unpacked into positional arguments.
    **kwargs: Extra kwargs for the layer.

  Returns:
    A `Tensor` result of applying the stacked layers.

  Raises:
    ValueError: If `stack_args` is not a list or tuple.
  """
  scope = kwargs.pop('scope', None)
  if not isinstance(stack_args, (list, tuple)):
    raise ValueError('stack_args need to be a list or tuple')
  with variable_scope.variable_scope(scope, 'Stack', [inputs]):
    inputs = ops.convert_to_tensor(inputs)
    if scope is None:
      if hasattr(layer, '__name__'):
        scope = layer.__name__
      elif hasattr(getattr(layer, 'func', None), '__name__'):
        # functools.partial keeps the wrapped callable in `func`.
        scope = layer.func.__name__
      else:
        scope = 'stack'
    outputs = inputs
    for position, layer_args in enumerate(stack_args, 1):
      kwargs['scope'] = scope + '_' + str(position)
      if isinstance(layer_args, (list, tuple)):
        positional = layer_args
      else:
        positional = [layer_args]
      outputs = layer(outputs, *positional, **kwargs)
    return outputs
3061
3062
@add_arg_scope
def unit_norm(inputs, dim, epsilon=1e-7, scope=None):
  """Scales the input to unit length along the specified dimension.

  Note that the rank of `input` must be known.

  Args:
    inputs: A `Tensor` of arbitrary size.
    dim: The dimension along which the input is normalized.
    epsilon: A small value added to the sum of squares before taking the
      square root, to avoid dividing by zero.
    scope: Optional scope for variable_scope.

  Returns:
    The normalized `Tensor`.

  Raises:
    ValueError: If the rank of `inputs` is unknown, or if `dim` is negative
      or not smaller than the rank of `inputs`.
  """
  with variable_scope.variable_scope(scope, 'UnitNorm', [inputs]):
    if not inputs.get_shape():
      raise ValueError('The input rank must be known.')
    input_rank = inputs.get_shape().ndims
    if not 0 <= dim < input_rank:
      raise ValueError('dim must be positive but smaller than the input rank.')

    squared_sum = math_ops.reduce_sum(
        math_ops.square(inputs), dim, keepdims=True)
    lengths = math_ops.sqrt(epsilon + squared_sum)

    # `lengths` has size 1 along `dim`; tile it back to the size of `inputs`
    # there and leave every other dimension untouched (multiple of 1).
    leading = [array_ops.ones([dim], dtypes.int32)] if dim > 0 else []
    along_dim = [
        array_ops.strided_slice(array_ops.shape(inputs), [dim], [dim + 1])
    ]
    num_trailing = input_rank - 1 - dim
    trailing = ([array_ops.ones([num_trailing], dtypes.int32)]
                if num_trailing > 0 else [])
    multiples = array_ops.concat(leading + along_dim + trailing, 0)
    return math_ops.div(inputs, array_ops.tile(lengths, multiples))
3099
3100
@add_arg_scope
def maxout(inputs, num_units, axis=-1, scope=None):
  """Adds a maxout op from https://arxiv.org/abs/1302.4389

  "Maxout Networks" Ian J. Goodfellow, David Warde-Farley, Mehdi Mirza, Aaron
  Courville, Yoshua Bengio

  The features along `axis` are split into `num_units` groups of consecutive
  features and every group is replaced by its maximum. Usually the operation
  is performed in the filter/channel dimension. This can also be used after
  fully-connected layers to reduce number of features.

  Arguments:
    inputs: Tensor input
    num_units: Specifies how many features will remain after maxout
      in the `axis` dimension (usually channel).
      This must be a factor of number of features.
    axis: The dimension where max pooling will be performed. Default is the
      last dimension. Negative values count from the end.
    scope: Optional scope for variable_scope.

  Returns:
    A `Tensor` representing the results of the pooling operation. It has the
    shape of `inputs` except that dimension `axis` has size `num_units`.

  Raises:
    ValueError: if the number of features along `axis` is not statically
      known, or is not a multiple of num_units.
  """
  with variable_scope.variable_scope(scope, 'MaxOut', [inputs]):
    inputs = ops.convert_to_tensor(inputs)
    static_shape = inputs.get_shape().as_list()
    # Index with the caller's `axis` before normalizing it, so an
    # out-of-range axis keeps raising IndexError.
    num_channels = static_shape[axis]
    if num_channels is None:
      raise ValueError('number of features along axis {} must be statically '
                       'known but is None'.format(axis))
    if num_channels % num_units:
      raise ValueError('number of features({}) is not '
                       'a multiple of num_units({})'.format(
                           num_channels, num_units))
    pool_axis = axis % len(static_shape)

    # Dealing with batches with arbitrary sizes
    dims = [
        array_ops.shape(inputs)[i] if size is None else size
        for i, size in enumerate(static_shape)
    ]
    # The group dimension has to sit directly after `pool_axis`: reshape is
    # row-major, so only then does each group hold consecutive features of
    # that axis. Appending it at the very end (as was done before) is only
    # correct when `pool_axis` is the last dimension.
    grouped_shape = (
        dims[:pool_axis] + [num_units, num_channels // num_units] +
        dims[pool_axis + 1:])
    outputs = math_ops.reduce_max(
        array_ops.reshape(inputs, grouped_shape), pool_axis + 1,
        keepdims=False)
    return outputs
3146
3147
def poincare_normalize(x, axis=1, epsilon=1e-5, name=None):
  """Project into the Poincare ball with norm <= 1.0 - epsilon.

  https://en.wikipedia.org/wiki/Poincare_ball_model

  Used in
  Poincare Embeddings for Learning Hierarchical Representations
  Maximilian Nickel, Douwe Kiela
  https://arxiv.org/pdf/1705.08039.pdf

  For a 1-D tensor with `axis = 0`, computes

                (x * (1 - epsilon)) / ||x||     if ||x|| > 1 - epsilon
      output =
                 x                              otherwise

  For `x` with more dimensions, independently normalizes each 1-D slice along
  dimension `axis`.

  Args:
    x: A `Tensor`.
    axis: Axis along which to normalize.  A scalar or a vector of
      integers.
    epsilon: A small deviation from the edge of the unit sphere for numerical
      stability.
    name: A name for this operation (optional).

  Returns:
    A `Tensor` with the same shape as `x`.
  """
  with ops.name_scope(name, 'poincare_normalize', [x]) as scope_name:
    x = ops.convert_to_tensor(x, name='x')
    squared_norm = math_ops.reduce_sum(
        math_ops.square(x), axis, keepdims=True)
    inverse_norm = math_ops.rsqrt(squared_norm)
    # Capping the factor at 1 leaves slices already inside the ball unchanged.
    scale = math_ops.minimum((1. - epsilon) * inverse_norm, 1.)
    return math_ops.multiply(x, scale, name=scope_name)
3184
3185
def legacy_fully_connected(x,
                           num_output_units,
                           activation_fn=None,
                           weight_init=initializers.xavier_initializer(),
                           bias_init=init_ops.zeros_initializer(),
                           name=None,
                           weight_collections=(ops.GraphKeys.WEIGHTS,),
                           bias_collections=(ops.GraphKeys.BIASES,),
                           output_collections=(ops.GraphKeys.ACTIVATIONS,),
                           trainable=True,
                           weight_regularizer=None,
                           bias_regularizer=None):
  # pylint: disable=anomalous-backslash-in-string
  r"""Creates the variables of a fully connected layer and returns its output.

  The layer computes `y = f(w * x + b)`, a matrix multiply followed by an
  optional bias and the activation `f` given by `activation_fn`. When
  `activation_fn` is `None`, the linear result `y = w * x + b` is returned.

  If `x` has shape [\\(\text{dim}_0, \text{dim}_1, ..., \text{dim}_n\\)]
  with more than 2 dimensions (\\(n > 1\\)), the matrix multiply is repeated
  along the leading dimensions. The result r is a tensor of shape
  [\\(\text{dim}_0, ..., \text{dim}_{n-1},\\) `num_output_units`],
  where \\( r_{i_0, ..., i_{n-1}, k} =
  \sum_{0 \leq j < \text{dim}_n} x_{i_0, ... i_{n-1}, j} \cdot w_{j, k}\\).
  To do so, `x` is first reshaped to the 2-D shape
  [\\(\text{dim}_0 \cdot ... \cdot \text{dim}_{n-1}, \text{dim}_n\\)],
  multiplied, and the product is reshaped back to
  [\\(\text{dim}_0, ..., \text{dim}_{n-1},\\) `num_output_units`].

  This op creates `w` and optionally `b`. Bias (`b`) can be disabled by setting
  `bias_init` to `None`.

  The variables are created through `get_variable`, so they are compatible
  with `tf.variable_scope` and can be reused with `tf.variable_scope` or
  `tf.make_template`.

  Variable creation is controlled mostly through the initializers
  (`weight_init` and `bias_init`) and the collections that receive the
  variables (`weight_collections` and `bias_collections`; the variables are
  always added to the `VARIABLES` collection as well). The output of the layer
  can be placed in custom collections using `output_collections`. The
  collections arguments default to `WEIGHTS`, `BIASES` and `ACTIVATIONS`,
  respectively.

  Per layer regularization is available through `weight_regularizer` and
  `bias_regularizer`, which are applied to the weights and biases
  respectively, and whose output is added to the `REGULARIZATION_LOSSES`
  collection.

  Args:
    x: The input `Tensor`.
    num_output_units: The size of the output.
    activation_fn: Activation function, default set to None to skip it and
      maintain a linear activation.
    weight_init: An optional weight initialization, defaults to
      `xavier_initializer`.
    bias_init: An initializer for the bias, defaults to 0. Set to `None` in
      order to disable bias.
    name: The name for this operation is used to name operations and to find
      variables. If specified it must be unique for this scope, otherwise a
      unique name starting with "fully_connected" will be created.  See
      `tf.variable_scope` for details.
    weight_collections: List of graph collections to which weights are added.
    bias_collections: List of graph collections to which biases are added.
    output_collections: List of graph collections to which outputs are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    weight_regularizer: A regularizer like the result of
      `l1_regularizer` or `l2_regularizer`. Used for weights.
    bias_regularizer: A regularizer like the result of
      `l1_regularizer` or `l2_regularizer`. Used for biases.

  Returns:
    The output of the fully connected layer.

  Raises:
    ValueError: If x has rank less than 2 or if its last dimension is not set.
  """
  with variable_scope.variable_scope(name, 'fully_connected', [x]):
    x = ops.convert_to_tensor(x)
    dims = x.get_shape().dims
    if dims is None:
      raise ValueError('dims of x must be known but is None')
    rank = len(dims)
    if rank < 2:
      raise ValueError('rank of x must be at least 2 not: %d' % rank)
    num_input_units = dims[-1].value
    if num_input_units is None:
      raise ValueError('last dimension of x must be known but is None')
    dtype = x.dtype.base_dtype
    needs_flattening = rank > 2

    # The weights always go into GLOBAL_VARIABLES next to the requested
    # collections.
    weight_collections = set(weight_collections or [])
    weight_collections.add(ops.GraphKeys.GLOBAL_VARIABLES)
    w = variable_scope.get_variable(
        'weights',
        shape=[num_input_units, num_output_units],
        dtype=dtype,
        initializer=weight_init,
        collections=weight_collections,
        regularizer=weight_regularizer,
        trainable=trainable)
    if needs_flattening:
      x_2_dim = array_ops.reshape(x, [-1, num_input_units])
    else:
      x_2_dim = x
    y = standard_ops.matmul(x_2_dim, w)

    if bias_init is not None:
      bias_collections = set(bias_collections or [])
      bias_collections.add(ops.GraphKeys.GLOBAL_VARIABLES)
      b = variable_scope.get_variable(
          'bias',
          shape=[num_output_units],
          dtype=dtype,
          initializer=bias_init,
          collections=bias_collections,
          regularizer=bias_regularizer,
          trainable=trainable)
      y = nn.bias_add(y, b)

    if needs_flattening:
      # Restore the leading dimensions of `x`, dynamically for the values and
      # statically for shape inference.
      leading_dims = array_ops.unstack(array_ops.shape(x))[:-1]
      y = array_ops.reshape(
          y, array_ops.stack(leading_dims + [num_output_units]))
      y.set_shape(x.get_shape().as_list()[:-1] + [num_output_units])

    return _apply_activation(y, activation_fn, output_collections)
3316
3317
# TODO(eiderm): Verify and fix autocomplete in colab (also relu6).
# Convenience aliases built with functools.partial: each one pre-binds
# `activation_fn`, so callers normally do not pass it.
elu = functools.partial(fully_connected, activation_fn=nn.elu)
legacy_relu = functools.partial(legacy_fully_connected, activation_fn=nn.relu)
legacy_linear = functools.partial(legacy_fully_connected, activation_fn=None)
relu = functools.partial(fully_connected, activation_fn=nn.relu)
relu6 = functools.partial(fully_connected, activation_fn=nn.relu6)
linear = functools.partial(fully_connected, activation_fn=None)

# Short-name aliases for the convolution layer functions.
conv1d = convolution1d
conv2d = convolution2d
conv3d = convolution3d
conv2d_transpose = convolution2d_transpose
conv3d_transpose = convolution3d_transpose
conv2d_in_plane = convolution2d_in_plane
separable_conv2d = separable_convolution2d
3335