1# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15
16"""Operations for clipping (gradient, weight) tensors to min/max values."""
17from __future__ import absolute_import
18from __future__ import division
19from __future__ import print_function
20
21import six
22
23from tensorflow.python.framework import constant_op
24from tensorflow.python.framework import dtypes
25from tensorflow.python.framework import ops
26from tensorflow.python.ops import array_ops
27from tensorflow.python.ops import gen_array_ops
28from tensorflow.python.ops import gen_nn_ops
29from tensorflow.python.ops import math_ops
30from tensorflow.python.util import deprecation
31from tensorflow.python.util import dispatch
32from tensorflow.python.util.compat import collections_abc
33from tensorflow.python.util.tf_export import tf_export
34
35
@tf_export("clip_by_value")
@dispatch.add_dispatch_support
def clip_by_value(t, clip_value_min, clip_value_max,
                  name=None):
  """Clips tensor values to a specified min and max.

  Returns a tensor with the same type and shape as `t` in which every value
  below `clip_value_min` has been replaced by `clip_value_min` and every value
  above `clip_value_max` has been replaced by `clip_value_max`.

  Note: `clip_value_min` needs to be smaller or equal to `clip_value_max` for
  correct results.

  For example:

  Basic usage passes a scalar as the min and max value.

  >>> t = tf.constant([[-10., -1., 0.], [0., 2., 10.]])
  >>> t2 = tf.clip_by_value(t, clip_value_min=-1, clip_value_max=1)
  >>> t2.numpy()
  array([[-1., -1.,  0.],
         [ 0.,  1.,  1.]], dtype=float32)

  The min and max can be the same size as `t`, or broadcastable to that size.

  >>> t = tf.constant([[-1, 0., 10.], [-1, 0, 10]])
  >>> clip_min = [[2],[1]]
  >>> t3 = tf.clip_by_value(t, clip_value_min=clip_min, clip_value_max=100)
  >>> t3.numpy()
  array([[ 2.,  2., 10.],
         [ 1.,  1., 10.]], dtype=float32)

  Broadcasting fails, intentionally, if you would expand the dimensions of `t`

  >>> t = tf.constant([[-1, 0., 10.], [-1, 0, 10]])
  >>> clip_min = [[[2, 1]]] # Has a third axis
  >>> t4 = tf.clip_by_value(t, clip_value_min=clip_min, clip_value_max=100)
  Traceback (most recent call last):
  ...
  InvalidArgumentError: Incompatible shapes: [2,3] vs. [1,1,2]

  It throws a `TypeError` if you try to clip an `int` to a `float` value
  (`tf.cast` the input to `float` first).

  >>> t = tf.constant([[1, 2], [3, 4]], dtype=tf.int32)
  >>> t5 = tf.clip_by_value(t, clip_value_min=-3.1, clip_value_max=3.1)
  Traceback (most recent call last):
  ...
  TypeError: Cannot convert ...


  Args:
    t: A `Tensor` or `IndexedSlices`. For `IndexedSlices` only the `values`
      are clipped; `indices` and `dense_shape` are carried over unchanged.
    clip_value_min: The minimum value to clip to. A scalar `Tensor` or one that
      is broadcastable to the shape of `t`.
    clip_value_max: The maximum value to clip to. A scalar `Tensor` or one that
      is broadcastable to the shape of `t`.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.

  Raises:
    `tf.errors.InvalidArgumentError`: If the clip tensors would trigger array
      broadcasting that would make the returned tensor larger than the input.
    TypeError: If dtype of the input is `int32` and dtype of
      the `clip_value_min` or `clip_value_max` is `float32`
  """
  is_indexed_slices = isinstance(t, ops.IndexedSlices)
  with ops.name_scope(name, "clip_by_value",
                      [t, clip_value_min, clip_value_max]) as name:
    dense_input = t.values if is_indexed_slices else t
    values = ops.convert_to_tensor(dense_input, name="t")

    # Apply the upper bound first. The result must still be shape-compatible
    # with the input; otherwise the bound silently broadcast `t` to a larger
    # shape, which is rejected on purpose.
    capped = math_ops.minimum(values, clip_value_max)
    values.shape.assert_is_compatible_with(capped.shape)

    # Then apply the lower bound; this op carries the scope name. The same
    # shape check guards against broadcasting introduced by `clip_value_min`.
    clipped = math_ops.maximum(capped, clip_value_min, name=name)
    values.shape.assert_is_compatible_with(clipped.shape)

    if is_indexed_slices:
      # Re-wrap the clipped values with the original sparse structure.
      return ops.IndexedSlices(clipped, t.indices, t.dense_shape)

    return clipped
  # TODO(scottzhu): switch to use new implementation in 2 weeks.
  # return gen_math_ops.clip_by_value(
  #     t, clip_value_min, clip_value_max, name=name)
126
127
128# TODO(scottzhu): switch to use new implementation in 2 weeks.
129# @ops.RegisterGradient("ClipByValue")
def _clip_by_value_grad(op, grad):
  """Returns grad of clip_by_value.

  The incoming gradient is routed to exactly one of the three op inputs per
  element: to the lower bound where the input is below it, to the upper bound
  where the input is above it, and to the input itself everywhere else.

  Args:
    op: The op being differentiated. `op.inputs[0]` is the clipped value,
      `op.inputs[1]` the lower bound and `op.inputs[2]` the upper bound.
    grad: Gradient with respect to the op's output.

  Returns:
    A tuple `(input_grad, lower_grad, upper_grad)`, each summed over its
    broadcast axes and reshaped to the shape of the corresponding input.
  """
  inp = op.inputs[0]
  lower = op.inputs[1]
  upper = op.inputs[2]

  inp_shape = array_ops.shape(inp)
  lower_shape = array_ops.shape(lower)
  upper_shape = array_ops.shape(upper)
  zero_grad = array_ops.zeros(array_ops.shape(grad), grad.dtype)

  # Element-wise masks marking where each bound was the active one.
  below_lower = math_ops.less(inp, lower)
  above_upper = math_ops.greater(inp, upper)

  # Axes along which each gradient has to be summed to undo broadcasting.
  # NOTE(review): the input's reduction axes from the (input, lower) pair are
  # discarded and only those from the (input, upper) pair are used -- confirm
  # this is intended before registering this gradient.
  _, lower_axes = gen_array_ops.broadcast_gradient_args(inp_shape, lower_shape)
  inp_axes, upper_axes = gen_array_ops.broadcast_gradient_args(
      inp_shape, upper_shape)

  was_clipped = math_ops.logical_or(below_lower, above_upper)
  inp_grad = array_ops.where(was_clipped, zero_grad, grad)
  lower_grad = array_ops.where(below_lower, grad, zero_grad)
  upper_grad = array_ops.where(above_upper, grad, zero_grad)

  return (
      array_ops.reshape(math_ops.reduce_sum(inp_grad, inp_axes), inp_shape),
      array_ops.reshape(
          math_ops.reduce_sum(lower_grad, lower_axes), lower_shape),
      array_ops.reshape(
          math_ops.reduce_sum(upper_grad, upper_axes), upper_shape),
  )
152
153
@tf_export("clip_by_norm")
@dispatch.add_dispatch_support
def clip_by_norm(t, clip_norm, axes=None, name=None):
  """Clips tensor values to a maximum L2-norm.

  Rescales `t` so that its L2-norm, computed along the dimensions given in
  `axes`, is at most `clip_norm`. In the default case, where all dimensions
  are used for the calculation, `t` is returned unmodified if its L2-norm is
  already less than or equal to `clip_norm`. Otherwise the result is a tensor
  of the same type and shape as `t` with its values set to:

  `t * clip_norm / l2norm(t)`

  In this case, the L2-norm of the output tensor is `clip_norm`.

  As another example, if `t` is a matrix and `axes == [1]`, then each row
  of the output will have L2-norm less than or equal to `clip_norm`. If
  `axes == [0]` instead, each column of the output will be clipped.

  Code example:

  >>> some_nums = tf.constant([[1, 2, 3, 4, 5]], dtype=tf.float32)
  >>> tf.clip_by_norm(some_nums, 2.0).numpy()
  array([[0.26967996, 0.5393599 , 0.80903983, 1.0787199 , 1.3483998 ]],
        dtype=float32)

  This operation is typically used to clip gradients before applying them with
  an optimizer.  Most gradient data is a collection of different shaped tensors
  for different parts of the model.  Thus, this is a common usage:

  ```
  # Get your gradients after training
  loss_value, grads = grad(model, features, labels)

  # Apply some clipping
  grads = [tf.clip_by_norm(g, norm)
               for g in grads]

  # Continue on with training
  optimizer.apply_gradients(grads)
  ```

  Args:
    t: A `Tensor` or `IndexedSlices`.  This must be a floating point type.
      For `IndexedSlices` only the `values` are clipped.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value, also
      floating point
    axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions
      to use for computing the L2-norm. If `None` (the default), uses all
      dimensions.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.

  Raises:
    ValueError: If the clip_norm tensor is not a 0-D scalar tensor.
    TypeError: If dtype of the input is not a floating point or
      complex type.
  """
  is_indexed_slices = isinstance(t, ops.IndexedSlices)
  with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as name:
    dense_input = t.values if is_indexed_slices else t
    values = ops.convert_to_tensor(dense_input, name="t")

    # Sum of squares along `axes`; reduced dimensions are kept so the result
    # broadcasts against `values` below.
    squared_sum = math_ops.reduce_sum(values * values, axes, keepdims=True)
    is_positive = squared_sum > 0
    # Double `where`: sqrt only ever sees strictly positive inputs, so its
    # gradient stays finite (no NaN) where the squared sum is zero.
    sqrt_input = array_ops.where(
        is_positive, squared_sum, array_ops.ones_like(squared_sum))
    norm = array_ops.where(is_positive, math_ops.sqrt(sqrt_input), squared_sum)

    scaled = values * clip_norm
    # The product must stay shape-compatible with the input; this rejects a
    # `clip_norm` that would unintentionally broadcast `t`.
    values.shape.assert_is_compatible_with(scaled.shape)

    # Dividing by max(norm, clip_norm) leaves tensors whose norm is already
    # within the limit unchanged and shrinks the others onto the limit.
    divisor = math_ops.maximum(norm, clip_norm)
    clipped = array_ops.identity(scaled / divisor, name=name)

    if not is_indexed_slices:
      return clipped

    # Re-wrap the clipped values with the original sparse structure.
    return ops.IndexedSlices(clipped, t.indices, t.dense_shape)
236
237
@tf_export("linalg.global_norm", v1=["linalg.global_norm", "global_norm"])
@dispatch.add_dispatch_support
@deprecation.deprecated_endpoints("global_norm")
def global_norm(t_list, name=None):
  """Computes the global norm of multiple tensors.

  For a tuple or list of tensors `t_list`, returns the L2-norm taken over the
  elements of all tensors together, i.e.:

  `global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))`

  Any entries in `t_list` that are of type None are ignored.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    name: A name for the operation (optional).

  Returns:
    A 0-D (scalar) `Tensor` of type `float`.

  Raises:
    TypeError: If `t_list` is not a sequence.
  """
  is_sequence = isinstance(t_list, collections_abc.Sequence)
  # Strings are sequences too, but never a valid list of tensors.
  if not is_sequence or isinstance(t_list, six.string_types):
    raise TypeError("t_list should be a sequence")

  t_list = list(t_list)
  with ops.name_scope(name, "global_norm", t_list):
    # First pass: convert every entry to a dense tensor, keeping None
    # placeholders; for IndexedSlices only the stored values are used.
    values = []
    for i, t in enumerate(t_list):
      if t is None:
        values.append(None)
        continue
      dense = t.values if isinstance(t, ops.IndexedSlices) else t
      values.append(ops.convert_to_tensor(dense, name="t_%d" % i))

    # Second pass: one l2_loss per tensor, placed next to that tensor.
    half_squared_norms = []
    for v in values:
      if v is None:
        continue
      with ops.colocate_with(v):
        half_squared_norms.append(gen_nn_ops.l2_loss(v))

    half_squared_total = math_ops.reduce_sum(
        array_ops.stack(half_squared_norms))

    # The accumulated terms are half squared norms (hence the variable name),
    # so scale by two before taking the square root.
    two = constant_op.constant(2.0, dtype=half_squared_total.dtype)
    return math_ops.sqrt(half_squared_total * two, name="global_norm")
287
288
@tf_export("clip_by_global_norm")
@dispatch.add_dispatch_support
def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
  """Clips values of multiple tensors by the ratio of the sum of their norms.

  Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`,
  this operation returns a list of clipped tensors `list_clipped`
  and the global norm (`global_norm`) of all tensors in `t_list`. Optionally,
  if you've already computed the global norm for `t_list`, you can specify
  the global norm with `use_norm`.

  To perform the clipping, the values `t_list[i]` are set to:

      t_list[i] * clip_norm / max(global_norm, clip_norm)

  where:

      global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))

  If `clip_norm > global_norm` then the entries in `t_list` remain as they are,
  otherwise they're all shrunk by the global ratio.

  If `global_norm == infinity` then the entries in `t_list` are all set to `NaN`
  to signal that an error occurred.

  Any of the entries of `t_list` that are of type `None` are ignored.

  This is the correct way to perform gradient clipping (Pascanu et al., 2012).

  However, it is slower than `clip_by_norm()` because all the parameters must be
  ready before the clipping operation can be performed.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio.
    use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global
      norm to use. If not provided, `global_norm()` is used to compute the norm.
    name: A name for the operation (optional).

  Returns:
    list_clipped: A list of `Tensors` of the same type as `t_list`. `None`
      entries of `t_list` stay `None` at the same position.
    global_norm: A 0-D (scalar) `Tensor` representing the global norm.

  Raises:
    TypeError: If `t_list` is not a sequence.

  References:
    On the difficulty of training Recurrent Neural Networks:
      [Pascanu et al., 2012](http://proceedings.mlr.press/v28/pascanu13.html)
      ([pdf](http://proceedings.mlr.press/v28/pascanu13.pdf))
  """
  is_sequence = isinstance(t_list, collections_abc.Sequence)
  # Strings are sequences too, but never a valid list of tensors.
  if not is_sequence or isinstance(t_list, six.string_types):
    raise TypeError("t_list should be a sequence")

  t_list = list(t_list)
  if use_norm is None:
    use_norm = global_norm(t_list, name)

  with ops.name_scope(name, "clip_by_global_norm",
                      t_list + [clip_norm]) as name:
    # scale = clip_norm * min(1 / norm, 1 / clip_norm), which equals
    # clip_norm / max(norm, clip_norm) for positive finite inputs.
    inverse_norm = 1.0 / use_norm
    one = constant_op.constant(1.0, dtype=use_norm.dtype)
    inverse_clip = one / clip_norm
    finite_scale = clip_norm * math_ops.minimum(inverse_norm, inverse_clip)
    # `use_norm - use_norm` is zero for any finite norm and NaN for
    # inf/-inf/NaN, so adding it poisons the scale only in the non-finite case.
    non_finite_marker = use_norm - use_norm
    scale = finite_scale + non_finite_marker

    # Convert every entry to a dense tensor, keeping None placeholders; for
    # IndexedSlices only the stored values are scaled.
    values = []
    for i, t in enumerate(t_list):
      if t is None:
        values.append(None)
        continue
      dense = t.values if isinstance(t, ops.IndexedSlices) else t
      values.append(ops.convert_to_tensor(dense, name="t_%d" % i))

    list_clipped = []
    for i, (t, v) in enumerate(zip(t_list, values)):
      if v is None:
        list_clipped.append(None)
        continue
      # Scale each tensor next to where it lives.
      with ops.colocate_with(v):
        clipped = array_ops.identity(v * scale, name="%s_%d" % (name, i))
      if isinstance(t, ops.IndexedSlices):
        # Restore the sparse structure around the scaled values.
        clipped = ops.IndexedSlices(clipped, t.indices, t.dense_shape)
      list_clipped.append(clipped)

  return list_clipped, use_norm
380
381
@deprecation.deprecated(
    date=None,
    instructions="clip_by_average_norm is deprecated in TensorFlow 2.0. Please "
    "use clip_by_norm(t, clip_norm * tf.cast(tf.size(t), tf.float32), name) "
    "instead.")
@tf_export(v1=["clip_by_average_norm"])
@dispatch.add_dispatch_support
def clip_by_average_norm(t, clip_norm, name=None):
  """Clips tensor values to a maximum average L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its average L2-norm is less than or equal to
  `clip_norm`. The average L2-norm is the L2-norm of `t` divided by its number
  of elements. Specifically, if the average L2-norm is already less than or
  equal to `clip_norm`, then `t` is not modified. If the average L2-norm is
  greater than `clip_norm`, then this operation returns a tensor of the same
  type and shape as `t` with its values set to:

  `t * clip_norm / l2norm_avg(t)`

  In this case, the average L2-norm of the output tensor is `clip_norm`.

  This operation is typically used to clip gradients before applying them with
  an optimizer.

  Args:
    t: A `Tensor` of a floating point type. The element count and the helper
      constant are created in the dtype of `t`, so inputs other than
      `float32` (e.g. `float64`) are supported.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value. If it is
      a `Tensor` it must have the same dtype as `t`.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor`.
  """
  with ops.name_scope(name, "clip_by_average_norm", [t, clip_norm]) as name:
    t = ops.convert_to_tensor(t, name="t")

    # Calculate L2-norm per element, clip elements by ratio of clip_norm to
    # L2-norm per element.
    # The element count is cast to the dtype of `t` (not a fixed float32) so it
    # can be multiplied with the inverse norm for any floating point input.
    # NOTE: for low-precision dtypes a very large element count may not be
    # representable exactly in `t.dtype`.
    n_element = math_ops.cast(array_ops.size(t), t.dtype)
    # Inverse L2-norm over all dimensions of `t`.
    l2norm_inv = math_ops.rsqrt(
        math_ops.reduce_sum(t * t, math_ops.range(array_ops.rank(t))))
    # scale = clip_norm * min(n / l2norm, 1 / clip_norm)
    #       = min(clip_norm / l2norm_avg, 1)
    # so `t` passes through unchanged when its average norm is within bounds.
    tclip = array_ops.identity(
        t * clip_norm * math_ops.minimum(
            l2norm_inv * n_element,
            constant_op.constant(1.0, dtype=t.dtype) / clip_norm),
        name=name)

  return tclip
428