1# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Implementation of Loss operations for use in neural networks."""
16
17from __future__ import absolute_import
18from __future__ import division
19from __future__ import print_function
20
21from tensorflow.python.eager import context
22from tensorflow.python.framework import dtypes
23from tensorflow.python.framework import ops
24from tensorflow.python.ops import array_ops
25from tensorflow.python.ops import confusion_matrix
26from tensorflow.python.ops import control_flow_ops
27from tensorflow.python.ops import math_ops
28from tensorflow.python.ops import nn
29from tensorflow.python.ops import nn_ops
30from tensorflow.python.ops import weights_broadcast_ops
31from tensorflow.python.ops.losses import util
32from tensorflow.python.util import dispatch
33from tensorflow.python.util.deprecation import deprecated_args
34from tensorflow.python.util.deprecation import deprecated_argument_lookup
35from tensorflow.python.util.tf_export import tf_export
36
37
38@tf_export(v1=["losses.Reduction"])
class Reduction(object):
  """Enumerates the supported ways of reducing weighted losses.

  Every attribute is a string key:

  * `NONE`: No reduction; the weighted losses keep the shape of the input.
  * `SUM`: Scalar sum of the weighted losses.
  * `MEAN`: Scalar `SUM` divided by the sum of the weights. DEPRECATED.
  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by the number of elements in
     the losses.
  * `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by the number of non-zero
     weights. DEPRECATED.
  * `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`. DEPRECATED.
  """

  NONE = "none"
  SUM = "weighted_sum"
  MEAN = "weighted_mean"
  SUM_OVER_BATCH_SIZE = "weighted_sum_over_batch_size"
  SUM_BY_NONZERO_WEIGHTS = "weighted_sum_by_nonzero_weights"
  # Both spellings share one key, so they are interchangeable at runtime.
  SUM_OVER_NONZERO_WEIGHTS = SUM_BY_NONZERO_WEIGHTS

  @classmethod
  def all(cls):
    """Returns a tuple holding the key of every reduction attribute.

    The two non-zero-weights spellings share a key, so it is listed twice.
    """
    leading_keys = (cls.NONE, cls.SUM, cls.MEAN, cls.SUM_OVER_BATCH_SIZE)
    return leading_keys + (
        cls.SUM_OVER_NONZERO_WEIGHTS, cls.SUM_BY_NONZERO_WEIGHTS)

  @classmethod
  def validate(cls, key):
    """Raises `ValueError` unless `key` is one of the keys in `all()`."""
    if key in cls.all():
      return
    raise ValueError("Invalid Reduction Key %s." % key)
74
75
def _safe_mean(losses, num_present):
  """Divides the summed `losses` by `num_present` without producing NaN.

  Args:
    losses: `Tensor` whose elements contain individual loss measurements.
    num_present: The number of measurable elements in `losses`.

  Returns:
    A scalar holding the sum of `losses` divided by `num_present`. When
      `num_present` is zero, zero is returned instead.
  """
  # Sum first, then divide; `div_no_nan` covers the zero-denominator case.
  return math_ops.div_no_nan(
      math_ops.reduce_sum(losses), num_present, name="value")
89
90
def _num_present(losses, weights, per_batch=False):
  """Computes the number of elements in the loss function induced by `weights`.

  A given weights tensor induces different numbers of usable elements in the
  `losses` tensor. The `weights` tensor is broadcast across `losses` for all
  possible dimensions. For example, if `losses` is a tensor of dimension
  `[4, 5, 6, 3]` and `weights` is a tensor of shape `[4, 5]`, then `weights` is,
  in effect, tiled to match the shape of `losses`. Following this effective
  tile, the total number of present elements is the number of non-zero weights.

  Args:
    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
    weights: `Tensor` of shape `[]`, `[batch_size]` or
      `[batch_size, d1, ... dK]`, where K < N.
    per_batch: Whether to return the number of elements per batch or as a sum
      total.

  Returns:
    The number of present (non-zero) elements in the losses tensor. If
      `per_batch` is `True`, the count is reduced over every axis except the
      first with `keepdims=True`, giving one count per batch entry. Otherwise,
      a single scalar tensor is returned.
  """
  # Fast path: a non-zero scalar weight marks every element as present, so the
  # total is simply the size of `losses`. The shortcut produces one scalar for
  # the whole tensor, so it must not be taken when per-batch counts are
  # requested; otherwise callers would receive the total instead of the
  # per-entry counts described above.
  if not per_batch and (
      (isinstance(weights, float) and weights != 0.0) or
      (context.executing_eagerly() and weights._rank() == 0  # pylint: disable=protected-access
       and not math_ops.equal(weights, 0.0))):
    return _num_elements(losses)
  with ops.name_scope(None, "num_present", (losses, weights)) as scope:
    weights = math_ops.cast(weights, dtype=dtypes.float32)
    # 1.0 wherever the weight is non-zero, 0.0 elsewhere.
    present = array_ops.where(
        math_ops.equal(weights, 0.0),
        array_ops.zeros_like(weights),
        array_ops.ones_like(weights))
    present = weights_broadcast_ops.broadcast_weights(present, losses)
    if per_batch:
      # Collapse every axis but the leading one, keeping the rank intact.
      return math_ops.reduce_sum(
          present,
          axis=math_ops.range(1, array_ops.rank(present)),
          keepdims=True,
          name=scope)
    return math_ops.reduce_sum(present, name=scope)
131
132
def _num_elements(losses):
  """Returns the element count of `losses`, cast to the dtype of `losses`."""
  with ops.name_scope(None, "num_elements", values=[losses]) as scope:
    element_count = array_ops.size(losses, name=scope)
    return math_ops.cast(element_count, dtype=losses.dtype)
137
138
@tf_export(v1=["losses.compute_weighted_loss"])
@dispatch.add_dispatch_support
def compute_weighted_loss(
    losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Applies `weights` to `losses` and reduces the result.

  The computation is carried out in `float32` and the result is cast back to
  the dtype of `losses` before it is added to `loss_collection` and returned.

  Args:
    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `losses`, and must be broadcastable to `losses` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    scope: the scope for the operations performed in computing the loss.
    loss_collection: the loss will be added to these collections.
    reduction: Type of reduction to apply to loss; one of the `Reduction` keys.

  Returns:
    Weighted loss `Tensor` of the same type as `losses`. If `reduction` is
    `NONE`, this has the same shape as `losses`; otherwise, it is scalar.

  Raises:
    ValueError: If `reduction` is not a valid `Reduction` key, if `weights` is
      `None` or the shape is not compatible with `losses`, or if the number of
      dimensions (rank) of either `losses` or `weights` is missing.

  Note:
    When calculating the gradient of a weighted loss contributions from
    both `losses` and `weights` are considered. If your `weights` depend
    on some model parameters but you do not want this to affect the loss
    gradient, you need to apply `tf.stop_gradient` to `weights` before
    passing them to `compute_weighted_loss`.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  Reduction.validate(reduction)
  with ops.name_scope(scope, "weighted_loss", (losses, weights)):
    # Record the `reduction` on the graph so the loss can be normalized when
    # distributing to multiple replicas. Used only for estimator + v1 optimizer
    # flow.
    ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access

    broadcast_check = weights_broadcast_ops.assert_broadcastable(
        weights, losses)
    with ops.control_dependencies((broadcast_check,)):
      losses_tensor = ops.convert_to_tensor(losses)
      input_dtype = losses_tensor.dtype
      float_losses = math_ops.cast(losses_tensor, dtype=dtypes.float32)
      float_weights = math_ops.cast(weights, dtype=dtypes.float32)
      weighted = math_ops.multiply(float_losses, float_weights)

      if reduction == Reduction.NONE:
        result = weighted
      else:
        # Every remaining reduction starts from the plain sum; `SUM` stops
        # here, the others divide by a reduction-specific denominator.
        result = math_ops.reduce_sum(weighted)
        nonzero_weight_keys = (Reduction.SUM_BY_NONZERO_WEIGHTS,
                               Reduction.SUM_OVER_NONZERO_WEIGHTS)
        if reduction == Reduction.MEAN:
          weight_total = math_ops.reduce_sum(
              array_ops.ones_like(float_losses) * float_weights)
          result = _safe_mean(result, weight_total)
        elif reduction in nonzero_weight_keys:
          result = _safe_mean(
              result, _num_present(float_losses, float_weights))
        elif reduction == Reduction.SUM_OVER_BATCH_SIZE:
          result = _safe_mean(result, _num_elements(float_losses))

      # Hand the result back in the caller's original dtype.
      result = math_ops.cast(result, input_dtype)
      util.add_loss(result, loss_collection)
      return result
206
207
@tf_export(v1=["losses.absolute_difference"])
@dispatch.add_dispatch_support
def absolute_difference(
    labels, predictions, weights=1.0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Adds an Absolute Difference loss to the training procedure.

  The element-wise loss is `|predictions - labels|`, computed in `float32`.

  `weights` acts as a coefficient for the loss. A scalar simply scales the
  loss. A `Tensor` of shape `[batch_size]` rescales the total loss of each
  sample in the batch by the matching element. When the shape of `weights`
  matches the shape of `predictions`, every measurable element of
  `predictions` has its loss scaled by the corresponding value of `weights`.

  Args:
    labels: The ground truth output tensor, same dimensions as 'predictions'.
    predictions: The predicted outputs.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which this loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
    shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of
      `labels` or if the shape of `weights` is invalid or if `labels`
      or `predictions` is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  scope_values = (predictions, labels, weights)
  with ops.name_scope(
      scope, "absolute_difference", scope_values) as loss_scope:
    float_predictions = math_ops.cast(predictions, dtype=dtypes.float32)
    float_labels = math_ops.cast(labels, dtype=dtypes.float32)
    float_predictions.get_shape().assert_is_compatible_with(
        float_labels.get_shape())
    residual = math_ops.subtract(float_predictions, float_labels)
    losses = math_ops.abs(residual)
    return compute_weighted_loss(
        losses, weights, loss_scope, loss_collection, reduction=reduction)
260
261
@tf_export(v1=["losses.cosine_distance"])
@dispatch.add_dispatch_support
@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
def cosine_distance(
    labels, predictions, axis=None, weights=1.0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS,
    dim=None):
  """Adds a cosine-distance loss to the training procedure.

  The loss is one minus the sum along `axis` of `predictions * labels`, with
  the reduced axis kept. Note that the function assumes that `predictions` and
  `labels` are already unit-normalized.

  Args:
    labels: `Tensor` whose shape matches 'predictions'
    predictions: An arbitrary matrix.
    axis: The dimension along which the cosine distance is computed.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which this loss will be added.
    reduction: Type of reduction to apply to loss.
    dim: The old (deprecated) name for `axis`.

  Returns:
    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
    shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If `predictions` shape doesn't match `labels` shape, or
      `axis`, `labels`, `predictions` or `weights` is `None`.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  # Resolve the deprecated `dim` spelling into `axis` before validating.
  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
  if axis is None:
    raise ValueError("You must specify 'axis'.")
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  scope_values = (predictions, labels, weights)
  with ops.name_scope(
      scope, "cosine_distance_loss", scope_values) as loss_scope:
    float_predictions = math_ops.cast(predictions, dtype=dtypes.float32)
    float_labels = math_ops.cast(labels, dtype=dtypes.float32)
    float_predictions.get_shape().assert_is_compatible_with(
        float_labels.get_shape())

    elementwise_products = math_ops.multiply(float_predictions, float_labels)
    similarity = math_ops.reduce_sum(
        elementwise_products, axis=(axis,), keepdims=True)
    losses = 1 - similarity
    return compute_weighted_loss(
        losses, weights, loss_scope, loss_collection, reduction=reduction)
317
318
@tf_export(v1=["losses.hinge_loss"])
@dispatch.add_dispatch_support
def hinge_loss(labels, logits, weights=1.0, scope=None,
               loss_collection=ops.GraphKeys.LOSSES,
               reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Adds a hinge loss to the training procedure.

  The element-wise loss is `relu(1 - y * logits)`, where `y = 2 * labels - 1`.

  Args:
    labels: The ground truth output tensor. Its shape should match the shape of
      logits. The values of the tensor are expected to be 0.0 or 1.0. Internally
      the {0,1} labels are converted to {-1,1} when calculating the hinge loss.
    logits: The logits, a float tensor. Note that logits are assumed to be
      unbounded and 0-centered. A value > 0 (resp. < 0) is considered a positive
      (resp. negative) binary prediction.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
    shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If the shapes of `logits` and `labels` don't match or
      if `labels` or `logits` is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if logits is None:
    raise ValueError("logits must not be None.")
  scope_values = (logits, labels, weights)
  with ops.name_scope(scope, "hinge_loss", scope_values) as loss_scope:
    float_logits = math_ops.cast(logits, dtype=dtypes.float32)
    binary_labels = math_ops.cast(labels, dtype=dtypes.float32)
    float_logits.get_shape().assert_is_compatible_with(
        binary_labels.get_shape())
    # Map the binary {0, 1} labels onto {-1, 1} (as floats).
    ones = array_ops.ones_like(binary_labels)
    signed_labels = math_ops.subtract(2 * binary_labels, ones)
    agreement = math_ops.multiply(signed_labels, float_logits)
    losses = nn_ops.relu(math_ops.subtract(ones, agreement))
    return compute_weighted_loss(
        losses, weights, loss_scope, loss_collection, reduction=reduction)
368
369
@tf_export(v1=["losses.huber_loss"])
@dispatch.add_dispatch_support
def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
               loss_collection=ops.GraphKeys.LOSSES,
               reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Adds a Huber Loss term to the training procedure.

  See [Huber Loss](https://en.wikipedia.org/wiki/Huber_loss). For each value x
  in `error = predictions - labels` (only its magnitude matters), the
  following is calculated:

  ```
    0.5 * x^2                  if |x| <= d
    0.5 * d^2 + d * (|x| - d)  if |x| > d
  ```

  where d is `delta`.

  `weights` acts as a coefficient for the loss. A scalar simply scales the
  loss. A tensor of size `[batch_size]` rescales the total loss of each sample
  in the batch by the matching element. When the shape of `weights` matches
  the shape of `predictions`, every measurable element of `predictions` has
  its loss scaled by the corresponding value of `weights`.

  Args:
    labels: The ground truth output tensor, same dimensions as 'predictions'.
    predictions: The predicted outputs.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    delta: `float`, the point where the huber loss function changes from a
      quadratic to linear.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
    shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or
      `predictions` is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  scope_values = (predictions, labels, weights)
  with ops.name_scope(scope, "huber_loss", scope_values) as loss_scope:
    float_predictions = math_ops.cast(predictions, dtype=dtypes.float32)
    float_labels = math_ops.cast(labels, dtype=dtypes.float32)
    float_predictions.get_shape().assert_is_compatible_with(
        float_labels.get_shape())
    residual = math_ops.subtract(float_predictions, float_labels)
    magnitude = math_ops.abs(residual)
    # Portion of the error that falls inside the quadratic region.
    clipped = math_ops.minimum(magnitude, delta)
    # `magnitude - clipped` has the same value as
    # tf.maximum(magnitude - delta, 0), but importantly its gradient when
    # magnitude == delta is 0 (for tf.maximum it would be 1). This is necessary
    # to avoid doubling the gradient, since there is already a nonzero
    # contribution to the gradient from the quadratic term.
    overshoot = math_ops.subtract(magnitude, clipped)
    half = ops.convert_to_tensor(0.5, dtype=clipped.dtype)
    clipped_squared = math_ops.multiply(clipped, clipped)
    quadratic_term = math_ops.multiply(half, clipped_squared)
    linear_term = math_ops.multiply(delta, overshoot)
    losses = math_ops.add(quadratic_term, linear_term)
    return compute_weighted_loss(
        losses, weights, loss_scope, loss_collection, reduction=reduction)
445
446
@tf_export(v1=["losses.log_loss"])
@dispatch.add_dispatch_support
def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
             loss_collection=ops.GraphKeys.LOSSES,
             reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Adds a Log Loss term to the training procedure.

  The element-wise loss is
  `-labels * log(predictions + epsilon)
   - (1 - labels) * log(1 - predictions + epsilon)`.

  `weights` acts as a coefficient for the loss. A scalar simply scales the
  loss. A tensor of size `[batch_size]` rescales the total loss of each sample
  in the batch by the matching element. When the shape of `weights` matches
  the shape of `predictions`, every measurable element of `predictions` has
  its loss scaled by the corresponding value of `weights`.

  Args:
    labels: The ground truth output tensor, same dimensions as 'predictions'.
    predictions: The predicted outputs.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    epsilon: A small increment to add to avoid taking a log of zero.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
    shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or `predictions`
      is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  scope_values = (predictions, labels, weights)
  with ops.name_scope(scope, "log_loss", scope_values) as loss_scope:
    float_predictions = math_ops.cast(predictions, dtype=dtypes.float32)
    float_labels = math_ops.cast(labels, dtype=dtypes.float32)
    float_predictions.get_shape().assert_is_compatible_with(
        float_labels.get_shape())
    # Contribution of the positive class: -labels * log(p + eps).
    positive_term = -math_ops.multiply(
        float_labels, math_ops.log(float_predictions + epsilon))
    # Contribution of the negative class: (1 - labels) * log(1 - p + eps).
    negative_term = math_ops.multiply(
        (1 - float_labels), math_ops.log(1 - float_predictions + epsilon))
    losses = positive_term - negative_term
    return compute_weighted_loss(
        losses, weights, loss_scope, loss_collection, reduction=reduction)
502
503
# TODO(b/37208492): Add reduction arg.
@tf_export(v1=["losses.mean_pairwise_squared_error"])
@dispatch.add_dispatch_support
def mean_pairwise_squared_error(
    labels, predictions, weights=1.0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES):
  """Adds a pairwise-errors-squared loss to the training procedure.

  Unlike `mean_squared_error`, which is a measure of the differences between
  corresponding elements of `predictions` and `labels`,
  `mean_pairwise_squared_error` is a measure of the differences between pairs of
  corresponding elements of `predictions` and `labels`.

  For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], three pairs
  of differences are summed to compute the loss:
    loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3

  Note that since the inputs are of shape `[batch_size, d0, ... dN]`, the
  corresponding pairs are computed within each batch sample but not across
  samples within a batch. For example, if `predictions` represents a batch of
  16 grayscale images of dimension [batch_size, 100, 200], then the set of pairs
  is drawn from each image, but not across images.

  `weights` acts as a coefficient for the loss. A scalar simply scales the
  loss. A tensor of size `[batch_size]` rescales the total loss of each sample
  in the batch by the matching element.

  Args:
    labels: The ground truth output tensor, whose shape must match the shape of
      `predictions`.
    predictions: The predicted outputs, a tensor of size
      `[batch_size, d0, .. dN]` where N+1 is the total number of dimensions in
      `predictions`.
    weights: Coefficients for the loss: a scalar, a tensor of shape
      `[batch_size]` or a tensor whose shape matches `predictions`.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.

  Returns:
    A scalar `Tensor` that returns the weighted loss.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or `predictions`
      is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  scope_values = (predictions, labels, weights)
  with ops.name_scope(scope, "mean_pairwise_squared_error", scope_values):
    float_weights = math_ops.cast(weights, dtype=dtypes.float32)
    float_labels = math_ops.cast(labels, dtype=dtypes.float32)
    broadcast_check = weights_broadcast_ops.assert_broadcastable(
        float_weights, float_labels)
    with ops.control_dependencies((broadcast_check,)):
      float_predictions = math_ops.cast(predictions, dtype=dtypes.float32)
      float_predictions.get_shape().assert_is_compatible_with(
          float_labels.get_shape())

      diffs = math_ops.subtract(float_predictions, float_labels)

      # Every axis except the leading (batch) one.
      per_sample_axes = math_ops.range(1, array_ops.rank(diffs))

      sum_of_squares = math_ops.reduce_sum(
          math_ops.square(diffs), axis=per_sample_axes, keepdims=True)
      present_counts = _num_present(diffs, float_weights, per_batch=True)

      # First term: 2 * sum(d^2) / (n - 1), taken per sample.
      first_denominator = math_ops.maximum(present_counts - 1, 0)
      term1 = 2.0 * math_ops.div_no_nan(
          sum_of_squares, first_denominator, name="value")

      # Second term: 2 * (sum(d))^2 / (n * (n - 1)), taken per sample.
      sum_of_diffs = math_ops.reduce_sum(
          diffs, axis=per_sample_axes, keepdims=True)
      squared_sum = math_ops.square(sum_of_diffs)
      second_denominator = math_ops.maximum(
          math_ops.multiply(present_counts, present_counts - 1), 0)
      term2 = 2.0 * math_ops.div_no_nan(
          squared_sum, second_denominator, name="value")

      weighted_losses = math_ops.multiply(term1 - term2, float_weights)
      total_loss = math_ops.reduce_sum(weighted_losses)

      # Report zero when no element is present at all.
      any_present = math_ops.reduce_sum(present_counts) > 0
      mean_loss = array_ops.where(
          any_present,
          total_loss,
          array_ops.zeros_like(total_loss),
          name="value")
      util.add_loss(mean_loss, loss_collection)
      return mean_loss
600
601
@tf_export(v1=["losses.mean_squared_error"])
@dispatch.add_dispatch_support
def mean_squared_error(
    labels, predictions, weights=1.0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Adds a Sum-of-Squares loss to the training procedure.

  The element-wise loss is `(predictions - labels)^2`, computed in `float32`.

  `weights` acts as a coefficient for the loss. A scalar simply scales the
  loss. A tensor of size `[batch_size]` rescales the total loss of each sample
  in the batch by the matching element. When the shape of `weights` matches
  the shape of `predictions`, every measurable element of `predictions` has
  its loss scaled by the corresponding value of `weights`.

  Args:
    labels: The ground truth output tensor, same dimensions as 'predictions'.
    predictions: The predicted outputs.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
    shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or `predictions`
      is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  scope_values = (predictions, labels, weights)
  with ops.name_scope(
      scope, "mean_squared_error", scope_values) as loss_scope:
    float_predictions = math_ops.cast(predictions, dtype=dtypes.float32)
    float_labels = math_ops.cast(labels, dtype=dtypes.float32)
    float_predictions.get_shape().assert_is_compatible_with(
        float_labels.get_shape())
    squared_errors = math_ops.squared_difference(
        float_predictions, float_labels)
    return compute_weighted_loss(
        squared_errors, weights, loss_scope, loss_collection,
        reduction=reduction)
654
655
@tf_export(v1=["losses.sigmoid_cross_entropy"])
@dispatch.add_dispatch_support
def sigmoid_cross_entropy(
    multi_class_labels, logits, weights=1.0, label_smoothing=0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Creates a cross-entropy loss using tf.nn.sigmoid_cross_entropy_with_logits.

  `weights` acts as a coefficient for the loss. A scalar simply scales the
  loss. For a tensor of shape `[batch_size]`, the loss weights apply to each
  corresponding sample.

  If `label_smoothing` is greater than zero, smooth the labels towards 1/2:

      new_multiclass_labels = multiclass_labels * (1 - label_smoothing)
                              + 0.5 * label_smoothing

  Args:
    multi_class_labels: `[batch_size, num_classes]` target integer labels in
      `{0, 1}`. They are cast to the dtype of `logits`.
    logits: Float `[batch_size, num_classes]` logits outputs of the network.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `multi_class_labels`, and must be broadcastable to `multi_class_labels`
      (i.e., all dimensions must be either `1`, or the same as the
      corresponding `losses` dimension).
    label_smoothing: If greater than `0` then smooth the labels.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss `Tensor` of the same type as `logits`. If `reduction` is
    `NONE`, this has the same shape as `logits`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `logits` doesn't match that of
      `multi_class_labels` or if the shape of `weights` is invalid, or if
      `weights` is None.  Also if `multi_class_labels` or `logits` is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if multi_class_labels is None:
    raise ValueError("multi_class_labels must not be None.")
  if logits is None:
    raise ValueError("logits must not be None.")
  scope_values = (logits, multi_class_labels, weights)
  with ops.name_scope(
      scope, "sigmoid_cross_entropy_loss", scope_values) as loss_scope:
    logits_tensor = ops.convert_to_tensor(logits)
    targets = math_ops.cast(multi_class_labels, logits_tensor.dtype)
    logits_tensor.get_shape().assert_is_compatible_with(targets.get_shape())

    if label_smoothing > 0:
      # Pull every target towards 0.5 by the requested amount.
      targets = (targets * (1 - label_smoothing) +
                 0.5 * label_smoothing)

    losses = nn.sigmoid_cross_entropy_with_logits(
        labels=targets, logits=logits_tensor, name="xentropy")
    return compute_weighted_loss(
        losses, weights, loss_scope, loss_collection, reduction=reduction)
720
721
@tf_export(v1=["losses.softmax_cross_entropy"])
@dispatch.add_dispatch_support
def softmax_cross_entropy(
    onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Builds a weighted softmax cross-entropy loss.

  The per-example loss is produced by
  `tf.nn.softmax_cross_entropy_with_logits_v2` and then handed to
  `compute_weighted_loss` together with `weights` and `reduction`.

  `weights` acts as a coefficient for the loss. A scalar simply scales the
  loss by that value. A tensor of shape `[batch_size]` applies one weight to
  each corresponding sample.

  When `label_smoothing` is greater than zero, the labels are smoothed towards
  1/num_classes:

      new_onehot_labels = onehot_labels * (1 - label_smoothing)
                          + label_smoothing / num_classes

  Note that `onehot_labels` and `logits` must have the same shape,
  e.g. `[batch_size, num_classes]`. The shape of `weights` must be
  broadcastable to loss, whose shape is decided by the shape of `logits`.
  In case the shape of `logits` is `[batch_size, num_classes]`, loss is
  a `Tensor` of shape `[batch_size]`.

  Args:
    onehot_labels: One-hot-encoded labels. They are cast to the dtype of
      `logits`.
    logits: Logits outputs of the network.
    weights: Optional `Tensor` that is broadcastable to loss.
    label_smoothing: If greater than 0 then smooth the labels.
    scope: the scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss `Tensor` of the same type as `logits`. If `reduction` is
    `NONE`, this has shape `[batch_size]`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `logits` doesn't match that of `onehot_labels`
      or if the shape of `weights` is invalid or if `weights` is None.  Also if
      `onehot_labels` or `logits` is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  # Reject missing inputs up front, before any op is created.
  if onehot_labels is None:
    raise ValueError("onehot_labels must not be None.")
  if logits is None:
    raise ValueError("logits must not be None.")

  scope_inputs = (logits, onehot_labels, weights)
  with ops.name_scope(
      scope, "softmax_cross_entropy_loss", scope_inputs) as loss_scope:
    logits_tensor = ops.convert_to_tensor(logits)
    # Labels follow the dtype of the logits so both feed the same kernel.
    targets = math_ops.cast(onehot_labels, logits_tensor.dtype)
    logits_tensor.get_shape().assert_is_compatible_with(targets.get_shape())

    if label_smoothing > 0:
      # The class count is read from the last label dimension at run time.
      class_count = math_ops.cast(
          array_ops.shape(targets)[-1], logits_tensor.dtype)
      kept_fraction = 1.0 - label_smoothing
      uniform_share = label_smoothing / class_count
      targets = targets * kept_fraction + uniform_share

    # Block gradients from flowing into the labels.
    targets = array_ops.stop_gradient(targets, name="labels_stop_gradient")
    per_example_loss = nn.softmax_cross_entropy_with_logits_v2(
        labels=targets, logits=logits_tensor, name="xentropy")

    return compute_weighted_loss(
        per_example_loss, weights, loss_scope, loss_collection,
        reduction=reduction)
792
793
794# TODO(ptucker): Merge this with similar method in metrics_impl.
def _remove_squeezable_dimensions(
    labels, predictions, weights=None, expected_rank_diff=0):
  """Squeezes trailing dimensions of labels, predictions and weights.

  `labels` and `predictions` are delegated to
  `confusion_matrix.remove_squeezable_dimensions`, which squeezes them if
  their ranks differ from expected by exactly 1. `weights`, when given, has
  its last dimension squeezed if its rank is 1 more than the resulting rank of
  `labels`.

  This will use static shape if available. Otherwise, it will add graph
  operations, which could result in a performance hit.

  Args:
    labels: Label values, a `Tensor` whose dimensions match `predictions`.
    predictions: Predicted values, a `Tensor` of arbitrary dimensions.
    weights: Optional weight `Tensor`. It will be squeezed if it's not scalar,
      and its rank is 1 more than the new rank of `labels`.
    expected_rank_diff: Expected result of `rank(predictions) - rank(labels)`.

  Returns:
    Tuple of `labels`, `predictions` and `weights` (in that order), possibly
    with the last dimension squeezed. `weights` is returned as `None` when it
    was passed as `None`.
  """
  labels, predictions = confusion_matrix.remove_squeezable_dimensions(
      labels, predictions, expected_rank_diff=expected_rank_diff)

  # Without weights there is nothing left to adjust.
  if weights is None:
    return labels, predictions, weights

  weights = ops.convert_to_tensor(weights)
  static_labels_rank = labels.get_shape().ndims
  static_weights_shape = weights.get_shape()
  static_weights_rank = static_weights_shape.ndims

  if static_labels_rank is not None and static_weights_rank is not None:
    # Both ranks are known statically, so decide without adding graph ops.
    if static_weights_rank - static_labels_rank == 1:
      weights = array_ops.squeeze(weights, [-1])
    return labels, predictions, weights

  # At least one rank is unknown statically: compare the ranks in the graph.
  dynamic_rank_diff = array_ops.rank(weights) - array_ops.rank(labels)

  # Only emit the conditional squeeze when the last dimension of `weights`
  # could be 1, i.e. its rank is unknown or its last static dim permits 1.
  last_dim_may_be_one = (
      static_weights_rank is None or
      (static_weights_rank > 0 and
       static_weights_shape.dims[-1].is_compatible_with(1)))
  if last_dim_may_be_one:
    weights = control_flow_ops.cond(
        math_ops.equal(1, dynamic_rank_diff),
        lambda: array_ops.squeeze(weights, [-1]),
        lambda: weights)

  return labels, predictions, weights
843
844
@tf_export(v1=["losses.sparse_softmax_cross_entropy"])
@dispatch.add_dispatch_support
def sparse_softmax_cross_entropy(
    labels, logits, weights=1.0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Builds a weighted sparse softmax cross-entropy loss.

  The per-entry loss is produced by
  `tf.nn.sparse_softmax_cross_entropy_with_logits` and then handed to
  `compute_weighted_loss` together with `weights` and `reduction`.

  `weights` acts as a coefficient for the loss. A scalar simply scales the
  loss by that value. A tensor of shape `[batch_size]` applies one weight to
  each corresponding sample.

  Args:
    labels: `Tensor` of shape `[d_0, d_1, ..., d_{r-1}]` (where `r` is rank of
      `labels` and result) and dtype `int32` or `int64`. Each entry in `labels`
      must be an index in `[0, num_classes)`. Other values will raise an
      exception when this op is run on CPU, and return `NaN` for corresponding
      loss and gradient rows on GPU.
    logits: Unscaled log probabilities of shape
      `[d_0, d_1, ..., d_{r-1}, num_classes]` and dtype `float16`, `float32` or
      `float64`.
    weights: Coefficients for the loss. This must be scalar or broadcastable to
      `labels` (i.e. same rank and each dimension is either 1 or the same).
    scope: the scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss `Tensor` of the same type as `logits`. If `reduction` is
    `NONE`, this has the same shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If the shapes of `logits`, `labels`, and `weights` are
      incompatible, or if any of them are None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  # Reject missing inputs up front, before any op is created.
  if labels is None:
    raise ValueError("labels must not be None.")
  if logits is None:
    raise ValueError("logits must not be None.")

  scope_inputs = (logits, labels, weights)
  with ops.name_scope(
      scope, "sparse_softmax_cross_entropy_loss", scope_inputs) as loss_scope:
    # Per the Args section, labels hold class IDs while logits hold one value
    # per class ID, so rank(logits) - rank(labels) is expected to be 1; hence
    # expected_rank_diff=1.
    squeezed = _remove_squeezable_dimensions(
        labels, logits, weights, expected_rank_diff=1)
    labels, logits, weights = squeezed

    per_entry_loss = nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits, name="xentropy")
    return compute_weighted_loss(
        per_entry_loss, weights, loss_scope, loss_collection,
        reduction=reduction)
902