# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Gradient checker for any op or graph.

The gradient checker verifies numerically that an op/graph properly
computes the gradients.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gradients
from tensorflow.python.ops import math_ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util.tf_export import tf_export


def _product(t):
  if isinstance(t, int):
    return t
  else:
    y = 1
    for x in t:
      y *= x
    return y


def _extra_feeds(extra_feed_dict, new_feeds):
  if not extra_feed_dict:
    return new_feeds
  r = {}
  r.update(extra_feed_dict)
  r.update(new_feeds)
  return r


def _compute_theoretical_jacobian(x, x_shape, x_data, dy, dy_shape, dx,
                                  extra_feed_dict):
  """Computes the theoretical Jacobian for dy/dx.

  Computes the theoretical Jacobian using the ops generated by
  compute_gradient().
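
  Each column of the Jacobian is computed by feeding a one-hot value for
  dy and evaluating the resulting backpropagated dx: for the j-th
  standard basis vector e_j (in the real case),

      jacobian[:, j] = flatten(d(sum(y * e_j)) / d(x))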

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x.
    dy: the tensor "dy".
    dy_shape: the dimensions of dy as a tuple or an array of ints.
    dx: Tensor or IndexedSlices representing dx.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "dy_size" columns where "x_size" is the number of elements in x and
    "dy_size" is the number of elements in dy.

  Raises:
    ValueError: If `dy` is empty but the gradient is nonzero.
  """
  # Complex vectors are treated as vectors of twice as many reals.
  if x.dtype.is_complex:
    x_shape = tuple(x_shape) + (2,)
  dy_factor = 2 if dy.dtype.is_complex else 1

  # To compute the Jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape)
  x_val_size = _product(x_shape[1:])  # This is used for sparse gradients.
  dy_size = _product(dy_shape) * dy_factor

  # Allocate 2-D Jacobian, with x dimensions smashed into the first
  # dimension and y dimensions smashed into the second.
  jacobian = np.zeros((x_size, dy_size),
                      dtype=x.dtype.real_dtype.as_numpy_dtype)

  # For each entry of dy, we set it to 1 (and everything else to 0) and
  # compute the backprop -- this gives us one column of the Jacobian matrix.
  dy_data = np.zeros(dy_shape, dtype=dy.dtype.as_numpy_dtype)
  dy_data_flat = dy_data.ravel().view(dy.dtype.real_dtype.as_numpy_dtype)
  sess = ops.get_default_session()
  for col in range(dy_size):
    dy_data_flat[col] = 1
    if isinstance(dx, ops.IndexedSlices):
      backprop_indices, backprop_values = sess.run(
          [dx.indices, dx.values],
          feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
      for i, v in zip(backprop_indices, backprop_values):
        r_begin = i * x_val_size
        r_end = r_begin + x_val_size
        jacobian[r_begin:r_end, col] += v.flat
    else:
      assert isinstance(dx, ops.Tensor), "dx = " + str(dx)
      backprop = sess.run(
          dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
      jacobian[:, col] = backprop.ravel().view(jacobian.dtype)
    dy_data_flat[col] = 0

  # If the output is empty, run the gradients at least once and make sure
  # they produce zeros.
  if not dy_size:
    backprop = sess.run(
        dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
    if backprop.shape != x_data.shape:
      raise ValueError("Empty gradient has wrong shape: expected %s, got %s" %
                       (x_data.shape, backprop.shape))
    if np.any(backprop):
      raise ValueError("Empty tensor with nonzero gradients")

  logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
  return jacobian


def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta,
                              extra_feed_dict):
  """Computes the numeric Jacobian for dy/dx.

  Computes the numeric Jacobian by slightly perturbing the inputs and
  measuring the differences on the output.
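
  Concretely, each row is a central difference: for the i-th standard
  basis vector e_i and step size delta,

      jacobian[i, :] = (y(x + delta * e_i) - y(x - delta * e_i)) / (2 * delta)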

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x.
    y: the tensor "y".
    y_shape: the dimensions of y as a tuple or an array of ints.
    delta: the amount of perturbation we give to the input.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "y_size" columns where "x_size" is the number of elements in x and
    "y_size" is the number of elements in y.
  """
  # bfloat16 doesn't have enough bits to represent high precision numbers such
  # as delta. Convert to float32 here. Since the numeric Jacobian is expected
  # to be the ground truth to compare against, it shouldn't lose any
  # information.
  if x.dtype == dtypes.bfloat16:
    x = math_ops.cast(x, dtypes.float32)
  if y.dtype == dtypes.bfloat16:
    y = math_ops.cast(y, dtypes.float32)
  if x_data.dtype == dtypes.bfloat16.as_numpy_dtype:
    x_data = x_data.astype(np.float32)

  # To compute the Jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1)
  y_size = _product(y_shape) * (2 if y.dtype.is_complex else 1)
  x_dtype = x.dtype.real_dtype.as_numpy_dtype
  y_dtype = y.dtype.real_dtype.as_numpy_dtype

  # Make sure we have the right types.
  x_data = np.asarray(x_data, dtype=x.dtype.as_numpy_dtype)
  scale = np.asarray(2 * delta, dtype=y_dtype)[()]

  jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
  # For each entry of x, we slightly perturb it by adding and subtracting
  # delta, then compute the difference between the outputs. This gives us one
  # row of the Jacobian matrix.
  for row in range(x_size):
    x_pos = x_data.copy()
    x_neg = x_data.copy()
    x_pos.ravel().view(x_dtype)[row] += delta
    y_pos = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_pos}))
    x_neg.ravel().view(x_dtype)[row] -= delta
    y_neg = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_neg}))
    diff = (y_pos - y_neg) / scale
    jacobian[row, :] = diff.ravel().view(y_dtype)

  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
  return jacobian


def _compute_dx_and_dy(x, y, y_shape):
  """Returns a node to compute the gradient of y wrt x."""
  # We make up a dy so that we can compute the gradients. We don't really use
  # the value of dy -- we will always feed it. We need to add an identity node
  # so that we can always feed it properly. Otherwise, for the Add operation,
  # dx is the same as dy and we cannot fetch the tensor that we are feeding.
  with x.graph.as_default():
    dy_orig = constant_op.constant(1.0, shape=y_shape, dtype=y.dtype)
    dy = array_ops.identity(dy_orig)
  # Compute the gradients of y w.r.t. x.
  grads = gradients.gradients(y, x, dy)
  assert len(grads) == 1
  return grads[0], dy_orig


def _compute_gradient(x,
                      x_shape,
                      dx,
                      y,
                      y_shape,
                      dy,
                      x_init_value=None,
                      delta=1e-3,
                      extra_feed_dict=None):
  """Computes the theoretical and numerical Jacobian."""
  t = dtypes.as_dtype(x.dtype)
  allowed_types = [dtypes.float16, dtypes.bfloat16, dtypes.float32,
                   dtypes.float64, dtypes.complex64, dtypes.complex128]
  assert t.base_dtype in allowed_types, "Don't support type %s for x" % t.name
  t2 = dtypes.as_dtype(y.dtype)
  assert t2.base_dtype in allowed_types, "Don't support type %s for y" % t2.name

  if x_init_value is not None:
    i_shape = list(x_init_value.shape)
    assert list(x_shape) == i_shape, "x_shape = %s, init_data shape = %s" % (
        x_shape, i_shape)
    x_data = x_init_value
  else:
    x_data = np.random.random_sample(x_shape).astype(t.as_numpy_dtype)
    if t.is_complex:
      x_data.imag = np.random.random_sample(x_shape)

  jacob_t = _compute_theoretical_jacobian(
      x, x_shape, x_data, dy, y_shape, dx, extra_feed_dict=extra_feed_dict)
  jacob_n = _compute_numeric_jacobian(
      x, x_shape, x_data, y, y_shape, delta, extra_feed_dict=extra_feed_dict)
  return jacob_t, jacob_n


def _compute_gradient_list(x,
                           x_shape,
                           y,
                           y_shape,
                           x_init_value=None,
                           delta=1e-3,
                           init_targets=None,
                           extra_feed_dict=None):
  """Compute gradients for a list of x values."""
  assert isinstance(x, list)
  dx, dy = zip(*[_compute_dx_and_dy(xi, y, y_shape) for xi in x])

  if init_targets is not None:
    assert isinstance(init_targets, (list, tuple))
    for init in init_targets:
      init.run()
  if x_init_value is None:
    x_init_value = [None] * len(x)
  ret = [_compute_gradient(xi, x_shapei, dxi, y, y_shape, dyi, x_init_valuei,
                           delta, extra_feed_dict=extra_feed_dict)
         for xi, x_shapei, dxi, dyi, x_init_valuei in zip(x, x_shape, dx, dy,
                                                          x_init_value)]
  return ret


@tf_export("test.compute_gradient")
def compute_gradient(x,
                     x_shape,
                     y,
                     y_shape,
                     x_init_value=None,
                     delta=1e-3,
                     init_targets=None,
                     extra_feed_dict=None):
  """Computes and returns the theoretical and numerical Jacobian.

  If `x` or `y` is complex, the Jacobian will still be real but the
  corresponding Jacobian dimension(s) will be twice as large.  This is required
  even if both input and output are complex since TensorFlow graphs are not
  necessarily holomorphic, and may have gradients not expressible as complex
  numbers.  For example, if `x` is complex with shape `[m]` and `y` is complex
  with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with

      J[:m, :n] = d(Re y)/d(Re x)
      J[:m, n:] = d(Im y)/d(Re x)
      J[m:, :n] = d(Re y)/d(Im x)
      J[m:, n:] = d(Im y)/d(Im x)

  Args:
    x: a tensor or list of tensors.
    x_shape: the dimensions of x as a tuple or an array of ints. If x is a
      list, then this is the list of shapes.
    y: a tensor.
    y_shape: the dimensions of y as a tuple or an array of ints.
    x_init_value: (optional) a numpy array of the same shape as "x"
      representing the initial value of x. If x is a list, this should be a
      list of numpy arrays. If this is None, the function will pick a random
      tensor as the initial value.
    delta: (optional) the amount of perturbation.
    init_targets: list of targets to run to initialize model params.
      TODO(mrry): remove this argument.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    Two 2-d numpy arrays representing the theoretical and numerical
    Jacobian for dy/dx. Each has "x_size" rows and "y_size" columns
    where "x_size" is the number of elements in x and "y_size" is the
    number of elements in y. If x is a list, returns a list of two numpy arrays.
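
  Example (a minimal sketch; assumes TF1-style graph execution with a
  default `tf.Session`; the placeholder and `tanh` op are illustrative
  only):

      with tf.Graph().as_default(), tf.Session():
        x = tf.placeholder(tf.float64, shape=[2, 3])
        y = tf.tanh(x)
        # Each returned Jacobian is a [6, 6] numpy array: 6 elements in x,
        # 6 elements in y.
        jacob_t, jacob_n = tf.test.compute_gradient(x, [2, 3], y, [2, 3])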
  """
  if extra_feed_dict is None:
    extra_feed_dict = {}

  if isinstance(x, list):
    return _compute_gradient_list(x, x_shape, y, y_shape, x_init_value, delta,
                                  init_targets, extra_feed_dict=extra_feed_dict)
  else:
    if init_targets is not None:
      assert isinstance(init_targets, (list, tuple))
      for init in init_targets:
        init.run()
    dx, dy = _compute_dx_and_dy(x, y, y_shape)
    ret = _compute_gradient(x, x_shape, dx, y, y_shape, dy, x_init_value, delta,
                            extra_feed_dict=extra_feed_dict)
    return ret


@tf_export("test.compute_gradient_error")
def compute_gradient_error(x,
                           x_shape,
                           y,
                           y_shape,
                           x_init_value=None,
                           delta=1e-3,
                           init_targets=None,
                           extra_feed_dict=None):
  """Computes the gradient error.

  Computes the maximum error for dy/dx between the computed Jacobian and the
  numerically estimated Jacobian.

  This function will modify the tensors passed in, since it adds operations
  and hence changes the consumers of the input tensors' operations.

  This function adds operations to the current graph. To compute the error
  using a particular device, such as a GPU, use the standard methods for
  setting a device (e.g. using with sess.graph.device() or setting a device
  function in the session constructor).

  Args:
    x: a tensor or list of tensors.
    x_shape: the dimensions of x as a tuple or an array of ints. If x is a
      list, then this is the list of shapes.
    y: a tensor.
    y_shape: the dimensions of y as a tuple or an array of ints.
    x_init_value: (optional) a numpy array of the same shape as "x"
      representing the initial value of x. If x is a list, this should be a
      list of numpy arrays. If this is None, the function will pick a random
      tensor as the initial value.
    delta: (optional) the amount of perturbation.
    init_targets: list of targets to run to initialize model params.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    The maximum error between the two Jacobians.
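
  Example (a minimal sketch; assumes TF1-style graph execution with a
  default `tf.Session`; the sigmoid op and tolerance are illustrative
  only):

      with tf.Graph().as_default(), tf.Session():
        x = tf.placeholder(tf.float64, shape=[3])
        y = tf.sigmoid(x)
        err = tf.test.compute_gradient_error(x, [3], y, [3])
        assert err < 1e-5  # Tolerance is a typical choice for float64.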
  """
  grad = compute_gradient(x, x_shape, y, y_shape, x_init_value, delta,
                          init_targets, extra_feed_dict=extra_feed_dict)
  if isinstance(grad, tuple):
    grad = [grad]
  error = 0
  for j_t, j_n in grad:
    if j_t.size or j_n.size:  # Handle zero size tensors correctly.
      error = np.maximum(error, np.fabs(j_t - j_n).max())
  return error
