1# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15 16"""Gradient checker for any ops, graphs. 17 18The gradient checker verifies numerically that an op/graph properly 19computes the gradients 20""" 21from __future__ import absolute_import 22from __future__ import division 23from __future__ import print_function 24 25import numpy as np 26 27from tensorflow.python.framework import constant_op 28from tensorflow.python.framework import dtypes 29from tensorflow.python.framework import ops 30from tensorflow.python.ops import array_ops 31from tensorflow.python.ops import gradients 32from tensorflow.python.ops import math_ops 33from tensorflow.python.platform import tf_logging as logging 34from tensorflow.python.util.tf_export import tf_export 35 36 37def _product(t): 38 if isinstance(t, int): 39 return t 40 else: 41 y = 1 42 for x in t: 43 y *= x 44 return y 45 46 47def _extra_feeds(extra_feed_dict, new_feeds): 48 if not extra_feed_dict: 49 return new_feeds 50 r = {} 51 r.update(extra_feed_dict) 52 r.update(new_feeds) 53 return r 54 55 56def _compute_theoretical_jacobian(x, x_shape, x_data, dy, dy_shape, dx, 57 extra_feed_dict): 58 """Computes the theoretical Jacobian for dy/dx. 59 60 Computes the theoretical Jacobian using the ops generated by 61 compute_gradient(). 62 63 Args: 64 x: the tensor "x". 65 x_shape: the dimensions of x as a tuple or an array of ints. 66 x_data: a numpy parray as the input data for x 67 dy: the tensor "dy". 68 dy_shape: the dimensions of dy as a tuple or an array of ints. 69 dx: Tensor or IndexedSlices representing dx 70 extra_feed_dict: dict that allows fixing specified tensor values 71 during the jacobian calculation. 72 73 Returns: 74 A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows 75 and "dy_size" columns where "x_size" is the number of elements in x and 76 "dy_size" is the number of elements in dy. 77 78 Raises: 79 ValueError: If `dy` is empty but the gradient is nonzero. 80 """ 81 # Complex vectors are treated as vectors of twice as many reals. 82 if x.dtype.is_complex: 83 x_shape = tuple(x_shape) + (2,) 84 dy_factor = 2 if dy.dtype.is_complex else 1 85 86 # To compute the jacobian, we treat x and y as one-dimensional vectors. 87 x_size = _product(x_shape) 88 x_val_size = _product(x_shape[1:]) # This is used for sparse gradients 89 dy_size = _product(dy_shape) * dy_factor 90 91 # Allocate 2-D Jacobian, with x dimensions smashed into the first 92 # dimension and y dimensions smashed into the second. 93 jacobian = np.zeros((x_size, dy_size), 94 dtype=x.dtype.real_dtype.as_numpy_dtype) 95 96 # For each of the entry of dy, we set this to be 1 and 97 # everything else to be 0 and compute the backprop -- this will give us one 98 # one column of the Jacobian matrix. 99 dy_data = np.zeros(dy_shape, dtype=dy.dtype.as_numpy_dtype) 100 dy_data_flat = dy_data.ravel().view(dy.dtype.real_dtype.as_numpy_dtype) 101 sess = ops.get_default_session() 102 for col in range(dy_size): 103 dy_data_flat[col] = 1 104 if isinstance(dx, ops.IndexedSlices): 105 backprop_indices, backprop_values = sess.run( 106 [dx.indices, dx.values], 107 feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data})) 108 for i, v in zip(backprop_indices, backprop_values): 109 r_begin = i * x_val_size 110 r_end = r_begin + x_val_size 111 jacobian[r_begin:r_end, col] += v.flat 112 else: 113 assert isinstance(dx, ops.Tensor), "dx = " + str(dx) 114 backprop = sess.run( 115 dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data})) 116 jacobian[:, col] = backprop.ravel().view(jacobian.dtype) 117 dy_data_flat[col] = 0 118 119 # If the output is empty, run the gradients at least once and make sure 120 # they produce zeros. 121 if not dy_size: 122 backprop = sess.run( 123 dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data})) 124 if backprop.shape != x_data.shape: 125 raise ValueError("Empty gradient has wrong shape: expected %s, got %s" % 126 (x_data.shape, backprop.shape)) 127 if np.any(backprop): 128 raise ValueError("Empty tensor with nonzero gradients") 129 130 logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian) 131 return jacobian 132 133 134def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta, 135 extra_feed_dict): 136 """Computes the numeric Jacobian for dy/dx. 137 138 Computes the numeric Jacobian by slightly perturbing the inputs and 139 measuring the differences on the output. 140 141 Args: 142 x: the tensor "x". 143 x_shape: the dimensions of x as a tuple or an array of ints. 144 x_data: a numpy array as the input data for x 145 y: the tensor "y". 146 y_shape: the dimensions of y as a tuple or an array of ints. 147 delta: the amount of perturbation we give to the input 148 extra_feed_dict: dict that allows fixing specified tensor values 149 during the jacobian calculation. 150 151 Returns: 152 A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows 153 and "y_size" columns where "x_size" is the number of elements in x and 154 "y_size" is the number of elements in y. 155 """ 156 # bfloat16 doesn't have enough bits to represent high precision numbers such 157 # as delta. Convert to float32 here. Since numeric_jacobian is expected to 158 # be the groundtruth to compare against, it shouldn't lose any information. 159 if x.dtype == dtypes.bfloat16: 160 x = math_ops.cast(x, dtypes.float32) 161 if y.dtype == dtypes.bfloat16: 162 y = math_ops.cast(y, dtypes.float32) 163 if x_data.dtype == dtypes.bfloat16.as_numpy_dtype: 164 x_data = x_data.astype(np.float32) 165 166 # To compute the jacobian, we treat x and y as one-dimensional vectors 167 x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1) 168 y_size = _product(y_shape) * (2 if y.dtype.is_complex else 1) 169 x_dtype = x.dtype.real_dtype.as_numpy_dtype 170 y_dtype = y.dtype.real_dtype.as_numpy_dtype 171 172 # Make sure we have the right types 173 x_data = np.asarray(x_data, dtype=x.dtype.as_numpy_dtype) 174 scale = np.asarray(2 * delta, dtype=y_dtype)[()] 175 176 jacobian = np.zeros((x_size, y_size), dtype=x_dtype) 177 # For each of the entry of x, we slightly perturbs this by adding and 178 # subtracting a delta and then compute difference between the outputs. This 179 # will give us one row of the Jacobian matrix. 180 for row in range(x_size): 181 x_pos = x_data.copy() 182 x_neg = x_data.copy() 183 x_pos.ravel().view(x_dtype)[row] += delta 184 y_pos = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_pos})) 185 x_neg.ravel().view(x_dtype)[row] -= delta 186 y_neg = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_neg})) 187 diff = (y_pos - y_neg) / scale 188 jacobian[row, :] = diff.ravel().view(y_dtype) 189 190 logging.vlog(1, "Numeric Jacobian =\n%s", jacobian) 191 return jacobian 192 193 194def _compute_dx_and_dy(x, y, y_shape): 195 """Returns a node to compute gradient of y wrt x.""" 196 # We make up a dy so that we can compute the gradients. We don't really use 197 # the value of dy -- we will always feed it. We need to add an identity node 198 # so that we can always feed it properly. Otherwise, for the Add operation, 199 # dx is the same as dy and we cannot fetch the tensor that we are feeding. 200 with x.graph.as_default(): 201 dy_orig = constant_op.constant(1.0, shape=y_shape, dtype=y.dtype) 202 dy = array_ops.identity(dy_orig) 203 # We compute the gradients for y wrt. x 204 grads = gradients.gradients(y, x, dy) 205 assert len(grads) == 1 206 return grads[0], dy_orig 207 208 209def _compute_gradient(x, 210 x_shape, 211 dx, 212 y, 213 y_shape, 214 dy, 215 x_init_value=None, 216 delta=1e-3, 217 extra_feed_dict=None): 218 """Computes the theoretical and numerical jacobian.""" 219 t = dtypes.as_dtype(x.dtype) 220 allowed_types = [dtypes.float16, dtypes.bfloat16, dtypes.float32, 221 dtypes.float64, dtypes.complex64, dtypes.complex128] 222 assert t.base_dtype in allowed_types, "Don't support type %s for x" % t.name 223 t2 = dtypes.as_dtype(y.dtype) 224 assert t2.base_dtype in allowed_types, "Don't support type %s for y" % t2.name 225 226 if x_init_value is not None: 227 i_shape = list(x_init_value.shape) 228 assert(list(x_shape) == i_shape), "x_shape = %s, init_data shape = %s" % ( 229 x_shape, i_shape) 230 x_data = x_init_value 231 else: 232 x_data = np.random.random_sample(x_shape).astype(t.as_numpy_dtype) 233 if t.is_complex: 234 x_data.imag = np.random.random_sample(x_shape) 235 236 jacob_t = _compute_theoretical_jacobian( 237 x, x_shape, x_data, dy, y_shape, dx, extra_feed_dict=extra_feed_dict) 238 jacob_n = _compute_numeric_jacobian( 239 x, x_shape, x_data, y, y_shape, delta, extra_feed_dict=extra_feed_dict) 240 return jacob_t, jacob_n 241 242 243def _compute_gradient_list(x, 244 x_shape, 245 y, 246 y_shape, 247 x_init_value=None, 248 delta=1e-3, 249 init_targets=None, 250 extra_feed_dict=None): 251 """Compute gradients for a list of x values.""" 252 assert isinstance(x, list) 253 dx, dy = zip(*[_compute_dx_and_dy(xi, y, y_shape) for xi in x]) 254 255 if init_targets is not None: 256 assert isinstance(init_targets, (list, tuple)) 257 for init in init_targets: 258 init.run() 259 if x_init_value is None: 260 x_init_value = [None] * len(x) 261 ret = [_compute_gradient(xi, x_shapei, dxi, y, y_shape, dyi, x_init_valuei, 262 delta, extra_feed_dict=extra_feed_dict) 263 for xi, x_shapei, dxi, dyi, x_init_valuei in zip(x, x_shape, dx, dy, 264 x_init_value)] 265 return ret 266 267 268@tf_export("test.compute_gradient") 269def compute_gradient(x, 270 x_shape, 271 y, 272 y_shape, 273 x_init_value=None, 274 delta=1e-3, 275 init_targets=None, 276 extra_feed_dict=None): 277 """Computes and returns the theoretical and numerical Jacobian. 278 279 If `x` or `y` is complex, the Jacobian will still be real but the 280 corresponding Jacobian dimension(s) will be twice as large. This is required 281 even if both input and output is complex since TensorFlow graphs are not 282 necessarily holomorphic, and may have gradients not expressible as complex 283 numbers. For example, if `x` is complex with shape `[m]` and `y` is complex 284 with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with 285 286 J[:m, :n] = d(Re y)/d(Re x) 287 J[:m, n:] = d(Im y)/d(Re x) 288 J[m:, :n] = d(Re y)/d(Im x) 289 J[m:, n:] = d(Im y)/d(Im x) 290 291 Args: 292 x: a tensor or list of tensors 293 x_shape: the dimensions of x as a tuple or an array of ints. If x is a list, 294 then this is the list of shapes. 295 y: a tensor 296 y_shape: the dimensions of y as a tuple or an array of ints. 297 x_init_value: (optional) a numpy array of the same shape as "x" 298 representing the initial value of x. If x is a list, this should be a list 299 of numpy arrays. If this is none, the function will pick a random tensor 300 as the initial value. 301 delta: (optional) the amount of perturbation. 302 init_targets: list of targets to run to initialize model params. 303 TODO(mrry): remove this argument. 304 extra_feed_dict: dict that allows fixing specified tensor values 305 during the Jacobian calculation. 306 307 Returns: 308 Two 2-d numpy arrays representing the theoretical and numerical 309 Jacobian for dy/dx. Each has "x_size" rows and "y_size" columns 310 where "x_size" is the number of elements in x and "y_size" is the 311 number of elements in y. If x is a list, returns a list of two numpy arrays. 312 """ 313 if extra_feed_dict is None: 314 extra_feed_dict = {} 315 316 if isinstance(x, list): 317 return _compute_gradient_list(x, x_shape, y, y_shape, x_init_value, delta, 318 init_targets, extra_feed_dict=extra_feed_dict) 319 else: 320 if init_targets is not None: 321 assert isinstance(init_targets, (list, tuple)) 322 for init in init_targets: 323 init.run() 324 dx, dy = _compute_dx_and_dy(x, y, y_shape) 325 ret = _compute_gradient(x, x_shape, dx, y, y_shape, dy, x_init_value, delta, 326 extra_feed_dict=extra_feed_dict) 327 return ret 328 329 330@tf_export("test.compute_gradient_error") 331def compute_gradient_error(x, 332 x_shape, 333 y, 334 y_shape, 335 x_init_value=None, 336 delta=1e-3, 337 init_targets=None, 338 extra_feed_dict=None): 339 """Computes the gradient error. 340 341 Computes the maximum error for dy/dx between the computed Jacobian and the 342 numerically estimated Jacobian. 343 344 This function will modify the tensors passed in as it adds more operations 345 and hence changing the consumers of the operations of the input tensors. 346 347 This function adds operations to the current session. To compute the error 348 using a particular device, such as a GPU, use the standard methods for 349 setting a device (e.g. using with sess.graph.device() or setting a device 350 function in the session constructor). 351 352 Args: 353 x: a tensor or list of tensors 354 x_shape: the dimensions of x as a tuple or an array of ints. If x is a list, 355 then this is the list of shapes. 356 y: a tensor 357 y_shape: the dimensions of y as a tuple or an array of ints. 358 x_init_value: (optional) a numpy array of the same shape as "x" 359 representing the initial value of x. If x is a list, this should be a list 360 of numpy arrays. If this is none, the function will pick a random tensor 361 as the initial value. 362 delta: (optional) the amount of perturbation. 363 init_targets: list of targets to run to initialize model params. 364 extra_feed_dict: dict that allows fixing specified tensor values 365 during the Jacobian calculation. 366 367 Returns: 368 The maximum error in between the two Jacobians. 369 """ 370 grad = compute_gradient(x, x_shape, y, y_shape, x_init_value, delta, 371 init_targets, extra_feed_dict=extra_feed_dict) 372 if isinstance(grad, tuple): 373 grad = [grad] 374 error = 0 375 for j_t, j_n in grad: 376 if j_t.size or j_n.size: # Handle zero size tensors correctly 377 error = np.maximum(error, np.fabs(j_t - j_n).max()) 378 return error 379