1# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15# pylint: disable=invalid-name
16"""Built-in optimizer classes.
17"""
18from __future__ import absolute_import
19from __future__ import division
20from __future__ import print_function
21
22import six
23from six.moves import zip  # pylint: disable=redefined-builtin
24
25from tensorflow.python.distribute import distribution_strategy_context
26from tensorflow.python.framework import ops
27from tensorflow.python.keras import backend as K
28from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
29from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
30from tensorflow.python.keras.optimizer_v2 import adam as adam_v2
31from tensorflow.python.keras.optimizer_v2 import adamax as adamax_v2
32from tensorflow.python.keras.optimizer_v2 import ftrl
33from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2
34from tensorflow.python.keras.optimizer_v2 import nadam as nadam_v2
35from tensorflow.python.keras.optimizer_v2 import optimizer_v2
36from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2
37from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
38from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
39from tensorflow.python.ops import clip_ops
40from tensorflow.python.ops import math_ops
41from tensorflow.python.ops import state_ops
42from tensorflow.python.training import optimizer as tf_optimizer_module
43from tensorflow.python.training import training_util
44from tensorflow.python.training.tracking import base as trackable
45from tensorflow.python.util.tf_export import keras_export
46
47
class Optimizer(object):
  """Abstract optimizer base class.

  Note: this is the parent class of all optimizers, not an actual optimizer
  that can be used for training models.

  All Keras optimizers support the following keyword arguments:

      clipnorm: float >= 0. Gradients will be clipped
          when their L2 norm exceeds this value.
      clipvalue: float >= 0. Gradients will be clipped
          when their absolute value exceeds this value.
  """

  def __init__(self, **kwargs):
    """Validates and stores the optional clipping arguments.

    Arguments:
        **kwargs: Only `clipnorm` and `clipvalue` are accepted; each must
            be >= 0.

    Raises:
        TypeError: If any other keyword argument is passed.
        ValueError: If a clipping argument is negative.
    """
    allowed_kwargs = {'clipnorm', 'clipvalue'}
    for k in kwargs:
      if k not in allowed_kwargs:
        raise TypeError('Unexpected keyword argument '
                        'passed to optimizer: ' + str(k))
      # checks that clipnorm >= 0 and clipvalue >= 0
      if kwargs[k] < 0:
        raise ValueError('Expected {} >= 0, received: {}'.format(k, kwargs[k]))
    # Only the arguments actually passed become attributes; the other methods
    # use `hasattr` to find out whether clipping was requested.
    self.__dict__.update(kwargs)
    self.updates = []
    self.weights = []

  def get_updates(self, loss, params):
    """Builds the update operations; must be implemented by subclasses.

    Arguments:
        loss: Loss tensor.
        params: List of variables.

    Raises:
        NotImplementedError: Always, in this base class.
    """
    raise NotImplementedError

  def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    The gradients are clipped by norm and/or by value when `clipnorm` /
    `clipvalue` were passed to the constructor.

    Arguments:
        loss: Loss tensor.
        params: List of variables.

    Returns:
        List of gradient tensors.

    Raises:
        ValueError: In case any gradient cannot be computed (e.g. if gradient
          function not implemented).
    """
    grads = K.gradients(loss, params)
    # Identity test on purpose: `None in grads` would compare each gradient
    # object to None with `==`, which depends on how tensors define equality.
    if any(g is None for g in grads):
      raise ValueError('An operation has `None` for gradient. '
                       'Please make sure that all of your ops have a '
                       'gradient defined (i.e. are differentiable). '
                       'Common ops without gradient: '
                       'K.argmax, K.round, K.eval.')
    if hasattr(self, 'clipnorm'):
      grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, 'clipvalue'):
      grads = [
          clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
          for g in grads
      ]
    return grads

  def set_weights(self, weights):
    """Sets the weights of the optimizer, from Numpy arrays.

    Should only be called after computing the gradients
    (otherwise the optimizer has no weights).

    Arguments:
        weights: a list of Numpy arrays. The number of arrays and their
            shapes must match those of the optimizer's weights (i.e. it
            should match the output of `get_weights`).

    Raises:
        ValueError: in case of incompatible weight shapes.
    """
    params = self.weights
    if len(params) != len(weights):
      raise ValueError(
          'Length of the specified weight list (' + str(len(weights)) +
          ') does not match the number of weights '
          'of the optimizer (' + str(len(params)) + ')')
    weight_value_tuples = []
    # Fetch the current values once so the shapes can be compared.
    param_values = K.batch_get_value(params)
    for pv, p, w in zip(param_values, params, weights):
      if pv.shape != w.shape:
        raise ValueError(
            'Optimizer weight shape ' + str(pv.shape) + ' not compatible with '
            'provided weight shape ' + str(w.shape))
      weight_value_tuples.append((p, w))
    # Nothing is assigned unless every shape check above has passed.
    K.batch_set_value(weight_value_tuples)

  def get_weights(self):
    """Returns the current value of the weights of the optimizer.

    Returns:
        A list of numpy arrays.
    """
    return K.batch_get_value(self.weights)

  def get_config(self):
    """Returns the clipping arguments that were passed to the constructor.

    Returns:
        A dict containing `clipnorm` and/or `clipvalue` when they are set,
        and nothing else.
    """
    config = {}
    if hasattr(self, 'clipnorm'):
      config['clipnorm'] = self.clipnorm
    if hasattr(self, 'clipvalue'):
      config['clipvalue'] = self.clipvalue
    return config

  @classmethod
  def from_config(cls, config):
    """Creates an optimizer by passing `config` as constructor keywords.

    Arguments:
        config: Dict of constructor keyword arguments, e.g. the output of
            `get_config`.

    Returns:
        A new instance of `cls`.
    """
    return cls(**config)
159
160
class SGD(Optimizer):
  """Stochastic gradient descent optimizer.

  Includes support for momentum,
  learning rate decay, and Nesterov momentum.

  Arguments:
      lr: float >= 0. Learning rate.
      momentum: float >= 0. Parameter that accelerates SGD
          in the relevant direction and dampens oscillations.
      decay: float >= 0. Learning rate decay over each update.
      nesterov: boolean. Whether to apply Nesterov momentum.
      **kwargs: `clipnorm` / `clipvalue`, forwarded to `Optimizer.__init__`.
  """

  def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, **kwargs):
    super(SGD, self).__init__(**kwargs)
    # Hyperparameters and the step counter are backend variables created
    # under a name scope named after the concrete class.
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.momentum = K.variable(momentum, name='momentum')
      self.decay = K.variable(decay, name='decay')
    # Plain Python copy of the constructor argument; `get_updates` tests it
    # to decide whether the decay ops are built at all.
    self.initial_decay = decay
    self.nesterov = nesterov

  def get_updates(self, loss, params):
    """Builds the update ops for one SGD step.

    Arguments:
        loss: Loss tensor.
        params: List of variables to update.

    Returns:
        `self.updates`: the increment of `iterations`, followed for each
        parameter by the assignment of its moment and of the parameter.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      # Time-based decay: lr / (1 + decay * iterations).
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                K.dtype(self.decay))))
    # momentum
    shapes = [K.int_shape(p) for p in params]
    # One zero-initialised moment variable per parameter.
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments
    for p, g, m in zip(params, grads, moments):
      v = self.momentum * m - lr * g  # velocity
      self.updates.append(state_ops.assign(m, v))

      if self.nesterov:
        # Nesterov variant: step along the velocity scaled by momentum and
        # subtract the gradient step once more.
        new_p = p + self.momentum * v - lr * g
      else:
        new_p = p + v

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged with the base-class config.

    Returns:
        A dict with `lr`, `momentum`, `decay` (current variable values as
        Python floats) and `nesterov`, plus any clipping arguments.
    """
    config = {
        'lr': float(K.get_value(self.lr)),
        'momentum': float(K.get_value(self.momentum)),
        'decay': float(K.get_value(self.decay)),
        'nesterov': self.nesterov
    }
    base_config = super(SGD, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
223
224
class RMSprop(Optimizer):
  """RMSProp optimizer.

  It is recommended to leave the parameters of this optimizer
  at their default values
  (except the learning rate, which can be freely tuned).

  This optimizer is usually a good choice for recurrent
  neural networks.

  Arguments:
      lr: float >= 0. Learning rate.
      rho: float >= 0. Weight given to the previous value of the moving
          average of squared gradients.
      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
      decay: float >= 0. Learning rate decay over each update.
      **kwargs: `clipnorm` / `clipvalue`, forwarded to `Optimizer.__init__`.

  """

  def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs):
    super(RMSprop, self).__init__(**kwargs)
    # Hyperparameters and the step counter are backend variables created
    # under a name scope named after the concrete class.
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.rho = K.variable(rho, name='rho')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = K.epsilon()
    # `epsilon` is kept as a plain Python value, not a backend variable.
    self.epsilon = epsilon
    # Plain Python copy of the constructor argument; `get_updates` tests it
    # to decide whether the decay ops are built at all.
    self.initial_decay = decay

  def get_updates(self, loss, params):
    """Builds the update ops for one RMSProp step.

    Arguments:
        loss: Loss tensor.
        params: List of variables to update.

    Returns:
        `self.updates`: the increment of `iterations`, followed for each
        parameter by the assignment of its accumulator and of the parameter.
    """
    grads = self.get_gradients(loss, params)
    # One zero-initialised accumulator per parameter, in the parameter's dtype.
    accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    # NOTE(review): unlike SGD and Adam in this file, `iterations` is not
    # included in `self.weights` here.
    self.weights = accumulators
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      # Time-based decay: lr / (1 + decay * iterations).
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                K.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
      # update accumulator
      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
      self.updates.append(state_ops.assign(a, new_a))
      # Scale the step by the root of the updated accumulator.
      new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged with the base-class config.

    Returns:
        A dict with `lr`, `rho`, `decay` (current variable values as Python
        floats) and `epsilon`, plus any clipping arguments.
    """
    config = {
        'lr': float(K.get_value(self.lr)),
        'rho': float(K.get_value(self.rho)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(RMSprop, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
289
290
class Adagrad(Optimizer):
  """Adagrad optimizer.

  Adagrad is an optimizer with parameter-specific learning rates,
  which are adapted relative to how frequently a parameter gets
  updated during training. The more updates a parameter receives,
  the smaller the updates.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
      lr: float >= 0. Initial learning rate.
      epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
      decay: float >= 0. Learning rate decay over each update.
      **kwargs: `clipnorm` / `clipvalue`, forwarded to `Optimizer.__init__`.

  References:
      - [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
  """

  def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
    super(Adagrad, self).__init__(**kwargs)
    # Hyperparameters and the step counter are backend variables created
    # under a name scope named after the concrete class.
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = K.epsilon()
    # `epsilon` is kept as a plain Python value, not a backend variable.
    self.epsilon = epsilon
    # Plain Python copy of the constructor argument; `get_updates` tests it
    # to decide whether the decay ops are built at all.
    self.initial_decay = decay

  def get_updates(self, loss, params):
    """Builds the update ops for one Adagrad step.

    Arguments:
        loss: Loss tensor.
        params: List of variables to update.

    Returns:
        `self.updates`: the increment of `iterations`, followed for each
        parameter by the assignment of its accumulator and of the parameter.
    """
    grads = self.get_gradients(loss, params)
    shapes = [K.int_shape(p) for p in params]
    # One zero-initialised accumulator per parameter.
    accumulators = [K.zeros(shape) for shape in shapes]
    # NOTE(review): unlike SGD and Adam in this file, `iterations` is not
    # included in `self.weights` here.
    self.weights = accumulators
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      # Time-based decay: lr / (1 + decay * iterations).
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                K.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
      # The accumulator is a running sum of squared gradients.
      new_a = a + math_ops.square(g)  # update accumulator
      self.updates.append(state_ops.assign(a, new_a))
      new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged with the base-class config.

    Returns:
        A dict with `lr`, `decay` (current variable values as Python floats)
        and `epsilon`, plus any clipping arguments.
    """
    config = {
        'lr': float(K.get_value(self.lr)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adagrad, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
355
356
class Adadelta(Optimizer):
  """Adadelta optimizer.

  Adadelta is a more robust extension of Adagrad
  that adapts learning rates based on a moving window of gradient updates,
  instead of accumulating all past gradients. This way, Adadelta continues
  learning even when many updates have been done. Compared to Adagrad, in the
  original version of Adadelta you don't have to set an initial learning
  rate. In this version, initial learning rate and decay factor can
  be set, as in most other Keras optimizers.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
      lr: float >= 0. Initial learning rate, defaults to 1.
          It is recommended to leave it at the default value.
      rho: float >= 0. Adadelta decay factor, corresponding to fraction of
          gradient to keep at each time step.
      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
      decay: float >= 0. Initial learning rate decay.
      **kwargs: `clipnorm` / `clipvalue`, forwarded to `Optimizer.__init__`.

  References:
      - [Adadelta - an adaptive learning rate method](http://arxiv.org/abs/1212.5701)
  """

  def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs):
    super(Adadelta, self).__init__(**kwargs)
    # `lr`, `decay` and the step counter are backend variables created under
    # a name scope named after the concrete class.
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = K.epsilon()
    # `rho` and `epsilon` are kept as plain Python values here, not as
    # backend variables.
    self.rho = rho
    self.epsilon = epsilon
    # Plain Python copy of the constructor argument; `get_updates` tests it
    # to decide whether the decay ops are built at all.
    self.initial_decay = decay

  def get_updates(self, loss, params):
    """Builds the update ops for one Adadelta step.

    Arguments:
        loss: Loss tensor.
        params: List of variables to update.

    Returns:
        `self.updates`: the increment of `iterations`, followed for each
        parameter by the assignments of its gradient accumulator, of the
        parameter, and of its delta accumulator.
    """
    grads = self.get_gradients(loss, params)
    shapes = [K.int_shape(p) for p in params]
    # Two zero-initialised variables per parameter: a moving average of
    # squared gradients and a moving average of squared updates.
    accumulators = [K.zeros(shape) for shape in shapes]
    delta_accumulators = [K.zeros(shape) for shape in shapes]
    # NOTE(review): unlike SGD and Adam in this file, `iterations` is not
    # included in `self.weights` here.
    self.weights = accumulators + delta_accumulators
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      # Time-based decay: lr / (1 + decay * iterations).
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                K.dtype(self.decay))))

    for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
      # update accumulator
      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
      self.updates.append(state_ops.assign(a, new_a))

      # use the new accumulator and the *old* delta_accumulator
      update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)
      new_p = p - lr * update

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))

      # update delta_accumulator
      new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update)
      self.updates.append(state_ops.assign(d_a, new_d_a))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged with the base-class config.

    Returns:
        A dict with `lr`, `decay` (current variable values as Python floats),
        `rho` and `epsilon`, plus any clipping arguments.
    """
    config = {
        'lr': float(K.get_value(self.lr)),
        'rho': self.rho,
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adadelta, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
438
439
class Adam(Optimizer):
  """Adam optimizer.

  Default parameters follow those provided in the original paper.

  Arguments:
      lr: float >= 0. Learning rate.
      beta_1: float, 0 < beta < 1. Generally close to 1.
      beta_2: float, 0 < beta < 1. Generally close to 1.
      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
      decay: float >= 0. Learning rate decay over each update.
      amsgrad: boolean. Whether to apply the AMSGrad variant of this
          algorithm from the paper "On the Convergence of Adam and
          Beyond".
      **kwargs: `clipnorm` / `clipvalue`, forwarded to `Optimizer.__init__`.

  """

  def __init__(self,
               lr=0.001,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               amsgrad=False,
               **kwargs):
    super(Adam, self).__init__(**kwargs)
    # Hyperparameters and the step counter are backend variables created
    # under a name scope named after the concrete class.
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
      self.decay = K.variable(decay, name='decay')
    if epsilon is None:
      epsilon = K.epsilon()
    # `epsilon` is kept as a plain Python value, not a backend variable.
    self.epsilon = epsilon
    # Plain Python copy of the constructor argument; `get_updates` tests it
    # to decide whether the decay ops are built at all.
    self.initial_decay = decay
    self.amsgrad = amsgrad

  def get_updates(self, loss, params):
    """Builds the update ops for one Adam step.

    Arguments:
        loss: Loss tensor.
        params: List of variables to update.

    Returns:
        `self.updates`: for each parameter, the assignments of its moment
        variables (and of `vhat` when `amsgrad` is set) and of the parameter.
        The increment of `iterations` is not in the list; it is attached as a
        control dependency of the step count `t`.
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      # Time-based decay: lr / (1 + decay * iterations).
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                K.dtype(self.decay))))

    # The cast is created under a control dependency on the increment op,
    # presumably so that `t` reflects the incremented step count.
    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, K.floatx())
    # Step size with both bias corrections folded in:
    # lr * sqrt(1 - beta_2^t) / (1 - beta_1^t).
    lr_t = lr * (
        K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
        (1. - math_ops.pow(self.beta_1, t)))

    # Zero-initialised first and second moment variables, one pair per
    # parameter, in the parameter's dtype.
    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
      vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
      # Size-1 variables; they are stored in `self.weights` but never
      # assigned when `amsgrad` is False.
      vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
      m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
      v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
      if self.amsgrad:
        # AMSGrad: divide by the element-wise maximum of the stored `vhat`
        # and the new second moment, and store that maximum.
        vhat_t = math_ops.maximum(vhat, v_t)
        p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
        self.updates.append(state_ops.assign(vhat, vhat_t))
      else:
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged with the base-class config.

    Returns:
        A dict with `lr`, `beta_1`, `beta_2`, `decay` (current variable
        values as Python floats), `epsilon` and `amsgrad`, plus any clipping
        arguments.
    """
    config = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon,
        'amsgrad': self.amsgrad
    }
    base_config = super(Adam, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
534
535
class Adamax(Optimizer):
  """Adamax optimizer from Adam paper's Section 7.

  It is a variant of Adam based on the infinity norm.
  Default parameters follow those provided in the paper.

  Arguments:
      lr: float >= 0. Learning rate.
      beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
      decay: float >= 0. Learning rate decay over each update.
      **kwargs: `clipnorm` / `clipvalue`, forwarded to `Optimizer.__init__`.

  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               **kwargs):
    super(Adamax, self).__init__(**kwargs)
    # Hyperparameters and the step counter are backend variables created
    # under a name scope named after the concrete class.
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
      self.decay = K.variable(decay, name='decay')
    if epsilon is None:
      epsilon = K.epsilon()
    # `epsilon` is kept as a plain Python value, not a backend variable.
    self.epsilon = epsilon
    # Plain Python copy of the constructor argument; `get_updates` tests it
    # to decide whether the decay ops are built at all.
    self.initial_decay = decay

  def get_updates(self, loss, params):
    """Builds the update ops for one Adamax step.

    Arguments:
        loss: Loss tensor.
        params: List of variables to update.

    Returns:
        `self.updates`: for each parameter, the assignments of its first
        moment, of its infinity-norm variable and of the parameter. The
        increment of `iterations` is not in the list; it is attached as a
        control dependency of the step count `t`.
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      # Time-based decay: lr / (1 + decay * iterations).
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                K.dtype(self.decay))))

    # The cast is created under a control dependency on the increment op,
    # presumably so that `t` reflects the incremented step count.
    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, K.floatx())
    # Only the first moment gets a bias correction: lr / (1 - beta_1^t).
    lr_t = lr / (1. - math_ops.pow(self.beta_1, t))

    shapes = [K.int_shape(p) for p in params]
    # zero init of 1st moment
    ms = [K.zeros(shape) for shape in shapes]
    # zero init of exponentially weighted infinity norm
    us = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + us

    for p, g, m, u in zip(params, grads, ms, us):

      m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
      # Element-wise maximum of the decayed previous value and |g|.
      u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(g))
      p_t = p - lr_t * m_t / (u_t + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(u, u_t))
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged with the base-class config.

    Returns:
        A dict with `lr`, `beta_1`, `beta_2`, `decay` (current variable
        values as Python floats) and `epsilon`, plus any clipping arguments.
    """
    config = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adamax, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
617
618
class Nadam(Optimizer):
  """Nesterov Adam optimizer.

  Much like Adam is essentially RMSprop with momentum,
  Nadam is Adam with Nesterov momentum.

  Default parameters follow those provided in the paper.
  It is recommended to leave the parameters of this optimizer
  at their default values.

  Arguments:
      lr: float >= 0. Learning rate.
      beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
      epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
      schedule_decay: float. Multiplies the step count in the exponent of
          the momentum warm-up schedule built in `get_updates`.
      **kwargs: `clipnorm` / `clipvalue`, forwarded to `Optimizer.__init__`.

  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               schedule_decay=0.004,
               **kwargs):
    super(Nadam, self).__init__(**kwargs)
    # Hyperparameters, the step counter and the running momentum-schedule
    # product are backend variables created under a name scope named after
    # the concrete class.
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.m_schedule = K.variable(1., name='m_schedule')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
    if epsilon is None:
      epsilon = K.epsilon()
    # `epsilon` and `schedule_decay` are kept as plain Python values.
    self.epsilon = epsilon
    self.schedule_decay = schedule_decay

  def get_updates(self, loss, params):
    """Builds the update ops for one Nadam step.

    Arguments:
        loss: Loss tensor.
        params: List of variables to update.

    Returns:
        `self.updates`: a `(m_schedule, new_value)` tuple, followed for each
        parameter by the assignments of its two moment variables and of the
        parameter. The increment of `iterations` is not in the list; it is
        attached as a control dependency of the step count `t`.
    """
    grads = self.get_gradients(loss, params)
    self.updates = []

    # The cast is created under a control dependency on the increment op,
    # presumably so that `t` reflects the incremented step count.
    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, K.floatx())

    # NOTE(review): the references [1] and [2] cited in the comments below
    # are not listed anywhere in this file -- TODO add the citations.
    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_t = self.beta_1 * (
        1. - 0.5 *
        (math_ops.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
    momentum_cache_t_1 = self.beta_1 * (
        1. - 0.5 *
        (math_ops.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
    # Running product of the momentum coefficients up to step t, and the
    # same product extended by the coefficient for step t + 1.
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
    # Appended as a (variable, new_value) tuple rather than an assign op,
    # unlike the other updates in this method.
    self.updates.append((self.m_schedule, m_schedule_new))

    shapes = [K.int_shape(p) for p in params]
    # Zero-initialised first and second moment variables, one pair per
    # parameter.
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]

    self.weights = [self.iterations, self.m_schedule] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
      # the following equations given in [1]
      g_prime = g / (1. - m_schedule_new)
      m_t = self.beta_1 * m + (1. - self.beta_1) * g
      m_t_prime = m_t / (1. - m_schedule_next)
      v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(g)
      v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))
      # Blend of the rescaled gradient and the rescaled first moment.
      m_t_bar = (
          1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))

      p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    """Returns the hyperparameters merged with the base-class config.

    Returns:
        A dict with `lr`, `beta_1`, `beta_2` (current variable values as
        Python floats), `epsilon` and `schedule_decay`, plus any clipping
        arguments.
    """
    config = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'epsilon': self.epsilon,
        'schedule_decay': self.schedule_decay
    }
    base_config = super(Nadam, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
712
713
class TFOptimizer(Optimizer, trackable.Trackable):
  """Adapter exposing a native TensorFlow optimizer as a Keras optimizer.

  Gradient computation and application are delegated to the wrapped
  optimizer. `weights`, `get_config` and `from_config` are not supported and
  raise `NotImplementedError`.
  """

  def __init__(self, optimizer, iterations=None):  # pylint: disable=super-init-not-called
    """Wraps `optimizer` and registers it and the step counter for tracking.

    Arguments:
        optimizer: The native TensorFlow optimizer to delegate to.
        iterations: Optional step counter. When `None`, an int64 backend
            variable initialised to 0 is created.
    """
    self.optimizer = optimizer
    self._track_trackable(optimizer, name='optimizer')
    if iterations is not None:
      self.iterations = iterations
    else:
      with K.name_scope(self.__class__.__name__):
        self.iterations = K.variable(0, dtype='int64', name='iterations')
    self._track_trackable(self.iterations, name='global_step')

  def apply_gradients(self, grads):
    """Applies `grads` through the wrapped optimizer, stepping `iterations`."""
    self.optimizer.apply_gradients(grads, global_step=self.iterations)

  def get_grads(self, loss, params):
    """Returns the wrapped optimizer's `compute_gradients(loss, params)`."""
    return self.optimizer.compute_gradients(loss, params)

  def get_updates(self, loss, params):
    """Builds the list of update ops for one training step.

    Arguments:
        loss: Loss tensor.
        params: List of variables; may be empty.

    Returns:
        `self.updates`. Without a distribution strategy and with empty
        `params` this is just the increment of `iterations`; otherwise it is
        the single op returned by the wrapped optimizer's `apply_gradients`.
    """
    if not distribution_strategy_context.has_strategy():
      if not params:
        self.updates = [state_ops.assign_add(self.iterations, 1)]
        return self.updates

      # Updates list starts out empty because the iterations variable is
      # incremented in optimizer.apply_gradients()
      self.updates = []
      grads_and_vars = self.optimizer.compute_gradients(loss, params)
      train_op = self.optimizer.apply_gradients(
          grads_and_vars, global_step=self.iterations)
      self.updates.append(train_op)
      return self.updates

    self.updates = []
    if params:
      grads_and_vars = self.optimizer.compute_gradients(loss, params)
    else:
      # After the model vars have been created, the second call to get_updates
      # is called with params as an empty list. This ensures that we call
      # compute_gradients with params=None.
      grads_and_vars = self.optimizer.compute_gradients(loss)
    step = training_util.get_global_step()
    train_op = self.optimizer.apply_gradients(grads_and_vars, step)
    self.updates.append(train_op)
    return self.updates

  @property
  def weights(self):
    """Not supported by this wrapper."""
    raise NotImplementedError

  def get_config(self):
    """Not supported by this wrapper."""
    raise NotImplementedError

  def from_config(self, config):
    """Not supported by this wrapper."""
    raise NotImplementedError
771
772
# Aliases.
# Lowercase module-level names bound to the optimizer classes defined above.
# Note that `deserialize` below maps these same lowercase names to the
# `optimizer_v2` classes, not to these aliases.

sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax
nadam = Nadam
782
783
@keras_export('keras.optimizers.serialize')
def serialize(optimizer):
  """Serializes an optimizer by delegating to `serialize_keras_object`.

  Arguments:
      optimizer: The optimizer instance to serialize.

  Returns:
      Whatever `serialize_keras_object` returns for `optimizer`.
  """
  serialized = serialize_keras_object(optimizer)
  return serialized
787
788
@keras_export('keras.optimizers.deserialize')
def deserialize(config, custom_objects=None):
  """Inverse of the `serialize` function.

  Built-in optimizer names are matched case-insensitively and resolve to the
  `optimizer_v2` implementations. The `config` dictionary passed by the
  caller is not modified.

  Arguments:
      config: Optimizer configuration dictionary.
      custom_objects: Optional dictionary mapping
          names (strings) to custom objects
          (classes and functions)
          to be considered during deserialization.

  Returns:
      A Keras Optimizer instance.
  """
  all_classes = {
      'adadelta': adadelta_v2.Adadelta,
      'adagrad': adagrad_v2.Adagrad,
      'adam': adam_v2.Adam,
      'adamax': adamax_v2.Adamax,
      'nadam': nadam_v2.Nadam,
      'rmsprop': rmsprop_v2.RMSprop,
      'sgd': gradient_descent_v2.SGD,
      'ftrl': ftrl.Ftrl
  }

  # Make deserialization case-insensitive for built-in optimizers.
  class_name = config['class_name'].lower()
  if class_name in all_classes:
    # Normalize on a shallow copy so the caller's dictionary keeps the
    # class name exactly as it was passed in.
    config = dict(config, class_name=class_name)
  return deserialize_keras_object(
      config,
      module_objects=all_classes,
      custom_objects=custom_objects,
      printable_module_name='optimizer')
822
823
@keras_export('keras.optimizers.get')
def get(identifier):
  """Retrieves a Keras Optimizer instance.

  Arguments:
      identifier: Optimizer identifier, one of
          - String: name of an optimizer
          - Dictionary: configuration dictionary.
          - Keras Optimizer instance (it will be returned unchanged).
          - TensorFlow Optimizer instance
              (it will be wrapped as a Keras Optimizer).

  Returns:
      A Keras Optimizer instance.

  Raises:
      ValueError: If `identifier` cannot be interpreted.
  """
  if isinstance(identifier, (Optimizer, optimizer_v2.OptimizerV2)):
    return identifier
  # Wrap TF optimizer instances
  elif isinstance(identifier, tf_optimizer_module.Optimizer):
    opt = TFOptimizer(identifier)
    # Register the wrapper with the backend's optimizer tracking.
    K.track_tf_optimizer(opt)
    return opt
  elif isinstance(identifier, dict):
    return deserialize(identifier)
  elif isinstance(identifier, six.string_types):
    # A bare name is deserialized with an empty config, i.e. with the
    # optimizer's default arguments.
    config = {'class_name': str(identifier), 'config': {}}
    return deserialize(config)
  else:
    # Format into a single string: passing the identifier as a second
    # exception argument would render the message as a tuple.
    raise ValueError(
        'Could not interpret optimizer identifier: {}'.format(identifier))
856