# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adagrad optimizer implementation."""
# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend_config
from tensorflow.python.keras.optimizer_v2 import optimizer_v2
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.training import gen_training_ops
from tensorflow.python.util.tf_export import keras_export


@keras_export('keras.optimizers.Adagrad')
class Adagrad(optimizer_v2.OptimizerV2):
  r"""Optimizer that implements the Adagrad algorithm.

  Adagrad is an optimizer with parameter-specific learning rates,
  which are adapted relative to how frequently a parameter gets
  updated during training. The more updates a parameter receives,
  the smaller the updates.

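  A NumPy sketch of the per-parameter update (the fused
  `ResourceApplyAdagradV2` kernel used by this class computes the equivalent);
  `accumulator` is a per-variable slot starting at `initial_accumulator_value`:

  ```python
  import numpy as np

  def adagrad_step(variable, gradient, accumulator,
                   learning_rate=0.001, epsilon=1e-7):
    # Accumulate squared gradients, then scale the step by 1 / sqrt(accumulator).
    accumulator = accumulator + gradient ** 2
    variable = variable - learning_rate * gradient / (
        np.sqrt(accumulator) + epsilon)
    return variable, accumulator
  ```
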
  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate.
    initial_accumulator_value: A floating point value. Starting value for the
      accumulators; must be non-negative.
    epsilon: A small floating point value used to avoid a zero denominator.
    name: Optional name prefix for the operations created when applying
      gradients. Defaults to `"Adagrad"`.
    **kwargs: Keyword arguments. Allowed to be one of `"clipnorm"` or
      `"clipvalue"`. `"clipnorm"` (float) clips gradients by norm;
      `"clipvalue"` (float) clips gradients by value.

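  A minimal usage sketch (values are illustrative):

  ```python
  import tensorflow as tf

  opt = tf.keras.optimizers.Adagrad(learning_rate=0.1)
  var = tf.Variable(10.0)
  loss = lambda: (var ** 2) / 2.0  # d(loss)/d(var) = var
  opt.minimize(loss, var_list=[var])  # runs one Adagrad update step
  ```
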
  Reference:
    - [Duchi et al., 2011](
      http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
  """

  _HAS_AGGREGATE_GRAD = True

  def __init__(self,
               learning_rate=0.001,
               initial_accumulator_value=0.1,
               epsilon=1e-7,
               name='Adagrad',
               **kwargs):
    if initial_accumulator_value < 0.0:
      raise ValueError('initial_accumulator_value must be non-negative: %s' %
                       initial_accumulator_value)
    if epsilon is None:
      epsilon = backend_config.epsilon()
    super(Adagrad, self).__init__(name, **kwargs)
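    # The legacy 'lr' keyword argument is accepted as an alias for
    # 'learning_rate'.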
    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
    self._set_hyper('decay', self._initial_decay)
    self._initial_accumulator_value = initial_accumulator_value
    self.epsilon = epsilon or backend_config.epsilon()

  def _create_slots(self, var_list):
    for var in var_list:
      dtype = var.dtype.base_dtype
      init = init_ops.constant_initializer(
          self._initial_accumulator_value, dtype=dtype)
      self.add_slot(var, 'accumulator', init)

  def _prepare_local(self, var_device, var_dtype, apply_state):
    super(Adagrad, self)._prepare_local(var_device, var_dtype, apply_state)
    apply_state[(var_device, var_dtype)].update(
        dict(
            epsilon=ops.convert_to_tensor_v2_with_dispatch(
                self.epsilon, var_dtype),
            neg_lr_t=-apply_state[(var_device, var_dtype)]['lr_t'],
            zero=array_ops.zeros((), dtype=dtypes.int64)))

  def set_weights(self, weights):
    params = self.weights
    # Override set_weights for backward compatibility of Keras V1 optimizer
    # since it does not include iteration at head of the weight list. Set
    # iteration to 0.
    if len(params) == len(weights) + 1:
      weights = [np.array(0)] + weights
    super(Adagrad, self).set_weights(weights)

  @classmethod
  def from_config(cls, config, custom_objects=None):
    """Creates an optimizer from its config.

    This method is the reverse of `get_config`, capable of instantiating the
    same optimizer from the config dictionary.

    Args:
        config: A Python dictionary, typically the output of `get_config`.
        custom_objects: A Python dictionary mapping names to additional Python
          objects used to create this optimizer, such as a function used for a
          hyperparameter.

    Returns:
        An optimizer instance.
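
    A round-trip sketch (the optimizer can be reconstructed from its own
    config; `custom_objects` is only needed for non-standard hyperparameter
    objects):

    ```python
    opt = Adagrad(learning_rate=0.05)
    restored = Adagrad.from_config(opt.get_config())
    ```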
    """
    if 'initial_accumulator_value' not in config:
      config['initial_accumulator_value'] = 0.1
    if 'lr' in config:
      config['learning_rate'] = config.pop('lr')
    return cls(**config)

  def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

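    # Delegate to the fused kernel, which performs:
    #   accum += grad ** 2
    #   var -= lr * grad / (sqrt(accum) + epsilon)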
    acc = self.get_slot(var, 'accumulator')
    return gen_training_ops.ResourceApplyAdagradV2(
        var=var.handle,
        accum=acc.handle,
        lr=coefficients['lr_t'],
        epsilon=coefficients['epsilon'],
        grad=grad,
        use_locking=self._use_locking)

  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

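    # Same fused update as the dense case, applied only to the rows selected
    # by `indices`.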
    acc = self.get_slot(var, 'accumulator')
    return gen_training_ops.ResourceSparseApplyAdagradV2(
        var=var.handle,
        accum=acc.handle,
        lr=coefficients['lr_t'],
        epsilon=coefficients['epsilon'],
        grad=grad,
        indices=indices,
        use_locking=self._use_locking)

  def get_config(self):
    config = super(Adagrad, self).get_config()
    config.update({
        'learning_rate': self._serialize_hyperparameter('learning_rate'),
        'decay': self._initial_decay,
        'initial_accumulator_value': self._initial_accumulator_value,
        'epsilon': self.epsilon,
    })
    return config