@@ -44,29 +44,31 @@ class Adam(optimizer.Optimizer):
     data/parameters*".

     Args:
-      learning_rate: A `tf.Tensor`, floating point value, a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
-        that takes no arguments and returns the actual value to use. The
-        learning rate. Defaults to `0.001`.
-      beta_1: A float value or a constant float tensor, or a callable
-        that takes no arguments and returns the actual value to use. The
-        exponential decay rate for the 1st moment estimates. Defaults to `0.9`.
-      beta_2: A float value or a constant float tensor, or a callable
-        that takes no arguments and returns the actual value to use. The
-        exponential decay rate for the 2nd moment estimates. Defaults to
-        `0.999`.
-      epsilon: A small constant for numerical stability. This epsilon is
-        "epsilon hat" in the Kingma and Ba paper (in the formula just before
-        Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
-        `1e-7`.
-      amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from
-        the paper "On the Convergence of Adam and beyond". Defaults to `False`.
-      {{base_optimizer_keyword_args}}
+      learning_rate: A `tf.Tensor`, floating point value, a schedule that is a
+        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
+        that takes no arguments and returns the actual value to use. The
+        learning rate. Defaults to `0.001`.
+      beta_1: A float value or a constant float tensor, or a callable
+        that takes no arguments and returns the actual value to use. The
+        exponential decay rate for the 1st moment estimates.
+        Defaults to `0.9`.
+      beta_2: A float value or a constant float tensor, or a callable
+        that takes no arguments and returns the actual value to use. The
+        exponential decay rate for the 2nd moment estimates.
+        Defaults to `0.999`.
+      epsilon: A small constant for numerical stability. This epsilon is
+        "epsilon hat" in the Kingma and Ba paper (in the formula just before
+        Section 2.1), not the epsilon in Algorithm 1 of the paper.
+        Defaults to `1e-7`.
+      amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm
+        from the paper "On the Convergence of Adam and beyond".
+        Defaults to `False`.
+      {{base_optimizer_keyword_args}}

     Reference:
-      - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
-      - [Reddi et al., 2018](
-        https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.
+      - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+      - [Reddi et al., 2018](
+        https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.

     Notes:

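The arguments re-wrapped in this hunk map directly onto the public constructor, so a minimal usage sketch may help sanity-check the wording. The values below are illustrative only; the defaults quoted in the docstring are used explicitly, and only standard `tf.keras` API calls appear:

import tensorflow as tf

# Illustrative values; each keyword matches an entry in the Args section above.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=1e-3,   # or a LearningRateSchedule / zero-argument callable
    beta_1=0.9,           # exponential decay rate for the 1st moment estimates
    beta_2=0.999,         # exponential decay rate for the 2nd moment estimates
    epsilon=1e-7,         # the "epsilon hat" of Kingma and Ba
    amsgrad=False,        # set True for the AMSGrad variant (Reddi et al., 2018)
)

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.compile(optimizer=optimizer, loss="mse")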
@@ -130,7 +132,7 @@ def build(self, var_list):
         velocity_hat (only set when amsgrad is applied),

         Args:
-            var_list: list of model variables to build Adam variables on.
+            var_list: list of model variables to build Adam variables on.
         """
         super().build(var_list)
         if hasattr(self, "_built") and self._built:
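For context on when `build()` actually runs (and why the `_built` guard above matters): the momentum/velocity slots, plus velocity_hat when `amsgrad=True`, are created the first time gradients are applied. A small, hypothetical sketch using only standard TensorFlow calls:

import tensorflow as tf

# Toy variable for illustration; build() is triggered by the first apply_gradients call.
var = tf.Variable([1.0, 2.0])
optimizer = tf.keras.optimizers.Adam(amsgrad=True)  # amsgrad=True adds velocity_hat slots

with tf.GradientTape() as tape:
    loss = tf.reduce_sum(var ** 2)
grads = tape.gradient(loss, [var])
optimizer.apply_gradients(zip(grads, [var]))  # slots now exist; a second call must not recreate them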
@@ -160,8 +162,6 @@ def build(self, var_list):

     def update_step(self, gradient, variable):
         """Update step given gradient and the associated model variable."""
-        beta_1_power = None
-        beta_2_power = None
         lr = tf.cast(self.learning_rate, variable.dtype)
         local_step = tf.cast(self.iterations + 1, variable.dtype)
         beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)
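On the removal of the two `None` initializations: both powers are recomputed from `local_step` immediately afterwards (as the last context line shows), so the deleted assignments were dead stores. For reviewers wanting the math these quantities feed into, here is a pure-Python sketch of the textbook dense Adam recurrence in the "epsilon hat" form the docstring describes; it is an illustration under those assumptions, not the exact Keras kernel, and it ignores amsgrad:

import math

def adam_step(param, grad, m, v, step,
              lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-7):
    """One illustrative Adam update for a single scalar parameter."""
    beta_1_power = beta_1 ** step          # mirrors tf.pow(beta_1, local_step)
    beta_2_power = beta_2 ** step
    # Bias-corrected step size.
    alpha = lr * math.sqrt(1 - beta_2_power) / (1 - beta_1_power)
    m = m + (grad - m) * (1 - beta_1)          # 1st moment estimate
    v = v + (grad * grad - v) * (1 - beta_2)   # 2nd moment estimate
    param = param - alpha * m / (math.sqrt(v) + epsilon)
    return param, m, v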