@@ -44,29 +44,31 @@ class Adam(optimizer.Optimizer):
     data/parameters*".

     Args:
-      learning_rate: A `tf.Tensor`, floating point value, a schedule that is a
-        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
-        that takes no arguments and returns the actual value to use. The
-        learning rate. Defaults to `0.001`.
-      beta_1: A float value or a constant float tensor, or a callable
-        that takes no arguments and returns the actual value to use. The
-        exponential decay rate for the 1st moment estimates. Defaults to `0.9`.
-      beta_2: A float value or a constant float tensor, or a callable
-        that takes no arguments and returns the actual value to use. The
-        exponential decay rate for the 2nd moment estimates. Defaults to
-        `0.999`.
-      epsilon: A small constant for numerical stability. This epsilon is
-        "epsilon hat" in the Kingma and Ba paper (in the formula just before
-        Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
-        `1e-7`.
-      amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm from
-        the paper "On the Convergence of Adam and beyond". Defaults to `False`.
-      {{base_optimizer_keyword_args}}
+      learning_rate: A `tf.Tensor`, floating point value, a schedule that is a
+        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
+        that takes no arguments and returns the actual value to use. The
+        learning rate. Defaults to `0.001`.
+      beta_1: A float value or a constant float tensor, or a callable
+        that takes no arguments and returns the actual value to use. The
+        exponential decay rate for the 1st moment estimates.
+        Defaults to `0.9`.
+      beta_2: A float value or a constant float tensor, or a callable
+        that takes no arguments and returns the actual value to use. The
+        exponential decay rate for the 2nd moment estimates.
+        Defaults to `0.999`.
+      epsilon: A small constant for numerical stability. This epsilon is
+        "epsilon hat" in the Kingma and Ba paper (in the formula just before
+        Section 2.1), not the epsilon in Algorithm 1 of the paper.
+        Defaults to `1e-7`.
+      amsgrad: Boolean. Whether to apply AMSGrad variant of this algorithm
+        from the paper "On the Convergence of Adam and beyond".
+        Defaults to `False`.
+      {{base_optimizer_keyword_args}}

     Reference:
-      - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
-      - [Reddi et al., 2018](
-        https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.
+      - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+      - [Reddi et al., 2018](
+        https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.

     Notes:

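The arguments re-wrapped in this hunk map directly onto the public constructor, so a minimal usage sketch may help sanity-check the wording. The values below are illustrative only; the defaults quoted in the docstring are used explicitly, and only standard `tf.keras` API calls appear:

import tensorflow as tf

# Illustrative values; each keyword matches an entry in the Args section above.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=1e-3,   # or a LearningRateSchedule / zero-argument callable
    beta_1=0.9,           # exponential decay rate for the 1st moment estimates
    beta_2=0.999,         # exponential decay rate for the 2nd moment estimates
    epsilon=1e-7,         # the "epsilon hat" of Kingma and Ba
    amsgrad=False,        # set True for the AMSGrad variant (Reddi et al., 2018)
)

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.compile(optimizer=optimizer, loss="mse")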
@@ -130,7 +132,7 @@ def build(self, var_list):
         velocity_hat (only set when amsgrad is applied),

         Args:
-            var_list: list of model variables to build Adam variables on.
+            var_list: list of model variables to build Adam variables on.
         """
         super().build(var_list)
         if hasattr(self, "_built") and self._built:
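For context on when `build()` actually runs (and why the `_built` guard above matters): the momentum/velocity slots, plus velocity_hat when `amsgrad=True`, are created the first time gradients are applied. A small, hypothetical sketch using only standard TensorFlow calls:

import tensorflow as tf

# Toy variable for illustration; build() is triggered by the first apply_gradients call.
var = tf.Variable([1.0, 2.0])
optimizer = tf.keras.optimizers.Adam(amsgrad=True)  # amsgrad=True adds velocity_hat slots

with tf.GradientTape() as tape:
    loss = tf.reduce_sum(var ** 2)
grads = tape.gradient(loss, [var])
optimizer.apply_gradients(zip(grads, [var]))  # slots now exist; a second call must not recreate them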
@@ -160,8 +162,6 @@ def build(self, var_list):

     def update_step(self, gradient, variable):
         """Update step given gradient and the associated model variable."""
-        beta_1_power = None
-        beta_2_power = None
         lr = tf.cast(self.learning_rate, variable.dtype)
         local_step = tf.cast(self.iterations + 1, variable.dtype)
         beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)
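On the removal of the two `None` initializations: both powers are recomputed from `local_step` immediately afterwards (as the last context line shows), so the deleted assignments were dead stores. For reviewers wanting the math these quantities feed into, here is a pure-Python sketch of the textbook dense Adam recurrence in the "epsilon hat" form the docstring describes; it is an illustration under those assumptions, not the exact Keras kernel, and it ignores amsgrad:

import math

def adam_step(param, grad, m, v, step,
              lr=1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-7):
    """One illustrative Adam update for a single scalar parameter."""
    beta_1_power = beta_1 ** step          # mirrors tf.pow(beta_1, local_step)
    beta_2_power = beta_2 ** step
    # Bias-corrected step size.
    alpha = lr * math.sqrt(1 - beta_2_power) / (1 - beta_1_power)
    m = m + (grad - m) * (1 - beta_1)          # 1st moment estimate
    v = v + (grad * grad - v) * (1 - beta_2)   # 2nd moment estimate
    param = param - alpha * m / (math.sqrt(v) + epsilon)
    return param, m, v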