# Source code for ablator.modules.optimizer

from collections import abc
import inspect
import typing as ty
from abc import abstractmethod

import torch
from torch import nn
from torch.optim import SGD, Adam, AdamW, Optimizer

from ablator.config.main import ConfigBase, configclass
from ablator.config.types import Tuple


def get_optim_parameters(
    model: torch.nn.Module,
) -> abc.Iterator[nn.Parameter]:
    """
    Get the model parameters to be optimized.

    If the model (or, for an ``nn.DataParallel`` wrapper, the wrapped ``model.module``)
    defines a bound method named ``get_optim_param``, the value returned by that method
    is used as-is. Otherwise the default ``model.parameters()`` iterator is returned.

    Parameters
    ----------
    model : torch.nn.Module
        The model for which to get parameters that will be optimized.

    Returns
    -------
    abc.Iterator[nn.Parameter]
        The parameters to optimize. By default this is the iterator returned by
        ``model.parameters()``. When the model defines ``get_optim_param``, whatever that
        method returns is passed through unchanged (for example a list of parameter-group
        dictionaries). Please see the PyTorch ``Optimizer`` documentation for the
        accepted formats.

    Notes
    -----
    Only bound methods are considered (``inspect.ismethod``). A ``get_optim_param``
    attribute of any other kind is ignored and the default parameters are used.

    Examples
    --------
    >>> class MyModel(nn.Module):
    ...     def __init__(self, embedding_dim=10, vocab_size=10, *args, **kwargs) -> None:
    ...         super().__init__(*args, **kwargs)
    ...         self.param = nn.Parameter(torch.ones(100))
    ...         self.embedding = nn.Embedding(num_embeddings=vocab_size,
    ...                                       embedding_dim=embedding_dim)
    ...         self.norm_layer = nn.LayerNorm(embedding_dim)
    ...     def forward(self):
    ...         x = self.param + torch.rand_like(self.param) * 0.01
    ...         return x.sum().abs()
    ...     def get_optim_param(self):
    ...         return [{"params": [self.param], 'weight_decay':0.2}]
    >>> mM = MyModel()
    >>> groups = get_optim_parameters(mM)
    >>> groups[0]["weight_decay"]
    0.2
    >>> groups[0]["params"][0] is mM.param
    True
    """
    # Look through a DataParallel wrapper so that a hook defined on the wrapped
    # module is found.
    _model = model.module if isinstance(model, nn.DataParallel) else model
    fn = getattr(_model, "get_optim_param", None)
    # ``inspect.ismethod(None)`` is False, so a missing hook falls through.
    if inspect.ismethod(fn):
        return fn()
    return model.parameters()


@configclass
class OptimizerArgs(ConfigBase):
    """
    A base class for optimizer arguments. It declares the learning rate ``lr`` that is
    shared by the optimizer argument classes in this module, and the ``init_optimizer``
    method that derived classes implement.

    Attributes
    ----------
    lr : float
        Learning rate of the optimizer
    """

    lr: float

    @abstractmethod
    def init_optimizer(self, model: nn.Module) -> Optimizer:
        """
        Abstract method to be implemented by derived classes, which initializes the optimizer.

        Parameters
        ----------
        model : nn.Module
            The model whose parameters the optimizer will optimize.

        Returns
        -------
        Optimizer
            The optimizer created by the derived class.

        Raises
        ------
        NotImplementedError
            When this base implementation is called.
        """
        raise NotImplementedError("init_optimizer method not implemented.")


@configclass
class OptimizerConfig(ConfigBase):
    """
    Configuration for an optimizer, including optimizer name and arguments (these arguments
    are specific to a certain type of optimizer like SGD, Adam, AdamW). This optimizer config
    will be provided to ``TrainConfig`` as part of the training setting of the experiment.

    Parameters
    ----------
    name : str
        Name of the optimizer, this can be any in ``['adamw', 'adam', 'sgd']``.
    arguments : dict[str, ty.Any]
        Arguments for the optimizer, specific to a certain type of optimizer. A common argument
        can be learning rate, e.g ``{'lr': 0.5}``. If ``name`` is ``"adamw"``, can add ``eps`` to ``arguments``,
        e.g ``{'lr': 0.5, 'eps': 0.001}``. Refer to  `Configuration Basics
        scheduler <./notebooks/Configuration-Basics.ipynb>`_ tutorial for more details on each optimizer's arguments.

    Attributes
    ----------
    name : str
        Name of the optimizer.
    arguments : OptimizerArgs
        Arguments for the optimizer, specific to a certain type of optimizer.

    Raises
    ------
    KeyError
        If ``name`` is not one of the keys of ``OPTIMIZER_CONFIG_MAP``.

    Examples
    --------
    The following example shows how to create an optimizer config for SGD optimizer and use it in
    ``TrainConfig`` to define the training setting of the experiment.

    >>> optim_config = OptimizerConfig("sgd", {"lr": 0.5})
    >>> train_config = TrainConfig(
    ...     dataset="[Dataset Name]",
    ...     batch_size=32,
    ...     epochs=20,
    ...     optimizer_config=optim_config,
    ...     scheduler_config=None
    ... )
    >>> # ... create the run config (proto/parallel), model wrapper, trainer and launch the experiment

    .. note::
        Sometimes we want to run ablation studies on different optimizers to learn about their
        effects on the model performance. However, ``OptimizerConfig`` only configures one single
        optimizer for the experiment. But you can run experiments on different optimizers by creating
        a custom config class and add an extra method called ``make_optimizer``. Go to the tutorial on
        `Search space for different types of optimizers and
        scheduler <./notebooks/Searchspace-for-diff-optimizers.ipynb>`_ for more details.

    """

    name: str
    arguments: OptimizerArgs

    def __init__(self, name: str, arguments: dict[str, ty.Any]):
        # Resolve the argument class registered for this optimizer name and build it
        # from the provided settings.
        try:
            argument_cls = OPTIMIZER_CONFIG_MAP[name]
        except KeyError as err:
            # Same exception type as the plain lookup, but with an actionable message.
            raise KeyError(
                f"Unknown optimizer name '{name}'. "
                f"Supported optimizers: {sorted(OPTIMIZER_CONFIG_MAP)}"
            ) from err
        _arguments = argument_cls(**arguments)
        super().__init__(name=name, arguments=_arguments)

    def make_optimizer(self, model: nn.Module) -> Optimizer:
        """
        Creates and returns an optimizer for the given model by delegating to
        ``self.arguments.init_optimizer``.

        Parameters
        ----------
        model : nn.Module
            The model to optimize.

        Returns
        -------
        optimizer : Optimizer
            The created optimizer.

        Examples
        --------
        >>> optim_config = OptimizerConfig("sgd", {"lr": 0.5, "weight_decay": 0.5})
        >>> optimizer = optim_config.make_optimizer(nn.Linear(4, 2))
        >>> type(optimizer).__name__
        'SGD'
        >>> optimizer.defaults["weight_decay"]
        0.5
        """
        return self.arguments.init_optimizer(model)


@configclass
class SGDConfig(OptimizerArgs):
    """
    Configuration for an SGD optimizer. This class has ``init_optimizer()`` method,
    which is used to initialize and return an SGD optimizer.

    Attributes
    ----------
    weight_decay : float
        Weight decay rate, by default ``0.0``.
    momentum : float
        Momentum factor, by default ``0.0``.

    Examples
    --------
    >>> config = SGDConfig(lr=0.1, momentum=0.9)
    """

    weight_decay: float = 0.0
    momentum: float = 0.0

    def init_optimizer(self, model: nn.Module) -> SGD:
        """
        Creates and returns an SGD optimizer that optimizes the model's parameters. These parameters
        are obtained via ``get_optim_parameters`` before being used to initialize the optimizer.

        Parameters
        ----------
        model : nn.Module
            The model that has parameters that the optimizer will optimize.

        Returns
        -------
        optimizer : SGD
            The created SGD optimizer.

        Examples
        --------
        >>> config = SGDConfig(lr=0.1, weight_decay=0.5, momentum=0.9)
        >>> optimizer = config.init_optimizer(nn.Linear(4, 2))
        >>> type(optimizer).__name__
        'SGD'
        >>> optimizer.defaults["momentum"]
        0.9
        """
        # The dictionary form of this config is forwarded as keyword arguments to SGD.
        kwargs = self.to_dict()
        model_parameters = get_optim_parameters(model)
        return SGD(model_parameters, **kwargs)


@configclass
class AdamWConfig(OptimizerArgs):
    """
    Configuration for an AdamW optimizer. This class has ``init_optimizer()`` method
    used to initialize and return an ``AdamW`` optimizer.

    Attributes
    ----------
    betas : Tuple[float, float]
        Coefficients for computing running averages of gradient and its square, by default ``(0.9, 0.999)``.
    eps : float
        Term added to the denominator to improve numerical stability, by default ``1e-8``.
    weight_decay : float
        Weight decay rate, by default ``0.01``.

    Examples
    --------
    >>> config = AdamWConfig(lr=0.1, weight_decay=0.5, betas=(0.9,0.99))
    """

    betas: Tuple[float, float] = (0.9, 0.999)
    eps: float = 1e-8
    weight_decay: float = 0.01

    def init_optimizer(self, model: nn.Module) -> AdamW:
        """
        Creates and returns an ``AdamW`` optimizer that optimizes the model's parameters. These parameters
        are obtained via ``get_optim_parameters`` before being used to initialize the optimizer.

        Parameters
        ----------
        model : nn.Module
            The model that has parameters that the optimizer will optimize.

        Returns
        -------
        AdamW
            An instance of the ``AdamW`` optimizer.

        Examples
        --------
        >>> config = AdamWConfig(lr=0.1, weight_decay=0.5, betas=(0.9,0.99), eps=0.001)
        >>> optimizer = config.init_optimizer(nn.Linear(4, 2))
        >>> type(optimizer).__name__
        'AdamW'
        >>> optimizer.defaults["eps"]
        0.001
        """
        # The dictionary form of this config is forwarded as keyword arguments to AdamW.
        kwargs = self.to_dict()
        model_parameters = get_optim_parameters(model)
        return AdamW(model_parameters, **kwargs)


@configclass
class AdamConfig(OptimizerArgs):
    """
    Configuration for an ``Adam`` optimizer. This class has ``init_optimizer()`` method
    used to initialize and return an ``Adam`` optimizer.

    Attributes
    ----------
    betas : Tuple[float, float]
        Coefficients for computing running averages of gradient and its square, by default ``(0.9, 0.999)``.
    weight_decay : float
        Weight decay rate, by default ``0.0``.

    Examples
    --------
    >>> config = AdamConfig(lr=0.1, weight_decay=0.5, betas=(0.6,0.9))
    """

    betas: Tuple[float, float] = (0.9, 0.999)
    weight_decay: float = 0.0

    def init_optimizer(self, model: nn.Module) -> Adam:
        """
        Creates and returns an ``Adam`` optimizer that optimizes the model's parameters. These parameters
        are obtained via ``get_optim_parameters`` before being used to initialize the optimizer.

        Parameters
        ----------
        model : nn.Module
            The model that has parameters that the optimizer will optimize.

        Returns
        -------
        Adam
            An instance of the ``Adam`` optimizer.

        Examples
        --------
        >>> config = AdamConfig(lr=0.1, weight_decay=0.5, betas=(0.6,0.9))
        >>> optimizer = config.init_optimizer(nn.Linear(4, 2))
        >>> type(optimizer).__name__
        'Adam'
        >>> optimizer.defaults["betas"]
        (0.6, 0.9)
        """
        # The dictionary form of this config is forwarded as keyword arguments to Adam.
        kwargs = self.to_dict()
        model_parameters = get_optim_parameters(model)
        return Adam(model_parameters, **kwargs)


# Registry used by ``OptimizerConfig.__init__``: maps the optimizer name to the
# argument class that is instantiated from the user-provided ``arguments``.
OPTIMIZER_CONFIG_MAP: dict[str, type] = dict(
    adamw=AdamWConfig,
    adam=AdamConfig,
    sgd=SGDConfig,
)