Source code for ablator.modules.optimizer

from collections import abc
import inspect
import typing as ty
from abc import abstractmethod

import torch
from torch import nn
from torch.optim import SGD, Adam, AdamW, Optimizer

from ablator.config.main import ConfigBase, configclass
from ablator.config.types import Tuple


def get_optim_parameters(
    model: torch.nn.Module,
) -> abc.Iterator[nn.Parameter]:
    """
    Get the model parameters to be optimized.

    The function first looks for a user-defined ``get_optim_param`` hook on
    the model and returns whatever that hook produces. When the model does
    not define such a hook, it falls back to the default torch
    ``model.parameters()``.

    Parameters
    ----------
    model : torch.nn.Module
        The model for which to get the parameters that will be optimized.
        If the model is wrapped in ``nn.DataParallel`` or
        ``nn.parallel.DistributedDataParallel``, the hook is looked up on
        the wrapped ``model.module``.

    Returns
    -------
    abc.Iterator[nn.Parameter]
        The parameters to optimize. When the ``get_optim_param`` hook is
        used, the return value is whatever the hook returns (for example a
        list of parameter-group dictionaries). Please see the PyTorch
        Optimizer documentation for the accepted formats.

    Notes
    -----
    The hook is honored when it is a bound method (regular method or
    ``classmethod``) or a plain function (``staticmethod`` or a function
    assigned to the instance). Any other kind of attribute named
    ``get_optim_param`` is ignored and the default parameters are returned.

    Examples
    --------
    >>> class MyModel(nn.Module):
    ...     def __init__(self, embedding_dim=10, vocab_size=10, *args, **kwargs) -> None:
    ...         super().__init__(*args, **kwargs)
    ...         self.param = nn.Parameter(torch.ones(100))
    ...         self.embedding = nn.Embedding(num_embeddings=vocab_size,
    ...                                       embedding_dim=embedding_dim)
    ...         self.norm_layer = nn.LayerNorm(embedding_dim)
    ...     def forward(self):
    ...         x = self.param + torch.rand_like(self.param) * 0.01
    ...         return x.sum().abs()
    ...     def get_optim_param(self):
    ...         return [{"params": [self.param], 'weight_decay': 0.2}]
    >>> mM = MyModel()
    >>> groups = get_optim_parameters(mM)
    >>> groups[0]["weight_decay"]
    0.2
    >>> groups[0]["params"][0] is mM.param
    True
    """
    # Both parallel wrappers keep the user's model under ``.module``; the
    # custom hook lives on that inner model, not on the wrapper.
    wrappers = (nn.DataParallel, nn.parallel.DistributedDataParallel)
    _model = model.module if isinstance(model, wrappers) else model

    fn = getattr(_model, "get_optim_param", None)
    # Accept bound methods as well as plain functions so that a hook declared
    # as a ``staticmethod`` is not silently skipped.
    if inspect.ismethod(fn) or inspect.isfunction(fn):
        return fn()
    return model.parameters()
@configclass
class OptimizerArgs(ConfigBase):
    """
    Common base for the per-optimizer argument configurations.

    Only the learning rate is declared at this level. The optimizer-specific
    configurations in this module extend it with their own options and
    provide the ``init_optimizer`` implementation.

    Attributes
    ----------
    lr : float
        Learning rate of the optimizer.
    """

    lr: float

    @abstractmethod
    def init_optimizer(self, model: nn.Module):
        """
        Build the optimizer for ``model``; must be overridden by subclasses.

        Parameters
        ----------
        model : nn.Module
            The model whose parameters the optimizer will update.

        Raises
        ------
        NotImplementedError
            Always raised by this base implementation.
        """
        raise NotImplementedError("init_optimizer method not implemented.")
@configclass
class OptimizerConfig(ConfigBase):
    """
    Optimizer section of an experiment configuration.

    It pairs the optimizer ``name`` with the arguments specific to that kind
    of optimizer (SGD, Adam, AdamW). The resulting object is meant to be
    handed to ``TrainConfig`` as part of the training setting of the
    experiment.

    Parameters
    ----------
    name : str
        Name of the optimizer, any of ``['adamw', 'adam', 'sgd']``.
    arguments : dict[str, ty.Any]
        Arguments for the optimizer, specific to the selected optimizer
        type. A common argument is the learning rate, e.g. ``{'lr': 0.5}``.
        With ``name="adamw"`` one can additionally pass ``eps``, e.g.
        ``{'lr': 0.5, 'eps': 0.001}``. See the
        `Configuration Basics scheduler <./notebooks/Configuration-Basics.ipynb>`_
        tutorial for more details on each optimizer's arguments.

    Attributes
    ----------
    name : str
        Name of the optimizer.
    arguments : OptimizerArgs
        Argument configuration object built from the ``arguments`` dict.

    Examples
    --------
    Create an optimizer config for SGD and use it in ``TrainConfig`` to
    define the training setting of the experiment:

    >>> optim_config = OptimizerConfig("sgd", {"lr": 0.5})
    >>> train_config = TrainConfig(
    ...     dataset="[Dataset Name]",
    ...     batch_size=32,
    ...     epochs=20,
    ...     optimizer_config=optim_config,
    ...     scheduler_config=None
    ... )
    >>> # ... create the run config (proto/parallel), model wrapper, trainer
    >>> # and launch the experiment

    .. note::
       ``OptimizerConfig`` configures a single optimizer per experiment. To
       run ablation studies across different optimizers, create a custom
       config class with an extra method called ``make_optimizer``. See the
       tutorial on `Search space for different types of optimizers and
       scheduler <./notebooks/Searchspace-for-diff-optimizers.ipynb>`_ for
       more details.
    """

    name: str
    arguments: OptimizerArgs

    def __init__(self, name: str, arguments: dict[str, ty.Any]):
        # Resolve the argument config class registered under ``name`` and
        # instantiate it from the user-provided settings before delegating
        # to the base initializer.
        super().__init__(
            name=name,
            arguments=OPTIMIZER_CONFIG_MAP[name](**arguments),
        )

    def make_optimizer(self, model: nn.Module) -> Optimizer:
        """
        Create and return the configured optimizer for ``model``.

        The construction is delegated to the ``init_optimizer`` method of
        the stored ``arguments`` object.

        Parameters
        ----------
        model : nn.Module
            The model to optimize.

        Returns
        -------
        optimizer : Optimizer
            The created optimizer.

        Examples
        --------
        >>> optim_config = OptimizerConfig("sgd", {"lr": 0.5, "weight_decay": 0.5})
        >>> optimizer = optim_config.make_optimizer(my_module)
        """
        optimizer_args = self.arguments
        return optimizer_args.init_optimizer(model)
@configclass
class SGDConfig(OptimizerArgs):
    """
    Argument configuration for the ``SGD`` optimizer.

    Its ``init_optimizer()`` method builds and returns an ``SGD`` optimizer
    from the values stored in this configuration.

    Attributes
    ----------
    weight_decay : float
        Weight decay rate, by default ``0.0``.
    momentum : float
        Momentum factor, by default ``0.0``.

    Examples
    --------
    >>> config = SGDConfig(lr=0.1, momentum=0.9)
    """

    weight_decay: float = 0.0
    momentum: float = 0.0

    def init_optimizer(self, model: nn.Module) -> SGD:
        """
        Create an ``SGD`` optimizer for the parameters of ``model``.

        The parameters are obtained through ``get_optim_parameters`` and the
        fields of this configuration (as returned by ``to_dict()``) are
        forwarded to ``SGD`` as keyword arguments.

        Parameters
        ----------
        model : nn.Module
            The model whose parameters the optimizer will optimize.

        Returns
        -------
        optimizer : SGD
            The created SGD optimizer.

        Examples
        --------
        >>> config = SGDConfig(lr=0.1, weight_decay=0.5, momentum=0.9)
        >>> optimizer = config.init_optimizer(MyModel())
        """
        options = self.to_dict()
        return SGD(get_optim_parameters(model), **options)
@configclass
class AdamWConfig(OptimizerArgs):
    """
    Argument configuration for the ``AdamW`` optimizer.

    Its ``init_optimizer()`` method builds and returns an ``AdamW``
    optimizer from the values stored in this configuration.

    Attributes
    ----------
    betas : Tuple[float, float]
        Coefficients for computing running averages of the gradient and its
        square, by default ``(0.9, 0.999)``.
    eps : float
        Term added to the denominator to improve numerical stability, by
        default ``1e-8``.
    weight_decay : float
        Weight decay rate, by default ``0.01``.

    Examples
    --------
    >>> config = AdamWConfig(lr=0.1, weight_decay=0.5, betas=(0.9,0.99))
    """

    betas: Tuple[float, float] = (0.9, 0.999)
    eps: float = 1e-8
    weight_decay: float = 0.01

    def init_optimizer(self, model: nn.Module) -> AdamW:
        """
        Create an ``AdamW`` optimizer for the parameters of ``model``.

        The parameters are obtained through ``get_optim_parameters`` and the
        fields of this configuration (as returned by ``to_dict()``) are
        forwarded to ``AdamW`` as keyword arguments.

        Parameters
        ----------
        model : nn.Module
            The model whose parameters the optimizer will optimize.

        Returns
        -------
        AdamW
            An instance of the ``AdamW`` optimizer.

        Examples
        --------
        >>> config = AdamWConfig(lr=0.1, weight_decay=0.5, betas=(0.9,0.99), eps=0.001)
        >>> optimizer = config.init_optimizer(MyModel())
        """
        options = self.to_dict()
        return AdamW(get_optim_parameters(model), **options)
@configclass
class AdamConfig(OptimizerArgs):
    """
    Argument configuration for the ``Adam`` optimizer.

    Its ``init_optimizer()`` method builds and returns an ``Adam`` optimizer
    from the values stored in this configuration.

    Attributes
    ----------
    betas : Tuple[float, float]
        Coefficients for computing running averages of the gradient and its
        square, by default ``(0.9, 0.999)``.
    weight_decay : float
        Weight decay rate, by default ``0.0``.
    """

    betas: Tuple[float, float] = (0.9, 0.999)
    weight_decay: float = 0.0

    def init_optimizer(self, model: nn.Module) -> Adam:
        """
        Create an ``Adam`` optimizer for the parameters of ``model``.

        The parameters are obtained through ``get_optim_parameters`` and the
        fields of this configuration (as returned by ``to_dict()``) are
        forwarded to ``Adam`` as keyword arguments.

        Parameters
        ----------
        model : nn.Module
            The model whose parameters the optimizer will optimize.

        Returns
        -------
        Adam
            An instance of the ``Adam`` optimizer.

        Examples
        --------
        >>> config = AdamConfig(lr=0.1, weight_decay=0.5, betas=(0.6,0.9))
        >>> optimizer = config.init_optimizer(MyModel())
        """
        options = self.to_dict()
        return Adam(get_optim_parameters(model), **options)
# Registry mapping an optimizer name to its argument configuration class.
# ``OptimizerConfig.__init__`` looks the name up here to decide which class
# to instantiate from the user-supplied ``arguments`` dictionary.
OPTIMIZER_CONFIG_MAP: dict[str, type] = {
    "adamw": AdamWConfig,
    "adam": AdamConfig,
    "sgd": SGDConfig,
}