Source code for ablator.main.proto

import copy
import typing as ty
from copy import deepcopy
from pathlib import Path

import git
import torch
from git import exc

from ablator.config.proto import RunConfig
from ablator.main.model.wrapper import ModelWrapper


class ProtoTrainer:
    """
    Manages resources for Prototyping. This trainer runs an experiment of a
    single prototype model (therefore no ablation study nor HPO).

    Parameters
    ----------
    wrapper : ModelWrapper
        The main model wrapper.
    run_config : RunConfig
        Running configuration for the model.

    Attributes
    ----------
    wrapper : ModelWrapper
        A deep copy of the model wrapper passed to the constructor.
    run_config : RunConfig
        A deep copy of the running configuration passed to the constructor.
    experiment_dir : Path
        The absolute, resolved path object to the experiment directory.

    Raises
    ------
    RuntimeError
        If the experiment directory is not defined in the running configuration.

    Examples
    --------
    Below is a complete workflow on how to launch a prototype experiment with
    ``ProtoTrainer``, from defining the config to launching the experiment:

    - Define training config:

    >>> my_optimizer_config = OptimizerConfig("sgd", {"lr": 0.5, "weight_decay": 0.5})
    >>> my_scheduler_config = SchedulerConfig("step", arguments={"step_size": 1, "gamma": 0.99})
    >>> train_config = TrainConfig(
    ...     dataset="[Dataset Name]",
    ...     batch_size=32,
    ...     epochs=10,
    ...     optimizer_config = my_optimizer_config,
    ...     scheduler_config = my_scheduler_config
    ... )

    - Define model config: we use the default one with no custom hyperparameters
      (sometimes you would want to customize it to run ablation study / HPO on the
      model's hyperparameters in a parallel experiment, which needs
      ``ParallelTrainer`` and ``ParallelConfig`` instead of ``ProtoTrainer`` and
      ``RunConfig``):

    >>> model_config = ModelConfig()

    - Define run config:

    >>> run_config = RunConfig(
    ...     train_config=train_config,
    ...     model_config=model_config,
    ...     metrics_n_batches = 800,
    ...     experiment_dir = "/tmp/experiments",
    ...     device="cpu",
    ...     amp=False,
    ...     random_seed = 42
    ... )

    - Create model wrapper:

    >>> class MyModelWrapper(ModelWrapper):
    ...     def __init__(self, *args, **kwargs):
    ...         super().__init__(*args, **kwargs)
    ...
    ...     def make_dataloader_train(self, run_config: RunConfig):
    ...         return torch.utils.data.DataLoader(<train_dataset>, batch_size=32, shuffle=True)
    ...
    ...     def make_dataloader_val(self, run_config: RunConfig):
    ...         return torch.utils.data.DataLoader(<val_dataset>, batch_size=32, shuffle=False)

    - After gathering all configurations and model wrapper, it's time we
      initialize and launch the prototype trainer. When launching the experiment,
      we must provide a working directory, which points to a git repository that
      is used for keeping track of the code differences:

    >>> wrapper = MyModelWrapper(
    ...     model_class=<your_ModelModule_class>,
    ... )
    >>> ablator = ProtoTrainer(
    ...     wrapper=wrapper,
    ...     run_config=run_config,
    ... )
    >>> metrics = ablator.launch(working_directory=os.getcwd())  # suppose current directory is tracked by git
    """

    def __init__(
        self,
        wrapper: ModelWrapper,
        run_config: RunConfig,
    ):
        # Initialize model wrapper and running configuration for the model.
        super().__init__()

        # Deep copies keep the trainer isolated from later mutation of the
        # objects owned by the caller.
        self.wrapper = copy.deepcopy(wrapper)
        self.run_config: RunConfig = copy.deepcopy(run_config)
        if self.run_config.experiment_dir is None:
            raise RuntimeError("Must specify an experiment directory.")
        experiment_dir = self.run_config.experiment_dir
        experiment_path = Path(experiment_dir).absolute().resolve()
        self.experiment_dir = experiment_path
        # NOTE(review): this writes back the *unresolved* value that was just
        # read, so it is effectively a no-op. Possibly the resolved path was
        # meant to be stored instead -- confirm before changing.
        self.run_config.experiment_dir = experiment_dir

    def pre_train_setup(self):
        """
        Used to prepare resources to avoid stalling during training or when
        resources are shared between trainers.

        The base implementation does nothing; it is called by ``launch``
        before the wrapper state is initialized.
        """

    def _mount(self):
        # TODO
        # mount experiment directory
        # https://rclone.org/commands/rclone_mount/
        pass

    def _get_diffs(self, working_dir: str = "") -> str:
        """
        Describe the uncommitted changes of the git repository at ``working_dir``.

        Parameters
        ----------
        working_dir : str, optional
            Path to the directory expected to be a git repository, by default
            ``""`` (which resolves to the current working directory).

        Returns
        -------
        str
            A message with the current reference (or ``detached HEAD``), the
            HEAD commit and the diff against that commit's tree; or a
            recommendation message when ``working_dir`` is not a git repository.

        Raises
        ------
        RuntimeError
            If a ``ValueError`` is raised while reading the repository.
        FileNotFoundError
            If ``working_dir`` does not exist.
        """
        try:
            repo_path = Path(working_dir).resolve().absolute().as_posix()
            # The context manager closes the repository handle once the
            # diff has been collected.
            with git.Repo(repo_path) as repo:
                head = repo.head
                commit = head.commit
                diffs = repo.git.diff(commit.tree)
                # ``head.ref`` raises TypeError when HEAD is detached (e.g. a
                # CI checkout of a specific commit), so only read it when HEAD
                # points to a branch.
                ref_name = "detached HEAD" if head.is_detached else head.ref
                return f"Git Diffs for {ref_name} @ {commit}: \n{diffs}"
        except ValueError as e:
            raise RuntimeError(
                f"Could not parse repo at {working_dir}. Error: {str(e)}"
            ) from e
        except exc.NoSuchPathError as e:
            raise FileNotFoundError(f"Directory {working_dir} was not found. ") from e
        except exc.InvalidGitRepositoryError:
            # Not being inside a git repository is tolerated: report it in the
            # returned message instead of failing the launch.
            return (
                f"No git repository was detected at {working_dir}. "
                "We recommend setting the working directory to a git repository "
                "to keep track of changes."
            )

    def launch(self, working_directory: str, debug: bool = False) -> dict[str, float]:
        """
        Launch the prototype experiment (train, evaluate the single prototype
        model) and return metrics.

        Parameters
        ----------
        working_directory : str
            The working directory points to a git repository that is used for
            keeping track of the code differences. If it is not a git
            repository, a recommendation message is logged instead of the diff.
        debug : bool, optional
            Whether to train models in debug mode, by default ``False``.

        Returns
        -------
        metrics : dict[str, float]
            Metrics returned after training.

        Raises
        ------
        FileNotFoundError
            If ``working_directory`` does not exist.
        RuntimeError
            If the git repository at ``working_directory`` could not be parsed.
        """
        self._mount()
        self.pre_train_setup()
        self.wrapper.init_state(
            run_config=self.run_config, smoke_test=False, debug=debug, resume=False
        )
        # The diff is collected after ``init_state`` because it is written
        # through ``self.wrapper.logger``.
        diffs = self._get_diffs(working_directory)
        self.wrapper.logger.info(diffs)
        metrics = self.wrapper.train(debug=debug)
        return metrics

    def evaluate(self) -> dict[str, dict[str, ty.Any]]:
        """
        Run model evaluation by delegating to ``self.wrapper.evaluate`` with
        the trainer's running configuration.

        Returns
        -------
        metrics : dict[str, dict[str, ty.Any]]
            Metrics returned after evaluation.
        """
        # TODO load model if it is un-trained
        metrics = self.wrapper.evaluate(self.run_config)
        return metrics

    def smoke_test(self, config: RunConfig | None = None) -> bool:
        """
        Run a smoke test training process on the model.

        A fresh wrapper of the same type as ``self.wrapper`` is built from
        ``self.wrapper.model_class`` and trained on a deep copy of the
        configuration, so neither the trainer's wrapper nor the configuration
        passed in is touched.

        Parameters
        ----------
        config : RunConfig | None
            Running configuration for the model. When ``None``, the trainer's
            own ``run_config`` is used.

        Returns
        -------
        bool
            ``True`` if the smoke test finished without raising.

        Examples
        --------
        >>> try:
        ...     ablator.smoke_test(run_config)
        ... except Exception as err:
        ...     raise err
        """
        if config is None:
            config = self.run_config
        run_config = deepcopy(config)
        wrapper = type(self.wrapper)(self.wrapper.model_class)
        try:
            # NOTE(review): ``launch`` calls ``init_state(...)`` followed by
            # ``train(debug=...)``, whereas this call passes ``run_config`` and
            # ``smoke_test`` straight to ``train`` -- confirm against the
            # ``ModelWrapper.train`` signature.
            wrapper.train(run_config=run_config, smoke_test=True)
        finally:
            # Drop the temporary wrapper and release cached CUDA memory even
            # when the smoke test fails.
            del wrapper
            torch.cuda.empty_cache()
        return True