Source code for ablator.analysis.main

import logging
from pathlib import Path

import pandas as pd
from joblib import Memory

from ablator.analysis.plot.utils import parse_name_remap
from ablator.analysis.results import Results
from ablator.config.proto import Optim

logger = logging.getLogger(__name__)


def _parse_results(
    results: pd.DataFrame | Results,
    categorical_attributes: list[str] | None = None,
    numerical_attributes: list[str] | None = None,
    optim_metrics: dict[str, Optim] | None = None,
) -> tuple[pd.DataFrame, list[str], list[str], dict[str, Optim]]:
    _categorical_attributes = None
    _numerical_attributes = None
    _optim_metrics = None
    if isinstance(results, Results):
        df = results.data
        _categorical_attributes = results.categorical_attributes
        _numerical_attributes = results.numerical_attributes
        _optim_metrics = results.metric_map
    elif isinstance(results, pd.DataFrame):
        df = results
    else:
        raise ValueError(f"Invalid value {type(results)}")
    if categorical_attributes is not None:
        _categorical_attributes = categorical_attributes
    if numerical_attributes is not None:
        _numerical_attributes = numerical_attributes
    if optim_metrics is not None:
        _optim_metrics = optim_metrics

    if _categorical_attributes is None:
        raise ValueError(
            "Must provide `categorical_attributes` when supplying a DataFrame. Otherwise provide a ``Results`` object. "
        )
    if _numerical_attributes is None:
        raise ValueError(
            "Must provide `_numerical_attributes` when supplying a DataFrame. Otherwise provide a ``Results`` object. "
        )
    if _optim_metrics is None:
        raise ValueError(
            "Missing `optim_metrics` or unable to derive from supplied results."
        )
    return df, _categorical_attributes, _numerical_attributes, _optim_metrics


[docs]class Analysis:
    """
    A class that stores and processes the attributes, metrics, and other data for the plotting
    of the experiment result.

    Parameters
    ----------
    results : pd.DataFrame | Results
        The result dataframe.
    categorical_attributes : list[str] | None
        The list of all the categorical hyperparameter names, by default ``None``.
    numerical_attributes : list[str] | None
        The list of all the numerical hyperparameter names, by default ``None``.
    optim_metrics : dict[str, Optim] | None
        A dictionary mapping metric names to optimization directions, by default ``None``.
    save_dir : str | None
        The directory to save analysis results to, by default ``None``.
    cache : bool
        Whether to cache results, by default ``False``.

    Attributes
    ----------
    optim_metrics : dict[str, Optim]
        A dictionary mapping metric names to optimization directions.
    save_dir : Path | None
        The directory to save analysis results to.
    cache : Memory | None
        A joblib memory cache for saving results.
    categorical_attributes : list[str]
        The list of all the categorical hyperparameter names
    numerical_attributes : list[str]
        The list of all the numerical hyperparameter names
    experiment_attributes : list[str]
        The list of all the hyperparameter names
    results : pd.DataFrame
        The dataframe extracted from the results file based on given metrics names and hyperparameter names.

    Raises
    ------
    FileNotFoundError
        if the provided ``save_dir`` to save plots don't exists.
    ValueError
        if ``cache`` is ``True`` but no ``save_dir`` is provided.
    """

    def __init__(
        self,
        results: pd.DataFrame | Results,
        categorical_attributes: list[str] | None = None,
        numerical_attributes: list[str] | None = None,
        optim_metrics: dict[str, Optim] | None = None,
        save_dir: str | None = None,
        cache: bool = False,
    ) -> None:
        (
            df,
            categorical_attributes,
            numerical_attributes,
            optim_metrics,
        ) = _parse_results(
            results,
            categorical_attributes=categorical_attributes,
            numerical_attributes=numerical_attributes,
            optim_metrics=optim_metrics,
        )

        self.categorical_attributes: list[str] = categorical_attributes
        self.numerical_attributes: list[str] = numerical_attributes
        self.optim_metrics = optim_metrics
        self.save_dir: Path | None = None
        self.cache: Memory | None = None
        if save_dir is not None:
            self.save_dir = Path(save_dir)
            if not self.save_dir.parent.exists():
                raise FileNotFoundError(
                    f"Save directory does not exist. `{self.save_dir.parent}`"
                )
            self.save_dir.mkdir(exist_ok=True)
            self.cache = Memory(Path(save_dir).joinpath(".cache"), verbose=0)
            if not cache:
                self.cache.clear()
                self.cache = None
        elif cache:
            raise ValueError("Must provide a `save_dir` when specifying `cache=True`.")
        self.experiment_attributes: list[str] = (
            self.categorical_attributes + self.numerical_attributes
        )

        self.results: pd.DataFrame = df[
            self.experiment_attributes + list(self.optim_metrics.keys())
        ]

    @property
    def metric_names(self) -> list[str]:
        """
        Returns
        -------
        list[str]
            list of all the metrics that will be plotted w.r.t hyperparameters.

        Examples
        --------
        >>> Make PlotAnalysis's object
        plots = Analysis(
            ...
            optim_metrics={"val_loss": Optim.min, "train_loss": Optim.min},
        )
        metrics = plots.metric_names
        >>> returns
        ['val_loss', 'train_loss']
        """
        return list(self.optim_metrics.keys())

    @classmethod
    def _get_best_results_by_metric(
        cls,
        raw_results: pd.DataFrame,
        metric_map: dict[str, Optim],
    ) -> pd.DataFrame:
        def _best_perf(row: pd.DataFrame, name, obj_fn):
            if Optim(obj_fn) == Optim.min:
                return row.sort_values(name, na_position="last").iloc[0]
            return row.sort_values(name, na_position="first").iloc[-1]

        _ress = []
        for name, obj_fn in metric_map.items():
            res = (
                raw_results.groupby("trial_uid")
                .apply(lambda x, name=name, obj_fn=obj_fn: _best_perf(x, name, obj_fn))
                .reset_index(drop=True)
            )
            res["best"] = name
            _ress.append(res)
        report_results = pd.concat(_ress).reset_index(drop=True)

        return report_results

    @classmethod
    def _remap_results(
        cls,
        attributes: pd.DataFrame,
        metrics: pd.DataFrame,
        metric_map: dict[str, Optim],
        metric_name_remap: dict[str, str] | None = None,
        attribute_name_remap: dict[str, str] | None = None,
    ) -> tuple[pd.DataFrame, pd.DataFrame, dict[str, Optim]]:
        """
        Remaps attribute and metric names in ``attributes`` and ``metrics`` DataFrames
        based on ``attribute_name_remap`` and ``metric_name_remap``, and updates ``metric_map``
        accordingly.

        Parameters
        ----------
        attributes : pd.DataFrame
            The DataFrame containing attribute values for each experiment.
        metrics : pd.DataFrame
            The DataFrame containing metric values for each experiment.
        metric_map : dict[str, Optim]
            A dictionary mapping metric names to optimization objectives (minimization or maximization).
        metric_name_remap : dict[str, str] | None
            A dictionary mapping input metric names to output metric names.
            If None, the output metric names will be the same as the input metric names.
        attribute_name_remap : dict[str, str] | None
            A dictionary mapping input attribute names to output attribute names.
            If None, the output attribute names will be the same as the input attribute names.

        Returns
        -------
        tuple[pd.DataFrame, pd.DataFrame, dict[str, Optim]]
            The remapped ``attributes`` DataFrame, the remapped ``metrics`` DataFrame,
            and the updated ``metric_map`` dictionary.

        Examples
        --------
        >>> import pandas as pd
        >>> from enum import Enum
        >>> class Optim(Enum):
        ...     min = "min"
        ...     max = "max"
        ...
        >>> attributes = pd.DataFrame({"color": ["red", "blue"], "size": [10, 20]})
        >>> metrics = pd.DataFrame({"loss": [0.5, 0.4], "accuracy": [0.8, 0.9]})
        >>> metric_map = {"loss": Optim.min, "accuracy": Optim.max}
        >>> metric_name_remap = {"loss": "error", "accuracy": "acc"}
        >>> attribute_name_remap = {"color": "c", "size": "s"}
        >>> remapped_attrs, remapped_metrics, updated_map = Analysis._remap_results(
        ...     attributes, metrics, metric_map,
        ...     metric_name_remap=metric_name_remap,
        ...     attribute_name_remap=attribute_name_remap
        ... )
        >>> assert list(remapped_attrs.columns) == ["c", "s"]
        >>> assert list(remapped_metrics.columns) == ["error", "acc"]
        >>> assert updated_map == {"error": Optim.min, "acc": Optim.max}
        """
        metric_name_remap = parse_name_remap(metrics.columns, metric_name_remap)
        attribute_name_remap = parse_name_remap(
            attributes.columns, attribute_name_remap
        )
        metric_map = {
            metric_name_remap[metric_name]: direction
            for metric_name, direction in metric_map.items()
            if metric_name in metric_name_remap
        }

        attributes = attributes[list(attribute_name_remap.keys())]
        metrics = metrics[list(metric_name_remap.keys())]
        attributes.columns = [attribute_name_remap[c] for c in attributes.columns]
        metrics.columns = [metric_name_remap[c] for c in metrics.columns]
        return attributes, metrics, metric_map