pytorch/torchgen/autoheuristic/train.py
Alnis Murtovi 7f1cda1533 Autoheuristic: Do not store choices as metadata (#130304)
While optimizations like pad_mm always have exactly two possible choices, for other decision procedures, such as kernel choice selection, the set of available choices depends on the input. Instead of storing the choices as metadata, we can look at all choices for which data has been collected (i.e. `df[CHOICE_COL].unique()`).
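
A minimal sketch of the idea (toy data; the choice names here are hypothetical):

```python
import pandas as pd

from torch._inductor.autoheuristic.autoheuristic_utils import CHOICE_COL

# The set of available choices simply falls out of the collected data itself
df = pd.DataFrame({CHOICE_COL: ["pad", "orig", "pad", "orig"]})
choices = df[CHOICE_COL].unique().tolist()  # ["pad", "orig"]
```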

In this PR, I also replace the string literals "choice" and "feedback" with the global constants CHOICE_COL and FEEDBACK_COL.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130304
Approved by: https://github.com/eellison
2024-07-18 21:39:42 +00:00


# mypy: ignore-errors
import argparse
import sys
import warnings
import numpy as np
import pandas as pd # type: ignore[import-untyped]
from scipy.stats import gmean # type: ignore[import-untyped]
from sklearn.model_selection import train_test_split # type: ignore[import-untyped]
from sklearn.tree import DecisionTreeRegressor # type: ignore[import-untyped]
from torch._inductor.autoheuristic.autoheuristic import deserialize_data
from torch._inductor.autoheuristic.autoheuristic_utils import CHOICE_COL, FEEDBACK_COL
# TODO (AlnisM): Fix these warnings
warnings.filterwarnings(
"ignore",
message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
)
warnings.filterwarnings(
"ignore",
message="DataFrameGroupBy.apply operated on the grouping columns.",
)
class AHTrain:
"""
    This class generates a heuristic from data collected with AutoHeuristic. It learns a regression tree that
    predicts, for a given input, a score representing how well a specific choice will perform; a higher score
    means a better choice. The heuristic is written to a file named _<heuristic_name>.py in the
    torch/_inductor/autoheuristic/artifacts/ directory.
"""
def __init__(self):
self.parser = argparse.ArgumentParser()
self.add_base_arguments()
self.args = None
def add_base_arguments(self):
self.parser.add_argument(
"dataset",
type=str,
help="Path to text file containing data collected with AutoHeuristic.",
)
self.parser.add_argument(
"--nrows",
type=int,
default=None,
help="Only read first n rows of the dataset.",
)
self.parser.add_argument(
"--heuristic-name",
type=str,
default="learned_heuristic",
help="Name of the heuristic to be generated.",
)
self.parser.add_argument(
"--data",
nargs=2,
action="append",
metavar=("TYPE", "PATH"),
help="Specify name of datasets and file paths to be evaluated.",
)
def parse_args(self):
return self.parser.parse_args()
def generate_heuristic(self):
self.args = self.parse_args()
self.main(
self.args.dataset, self.args.data, self.args.nrows, self.args.heuristic_name
)
def main(self, log_path, other_datasets, nrows, heuristic_name):
(df, choices, cat_feature2cats, dummy_col_2_col_val, metadata) = self.get_df(
log_path, nrows=nrows, apply_filters=True
)
df_train, df_val, df_test, feature_columns = self.custom_train_test_split(df)
datasets = {"train": df_train, "val": df_val, "test": df_test}
self.add_real_datasets(datasets, other_datasets, cat_feature2cats)
        # Grid search over these hyperparameter values to find the best regression tree
max_depths = [5, 10, 13, 15, 17, 20, 23, None]
min_samples_leafs = [1, 2, 5, 10]
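        # get_df() one-hot encoded the choice column, so a hypothetical choice
        # "pad" becomes a column "choice_pad" (assuming CHOICE_COL == "choice");
        # reconstruct those column names here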
choice_columns = [f"{CHOICE_COL}_{choice}" for choice in choices]
(results_df, best_model, threshold) = self.train_and_evaluate_models(
datasets, feature_columns, choice_columns, max_depths, min_samples_leafs
)
# prints results for all models and datasets
print(results_df.to_string())
# prints results grouped by dataset
for set_name in results_df["dataset"].unique():
dataset_results = results_df[results_df["dataset"] == set_name]
dataset_results = dataset_results.sort_values(by="correct")
print(dataset_results.to_string() + "\n")
feature_names = feature_columns + choice_columns
self.dt_to_python(
best_model,
metadata,
feature_names,
dummy_col_2_col_val,
heuristic_name,
threshold,
)
def filter_df(self, df):
return df
def get_df(self, log_path, cat_feature2cats=None, nrows=None, apply_filters=False):
(df, metadata) = deserialize_data(log_path)
numerical_features = metadata["numerical_features"]
categorical_features = metadata["categorical_features"]
choices = df[CHOICE_COL].unique().tolist()
features = numerical_features + categorical_features
if nrows is not None:
df = df.head(nrows)
df = self.filter_df(df)
feature_columns = features
def process_data(
df,
feature_columns,
apply_filters,
min_count_measurements=3,
max_relative_std=5,
):
# Calculate statistics for each input and choice combination
def calculate_stats(group):
count = len(group)
mean = group[FEEDBACK_COL].mean()
std = group[FEEDBACK_COL].std()
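                # relative_std is the coefficient of variation in percent;
                # it is used below to drop noisy (input, choice) groups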
relative_std = (std / mean) * 100 if mean != 0 else np.inf
median = group[FEEDBACK_COL].median()
return pd.Series(
{
"count": count,
"median_execution_time": median,
"relative_std": relative_std,
}
)
stats = (
df.groupby(feature_columns + [CHOICE_COL])
.apply(calculate_stats)
.reset_index()
)
if apply_filters:
                # Remove unstable measurements
valid_stats = stats[
(stats["count"] >= min_count_measurements)
& (stats["relative_std"] <= max_relative_std)
]
# Keep only inputs with at least two valid choices
valid_inputs = valid_stats.groupby(feature_columns).filter(
lambda x: len(x) >= 2
)
else:
valid_inputs = stats
# Compute the winner and ratios for each input
def get_winner_and_speedups(group):
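                # winner: the choice with the lowest median execution time for this input
                # speedup: worst median time / best median time across all choices
                # target: the regression label, mean time / this choice's time,
                # so values > 1.0 mean the choice is faster than the average choice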
mean_time = group["median_execution_time"].mean()
winner = group.loc[group["median_execution_time"].idxmin(), CHOICE_COL]
min_time = group["median_execution_time"].min()
max_time = group["median_execution_time"].max()
group["winner"] = winner
group["speedup"] = max_time / min_time
group["target"] = mean_time / group["median_execution_time"]
return group[
feature_columns + [CHOICE_COL, "winner", "speedup", "target"]
]
results = (
valid_inputs.groupby(feature_columns)
.apply(get_winner_and_speedups)
.reset_index(drop=True)
)
return results
results = process_data(df, feature_columns, apply_filters)
(results, added_categorical_features) = self.add_new_features(results)
categorical_features += added_categorical_features
categorical_features += [CHOICE_COL]
        # Doing this here because if we create another df for testing purposes
        # and that other df does not contain all categories for a categorical feature,
        # pd.get_dummies will not create columns for the missing categories
if not cat_feature2cats:
cat_feature2cats = {}
for cat_feature in categorical_features:
if cat_feature in cat_feature2cats:
categories = cat_feature2cats[cat_feature]
else:
categories = results[cat_feature].unique()
cat_feature2cats[cat_feature] = categories
results[cat_feature] = pd.Categorical(
results[cat_feature], categories=categories
)
dummy_col_2_col_val = {}
for col in categorical_features:
unique_vals = results[col].unique()
for val in unique_vals:
dummy_col_2_col_val[f"{col}_{val}"] = (col, val)
# one-hot encode categorical features
results = pd.get_dummies(results, columns=categorical_features)
return (results, choices, cat_feature2cats, dummy_col_2_col_val, metadata)
def custom_train_test_split(
self, df, test_size=0.2, val_size=0.25, random_state=42
):
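        # With the defaults, inputs are split roughly 60/20/20: 20% of inputs
        # go to test, then 25% of the remaining 80% (i.e. 20% overall) go to val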
        # We want to make sure that rows with the same input but different choices are kept in the same split
exclude_columns = ["speedup", "winner", "target"]
feature_columns = [
col
for col in df.columns
if col not in exclude_columns and not col.startswith(CHOICE_COL + "_")
]
df["input_id"] = df.groupby(feature_columns).ngroup()
# Get unique input IDs
unique_inputs = df["input_id"].unique()
# Split unique inputs into train+val and test
train_val_inputs, test_inputs = train_test_split(
unique_inputs, test_size=test_size, random_state=random_state
)
# Split train+val inputs into train and val
train_inputs, val_inputs = train_test_split(
train_val_inputs, test_size=val_size, random_state=random_state
)
# Create masks for each set
train_mask = df["input_id"].isin(train_inputs)
val_mask = df["input_id"].isin(val_inputs)
test_mask = df["input_id"].isin(test_inputs)
# Split the dataframe
df_train = df[train_mask]
df_val = df[val_mask]
df_test = df[test_mask]
# Remove the temporary input_id column
df_train = df_train.drop("input_id", axis=1)
df_val = df_val.drop("input_id", axis=1)
df_test = df_test.drop("input_id", axis=1)
return df_train, df_val, df_test, feature_columns
def train_and_evaluate_models(
self,
datasets,
feature_columns,
choice_columns,
max_depths,
min_samples_leafs,
threshold=0.99,
):
results = []
df_train = datasets["train"]
df_val = datasets["val"]
best_model = None
best_model_threshold = 0
max_correct_predictions = -1
for max_depth in max_depths:
for min_samples_leaf in min_samples_leafs:
print(
f"Evaluating max_depth={max_depth}, min_samples_leaf={min_samples_leaf}"
)
model = DecisionTreeRegressor(
random_state=42,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
)
model.fit(
df_train[feature_columns + choice_columns], df_train["target"]
)
# we first compute a safe threshold: this threshold ensures that on the validation set,
# if the heuristic returns a choice, the choice will be correct, although a high threshold
# can lead to a lot of 'unsure' choices
eval_result = self.evaluate_model(
model, df_val, feature_columns, choice_columns, threshold
)
safe_threshold = eval_result["wrong_max_ratio"]
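                # safe_threshold is the largest best/second-best score ratio seen
                # among wrong predictions on the validation set; demanding a ratio
                # above it turns all of those validation mistakes into "unsure"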
for dataset_name, dataset in datasets.items():
eval_result = self.evaluate_model(
model, dataset, feature_columns, choice_columns, safe_threshold
)
print(eval_result)
if dataset_name == "val":
eval_correct = eval_result["correct"]
if eval_correct > max_correct_predictions:
best_model = model
best_model_threshold = safe_threshold
max_correct_predictions = eval_correct
results.append(
{
"max_depth": max_depth,
"min_samples_leaf": min_samples_leaf,
"dataset": dataset_name,
"correct": eval_result["correct"],
"wrong": eval_result["wrong"],
"unsure": eval_result["unsure"],
"total": eval_result["total"],
"max_wrong_speedup": eval_result["max_wrong_speedup"],
"gman_wrong_speedup": eval_result["gman_wrong_speedup"],
"threshold": safe_threshold,
}
)
return (pd.DataFrame(results), best_model, best_model_threshold)
def evaluate_model(self, model, df, feature_columns, choice_columns, threshold):
def predict_winner(group):
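            # Each row of the group is the same input paired with a different
            # one-hot encoded choice; the model scores every row, and the
            # highest-scoring row determines the predicted choice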
predictions = model.predict(group[feature_columns + choice_columns])
# Find the index of the maximum prediction (best choice)
best_choice_index = np.argmax(predictions)
# Get the corresponding choice
predicted_choice = (
group[choice_columns].iloc[best_choice_index].idxmax().split("_")[-1]
)
# Calculate the ratio between the best and second-best prediction
sorted_predictions = np.sort(predictions)[::-1]
top_pred_ratio = (
sorted_predictions[0] / sorted_predictions[1]
if len(sorted_predictions) > 1
else np.inf
)
# If the best choice is not "significantly" better than the second best choice,
# the learned heuristic will return "unsure"
if top_pred_ratio <= threshold:
predicted_winner = "unsure"
else:
predicted_winner = predicted_choice
actual_winner = group["winner"].iloc[0]
is_correct = (
predicted_winner == actual_winner
if predicted_winner != "unsure"
else "unsure"
)
return pd.Series(
{
"predicted_winner": predicted_winner,
"ratio": top_pred_ratio,
"actual_winner": actual_winner,
"is_correct": is_correct,
"speedup": group["speedup"].iloc[
0
], # Speedup is the same for all rows in the group
}
)
results = df.groupby(feature_columns).apply(predict_winner).reset_index()
correct = (results["is_correct"].eq(True)).sum()
unsure = (results["is_correct"] == "unsure").sum()
wrong_results = results[results["is_correct"].eq(False)]
wrong = len(wrong_results)
# Calculate max and geometric mean of speedup for wrong predictions
# Used for debugging purposes
wrong_speedups = wrong_results["speedup"]
max_wrong_speedup = wrong_speedups.max() if not wrong_speedups.empty else np.nan
geo_mean_wrong_speedup = (
gmean(wrong_speedups) if not wrong_speedups.empty else np.nan
)
wrong_max_ratio = wrong_results["ratio"].max()
total = correct + wrong + unsure
return {
"correct": correct,
"wrong": wrong,
"unsure": unsure,
"total": total,
"max_wrong_speedup": max_wrong_speedup,
"gman_wrong_speedup": geo_mean_wrong_speedup,
"wrong_max_ratio": wrong_max_ratio,
}
def add_new_features(self, results):
return (results, [])
def codegen_boilerplate(
self, heuristic_name, opt_name, threshold, shared_memory, device_capa
):
boiler_plate = f"""# flake8: noqa: B950
from torch._inductor.autoheuristic.autoheuristic_utils import AHContext, AHMetadata, Choice, CHOICE_COL
from torch._inductor.autoheuristic.learnedheuristic_interface import (
LearnedHeuristic,
)
class {heuristic_name}(LearnedHeuristic):
def __init__(self) -> None:
pass
def check_precondition(self, metadata: AHMetadata, context: AHContext,) -> bool:
return (
metadata.name == self.get_name()
and metadata.shared_memory == {shared_memory}
and str(metadata.device_capa) == "{device_capa}"
)
def get_feedback(self, context: AHContext, choice: Choice) -> float:
context.context_dict[CHOICE_COL] = choice
return self.predict(context)
def get_speedup_threshold(self) -> float:
return {threshold}
def get_name(self) -> str:
return '{opt_name}'"""
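        # The generated class delegates get_feedback() to predict(); the body of
        # predict() is emitted from the learned regression tree by dt_to_python()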
return boiler_plate
def dt_to_python(
self,
dt,
metadata,
feature_names,
dummy_col_2_col_val,
heuristic_name,
threshold,
):
tree_ = dt.tree_
        # sklearn marks leaf nodes with feature index -2 (TREE_UNDEFINED)
        feature_name = [
            feature_names[i] if i != -2 else "undefined!" for i in tree_.feature
        ]
lines = []
device_capa = metadata["device_capa"]
device_capa_str = f"({device_capa[0]}, {device_capa[1]})"
opt_name = metadata["name"]
lines.append(
self.codegen_boilerplate(
heuristic_name,
opt_name,
threshold,
metadata["shared_memory"],
device_capa_str,
)
)
fn_def = "\n def predict(self, context: AHContext) -> float:"
lines.append(fn_def)
def dt_to_python(node, depth):
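            # Recursively walk sklearn's tree_ arrays, emitting a nested if/else
            # chain; tree_.feature[node] == -2 (sklearn's TREE_UNDEFINED) marks a
            # leaf, whose value becomes the returned score. Dummy columns are
            # translated back into string comparisons on the original feature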
indent = " " * (depth + 1)
false_predicate = ""
if tree_.feature[node] != -2:
name = feature_name[node]
threshold = tree_.threshold[node]
if name in dummy_col_2_col_val:
(orig_name, value) = dummy_col_2_col_val[name]
predicate = f"{indent}if str(context.get_value('{orig_name}')) != '{value}':"
if threshold != 0.5:
print(f"expected threshold to be 0.5 but is {threshold}")
sys.exit(1)
else:
predicate = (
f"{indent}if context.get_value('{name}') <= {threshold}:"
)
lines.append(predicate)
dt_to_python(tree_.children_left[node], depth + 1)
lines.append(f"{indent}else:")
dt_to_python(tree_.children_right[node], depth + 1)
else:
value = tree_.value[node][0][0]
lines.append(f"{indent}return {str(value)}")
dt_to_python(0, 1)
        output_file = (
            f"../../torch/_inductor/autoheuristic/artifacts/_{heuristic_name}.py"
        )
        with open(output_file, "w") as f:
f.write("\n".join(lines) + "\n")
def add_real_datasets(self, datasets, other_datasets, cat_feature2cats):
if other_datasets:
for name, path in other_datasets:
                (df_other, _, _, _, _) = self.get_df(
path, cat_feature2cats=cat_feature2cats, apply_filters=False
)
datasets[name] = df_other
if __name__ == "__main__":
train = AHTrain()
train.generate_heuristic()
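
# Example invocation (hypothetical file names):
#   python train.py ah_data.txt --heuristic-name PadMMA100 --data a100 ah_other_data.txt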