Revert "[reland2][dynamo] Revert "Revert "[reland][dynamo] use optimizers correctly in benchmar… (#90956)"

This reverts commit 8bc38ae4e2.

Reverted https://github.com/pytorch/pytorch/pull/90956 on behalf of https://github.com/desertfire due to Causing TIMM model failures
PyTorch MergeBot 2022-12-16 19:28:05 +00:00
parent 8cd1808dbf
commit 6bc6fb21db
5 changed files with 27 additions and 28 deletions

@@ -7,7 +7,7 @@ The runner integrates with models from TorchBenchmark, HuggingFace and TIMM suit
The infrastructure allows us to specify a loss function. For torchbench models, we use a .sum().backward() call in place of the native loss function. For TIMM models, we use a CrossEntropy loss. HF models contain a loss function inside the model itself, so we don't need any special loss computation handling.
Training benchmarks approximate training by running the model forward, computing loss, running backward, and then the optimizer (SGD). Note: the optimizer is currently not compiled by Torchdynamo.
Training benchmarks approximate training by running the model forward, computing loss and then running backward. We entirely skip the optimizer step today.
Inference benchmarks and Training benchmarks measure correctness by comparing dynamo and eager model outputs given fixed inputs and seeds.
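The restored wording above maps to a simple per-iteration loop. Below is a minimal, hedged sketch of that behavior; the function name, its arguments, and the dummy CrossEntropy target are illustrative and not the benchmark runner's actual code:

```python
import torch

def training_iteration(model, example_inputs, suite):
    # Forward pass on the prepared example inputs.
    pred = model(*example_inputs)
    if suite == "torchbench":
        # .sum() stands in for a native loss function.
        loss = pred.sum()
    elif suite == "timm":
        # CrossEntropy against a dummy target, purely for illustration.
        target = torch.zeros(pred.shape[0], dtype=torch.long, device=pred.device)
        loss = torch.nn.functional.cross_entropy(pred, target)
    else:
        # HuggingFace models return their own loss.
        loss = pred.loss
    # Backward pass; per the text above, the optimizer step is skipped entirely.
    loss.backward()
```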

@@ -128,12 +128,6 @@ CI_SKIP_INDUCTOR_TRAINING = [
]
CI_SKIP_OPTIMIZER = {
# TIMM
"convmixer_768_32", # accuracy
}
def model_specified_by_path(path_and_class_str):
return ":" in path_and_class_str
@@ -878,7 +872,6 @@ class BenchmarkRunner:
self.use_amp = False
self.grad_scaler = DummyGradScaler()
self.autocast = NullContext
self.optimizer = None
self._args = None
def setup_amp(self):
@@ -907,11 +900,16 @@
# self.grad_scaler = torch.cuda.amp.GradScaler(init_scale=2.0)
self.autocast = torch.cuda.amp.autocast
def init_optimizer(self, name, device, params):
if device == "cuda" and self.args.training and name not in CI_SKIP_OPTIMIZER:
self.optimizer = torch.optim.SGD(params, lr=0.01)
else:
self.optimizer = None
def init_optimizer(self, device, params):
self.optimizer = None
# TODO - Currently, optimizers are used incorrectly. Fix optimizers with
# https://github.com/pytorch/pytorch/pull/87492
# param_list = list(params)
# if device == "cuda" and len(param_list) != 0:
# # capturable is only supported on cuda at the moment
# self.optimizer = torch.optim.Adam(param_list, capturable=True)
# else:
# self.optimizer = None
@property
def args(self):
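The commented-out TODO in the hunk above points at re-enabling a CUDA-graph-friendly optimizer via https://github.com/pytorch/pytorch/pull/87492. As a purely hypothetical sketch of that path (written here as a free function that returns the optimizer, unlike the method in the diff):

```python
import torch

def init_optimizer(device, params):
    param_list = list(params)
    if device == "cuda" and len(param_list) != 0:
        # capturable=True keeps Adam's step state in device tensors, which is
        # what makes the step safe to capture; it is only supported on CUDA.
        return torch.optim.Adam(param_list, capturable=True)
    return None
```

Once such an optimizer exists, the iteration loop would presumably call `optimizer.zero_grad(set_to_none=True)`, `loss.backward()`, and `optimizer.step()`, but that wiring is not part of this diff.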
@@ -1094,12 +1092,12 @@ class BenchmarkRunner:
# Collect the fp64 reference outputs to be used later for accuracy checking.
fp64_outputs = None
try:
model_fp64, inputs_fp64 = cast_to_fp64(
deepcopy_and_maybe_ddp(model),
clone_inputs(example_inputs),
fp64_outputs = self.run_n_iterations(
*cast_to_fp64(
deepcopy_and_maybe_ddp(model),
clone_inputs(example_inputs),
)
)
self.init_optimizer(name, current_device, model_fp64.parameters())
fp64_outputs = self.run_n_iterations(model_fp64, inputs_fp64)
except Exception:
log.warning(
f"fp64 golden ref were not generated for {name}. Setting accuracy check to cosine"
@@ -1120,18 +1118,14 @@
with self.pick_grad(name, self.args.training):
# Get results of native pytorch
reset_rng_state()
model_copy = deepcopy_and_maybe_ddp(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs)
deepcopy_and_maybe_ddp(model), clone_inputs(example_inputs)
)
# Rerun native pytorch
reset_rng_state()
model_copy = deepcopy_and_maybe_ddp(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs)
deepcopy_and_maybe_ddp(model), clone_inputs(example_inputs)
)
if not same(
correct_result,
@@ -1147,11 +1141,11 @@
reset_rng_state()
torch._dynamo.reset()
try:
model_copy = deepcopy_and_maybe_ddp(model)
self.init_optimizer(name, current_device, model_copy.parameters())
optimized_model_iter_fn = optimize_ctx(self.run_n_iterations)
new_result = optimized_model_iter_fn(model_copy, example_inputs)
new_result = optimized_model_iter_fn(
deepcopy_and_maybe_ddp(model), example_inputs
)
except Exception as e:
accuracy_status = "fail_to_run"
print(
@@ -1199,7 +1193,6 @@ class BenchmarkRunner:
# Cast the model to float16/float32 as necessary
model, example_inputs = self.maybe_cast(model, example_inputs)
self.init_optimizer(name, current_device, model.parameters())
with self.pick_grad(name, self.args.training):
ok, total = Stats.reset_counters()
experiment_kwargs = {}
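Pulling the two accuracy hunks above together: eager is run twice from the same RNG state to confirm the model is deterministic enough to compare, and only then is the dynamo-wrapped iteration function run and checked with same(). A condensed sketch follows; the helpers are taken as parameters because this approximates, rather than reproduces, the runner's methods, and the status strings are illustrative:

```python
import copy
import torch

def check_accuracy(model, example_inputs, run_n_iterations, optimize_ctx,
                   reset_rng_state, same):
    # First eager run: the reference result.
    reset_rng_state()
    correct = run_n_iterations(copy.deepcopy(model), copy.deepcopy(example_inputs))

    # Second eager run: if eager does not match itself, the comparison is moot.
    reset_rng_state()
    rerun = run_n_iterations(copy.deepcopy(model), copy.deepcopy(example_inputs))
    if not same(correct, rerun):
        return "eager_two_runs_differ"

    # Compiled run: wrap the iteration function in the dynamo backend context
    # and compare its outputs against the eager reference.
    reset_rng_state()
    torch._dynamo.reset()
    try:
        optimized_iter_fn = optimize_ctx(run_n_iterations)
        new_result = optimized_iter_fn(copy.deepcopy(model), example_inputs)
    except Exception:
        return "fail_to_run"
    return "pass" if same(correct, new_result) else "fail_accuracy"
```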

@@ -433,6 +433,8 @@ class HuggingfaceRunner(BenchmarkRunner):
else:
model.eval()
self.init_optimizer(device, model.parameters())
self.validate_model(model, example_inputs)
return device, model_name, model, example_inputs, batch_size

@@ -261,6 +261,8 @@ class TimmRunnner(BenchmarkRunner):
else:
model.eval()
self.init_optimizer(device, model.parameters())
self.validate_model(model, example_inputs)
return device, model_name, model, example_inputs, batch_size

@@ -295,6 +295,8 @@ class TorchBenchmarkRunner(BenchmarkRunner):
gc.collect()
batch_size = benchmark.batch_size
self.init_optimizer(device, model.parameters())
# Torchbench has quite different setup for yolov3, so directly passing
# the right example_inputs
if model_name == "yolov3":