Revert "[reland2][dynamo] Revert "Revert "[reland][dynamo] use optimizers correctly in benchmar… (#90956)"

This reverts commit 8bc38ae4e2.

Reverted https://github.com/pytorch/pytorch/pull/90956 on behalf of https://github.com/desertfire due to Causing TIMM model failures
PyTorch MergeBot 2022-12-16 19:28:05 +00:00
parent 8cd1808dbf
commit 6bc6fb21db
5 changed files with 27 additions and 28 deletions

@@ -7,7 +7,7 @@ The runner integrates with models from TorchBenchmark, HuggingFace and TIMM suit
The infrastructure allows us to specify a loss function. For torchbench models, we use a .sum().backward() call in place of the native loss function. For TIMM models, we use a CrossEntropy loss. HF models contain a loss function inside the model itself, so we don't need any special loss computation handling.
Training benchmarks approximate training by running the model forward, computing loss, running backward, and then the optimizer (SGD). Note: the optimizer is currently not compiled by Torchdynamo.
Training benchmarks approximate training by running the model forward, computing loss and then running backward. We entirely skip the optimizer step today.
Inference benchmarks and Training benchmarks measure correctness by comparing dynamo and eager model outputs given fixed inputs and seeds.
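The restored wording above maps to a simple per-iteration loop. Below is a minimal, hedged sketch of that behavior; the function name, its arguments, and the dummy CrossEntropy target are illustrative and not the benchmark runner's actual code:

```python
import torch

def training_iteration(model, example_inputs, suite):
    # Forward pass on the prepared example inputs.
    pred = model(*example_inputs)
    if suite == "torchbench":
        # .sum() stands in for a native loss function.
        loss = pred.sum()
    elif suite == "timm":
        # CrossEntropy against a dummy target, purely for illustration.
        target = torch.zeros(pred.shape[0], dtype=torch.long, device=pred.device)
        loss = torch.nn.functional.cross_entropy(pred, target)
    else:
        # HuggingFace models return their own loss.
        loss = pred.loss
    # Backward pass; per the text above, the optimizer step is skipped entirely.
    loss.backward()
```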

@@ -128,12 +128,6 @@ CI_SKIP_INDUCTOR_TRAINING = [
]
CI_SKIP_OPTIMIZER = {
# TIMM
"convmixer_768_32", # accuracy
}
def model_specified_by_path(path_and_class_str):
return ":" in path_and_class_str
@@ -878,7 +872,6 @@ class BenchmarkRunner:
self.use_amp = False
self.grad_scaler = DummyGradScaler()
self.autocast = NullContext
self.optimizer = None
self._args = None
def setup_amp(self):
@@ -907,11 +900,16 @@
# self.grad_scaler = torch.cuda.amp.GradScaler(init_scale=2.0)
self.autocast = torch.cuda.amp.autocast
def init_optimizer(self, name, device, params):
if device == "cuda" and self.args.training and name not in CI_SKIP_OPTIMIZER:
self.optimizer = torch.optim.SGD(params, lr=0.01)
else:
self.optimizer = None
def init_optimizer(self, device, params):
self.optimizer = None
# TODO - Currently, optimizers are used incorrectly. Fix optimizers with
# https://github.com/pytorch/pytorch/pull/87492
# param_list = list(params)
# if device == "cuda" and len(param_list) != 0:
# # capturable is only supported on cuda at the moment
# self.optimizer = torch.optim.Adam(param_list, capturable=True)
# else:
# self.optimizer = None
@property
def args(self):
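The commented-out TODO in the hunk above points at re-enabling a CUDA-graph-friendly optimizer via https://github.com/pytorch/pytorch/pull/87492. As a purely hypothetical sketch of that path (written here as a free function that returns the optimizer, unlike the method in the diff):

```python
import torch

def init_optimizer(device, params):
    param_list = list(params)
    if device == "cuda" and len(param_list) != 0:
        # capturable=True keeps Adam's step state in device tensors, which is
        # what makes the step safe to capture; it is only supported on CUDA.
        return torch.optim.Adam(param_list, capturable=True)
    return None
```

Once such an optimizer exists, the iteration loop would presumably call `optimizer.zero_grad(set_to_none=True)`, `loss.backward()`, and `optimizer.step()`, but that wiring is not part of this diff.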
@@ -1094,12 +1092,12 @@ class BenchmarkRunner:
# Collect the fp64 reference outputs to be used later for accuracy checking.
fp64_outputs = None
try:
model_fp64, inputs_fp64 = cast_to_fp64(
deepcopy_and_maybe_ddp(model),
clone_inputs(example_inputs),
fp64_outputs = self.run_n_iterations(
*cast_to_fp64(
deepcopy_and_maybe_ddp(model),
clone_inputs(example_inputs),
)
)
self.init_optimizer(name, current_device, model_fp64.parameters())
fp64_outputs = self.run_n_iterations(model_fp64, inputs_fp64)
except Exception:
log.warning(
f"fp64 golden ref were not generated for {name}. Setting accuracy check to cosine"
@@ -1120,18 +1118,14 @@
with self.pick_grad(name, self.args.training):
# Get results of native pytorch
reset_rng_state()
model_copy = deepcopy_and_maybe_ddp(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs)
deepcopy_and_maybe_ddp(model), clone_inputs(example_inputs)
)
# Rerun native pytorch
reset_rng_state()
model_copy = deepcopy_and_maybe_ddp(model)
self.init_optimizer(name, current_device, model_copy.parameters())
correct_rerun_result = self.run_n_iterations(
model_copy, clone_inputs(example_inputs)
deepcopy_and_maybe_ddp(model), clone_inputs(example_inputs)
)
if not same(
correct_result,
@@ -1147,11 +1141,11 @@
reset_rng_state()
torch._dynamo.reset()
try:
model_copy = deepcopy_and_maybe_ddp(model)
self.init_optimizer(name, current_device, model_copy.parameters())
optimized_model_iter_fn = optimize_ctx(self.run_n_iterations)
new_result = optimized_model_iter_fn(model_copy, example_inputs)
new_result = optimized_model_iter_fn(
deepcopy_and_maybe_ddp(model), example_inputs
)
except Exception as e:
accuracy_status = "fail_to_run"
print(
@@ -1199,7 +1193,6 @@ class BenchmarkRunner:
# Cast the model to float16/float32 as necessary
model, example_inputs = self.maybe_cast(model, example_inputs)
self.init_optimizer(name, current_device, model.parameters())
with self.pick_grad(name, self.args.training):
ok, total = Stats.reset_counters()
experiment_kwargs = {}
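Pulling the two accuracy hunks above together: eager is run twice from the same RNG state to confirm the model is deterministic enough to compare, and only then is the dynamo-wrapped iteration function run and checked with same(). A condensed sketch follows; the helpers are taken as parameters because this approximates, rather than reproduces, the runner's methods, and the status strings are illustrative:

```python
import copy
import torch

def check_accuracy(model, example_inputs, run_n_iterations, optimize_ctx,
                   reset_rng_state, same):
    # First eager run: the reference result.
    reset_rng_state()
    correct = run_n_iterations(copy.deepcopy(model), copy.deepcopy(example_inputs))

    # Second eager run: if eager does not match itself, the comparison is moot.
    reset_rng_state()
    rerun = run_n_iterations(copy.deepcopy(model), copy.deepcopy(example_inputs))
    if not same(correct, rerun):
        return "eager_two_runs_differ"

    # Compiled run: wrap the iteration function in the dynamo backend context
    # and compare its outputs against the eager reference.
    reset_rng_state()
    torch._dynamo.reset()
    try:
        optimized_iter_fn = optimize_ctx(run_n_iterations)
        new_result = optimized_iter_fn(copy.deepcopy(model), example_inputs)
    except Exception:
        return "fail_to_run"
    return "pass" if same(correct, new_result) else "fail_accuracy"
```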

@@ -433,6 +433,8 @@ class HuggingfaceRunner(BenchmarkRunner):
else:
model.eval()
self.init_optimizer(device, model.parameters())
self.validate_model(model, example_inputs)
return device, model_name, model, example_inputs, batch_size

@@ -261,6 +261,8 @@ class TimmRunnner(BenchmarkRunner):
else:
model.eval()
self.init_optimizer(device, model.parameters())
self.validate_model(model, example_inputs)
return device, model_name, model, example_inputs, batch_size

@@ -295,6 +295,8 @@ class TorchBenchmarkRunner(BenchmarkRunner):
gc.collect()
batch_size = benchmark.batch_size
self.init_optimizer(device, model.parameters())
# Torchbench has quite different setup for yolov3, so directly passing
# the right example_inputs
if model_name == "yolov3":