#!/usr/bin/env python3
import gc
import importlib
import logging
import os
import re
import sys
import warnings
from os.path import abspath, exists
import torch
try:
    from .common import BenchmarkRunner, main
except ImportError:
    from common import BenchmarkRunner, main
from torch._dynamo.testing import collect_results, reduce_to_scalar_loss
from torch._dynamo.utils import clone_inputs
# We are primarily interested in the TF32 datatype
torch.backends.cuda.matmul.allow_tf32 = True
def setup_torchbench_cwd():
    original_dir = abspath(os.getcwd())

    os.environ["KALDI_ROOT"] = "/tmp"  # avoids some spam
    for torchbench_dir in (
        "./torchbenchmark",
        "../torchbenchmark",
        "../torchbench",
        "../benchmark",
        "../../torchbenchmark",
        "../../torchbench",
        "../../benchmark",
    ):
        if exists(torchbench_dir):
            break

    if exists(torchbench_dir):
        torchbench_dir = abspath(torchbench_dir)
        os.chdir(torchbench_dir)
        sys.path.append(torchbench_dir)

    return original_dir

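# Usage sketch (hypothetical invocation; the actual flags are defined in
# common.main):
#
#   python torchbench.py --training --accuracy --filter hf_Bert
#
# setup_torchbench_cwd() chdirs into the first torchbenchmark checkout found
# among the candidate paths above, so that the models' relative dataset paths
# resolve, and returns the original working directory for main() to use.
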
# Some models have large datasets that don't fit in memory. Lower the batch
# size to test the accuracy.
USE_SMALL_BATCH_SIZE = {
    "demucs": 4,
    "dlrm": 1024,
    "densenet121": 4,
    "hf_Reformer": 4,
    "timm_efficientdet": 1,
}
DETECTRON2_MODELS = {
    "detectron2_fasterrcnn_r_101_c4",
    "detectron2_fasterrcnn_r_101_dc5",
    "detectron2_fasterrcnn_r_101_fpn",
    "detectron2_fasterrcnn_r_50_c4",
    "detectron2_fasterrcnn_r_50_dc5",
    "detectron2_fasterrcnn_r_50_fpn",
    "detectron2_maskrcnn_r_101_c4",
    "detectron2_maskrcnn_r_101_fpn",
    "detectron2_maskrcnn_r_50_fpn",
}
SKIP = {
    # https://github.com/pytorch/torchdynamo/issues/101
    "detectron2_maskrcnn",
    # https://github.com/pytorch/torchdynamo/issues/145
    "fambench_xlmr",
}
# Additional models that are skipped in training
SKIP_TRAIN = {
    # not designed for training
    "pyhpc_equation_of_state",
    "pyhpc_isoneutral_mixing",
    "pyhpc_turbulent_kinetic_energy",
    # Unusual training setup
    "opacus_cifar10",
    "maml",
}
SKIP_TRAIN.update(DETECTRON2_MODELS)
# These models only support train mode, so accuracy checking can't be done in
# eval mode.
ONLY_TRAINING_MODE = {
    "tts_angular",
    "tacotron2",
    "demucs",
    "hf_Reformer",
    "pytorch_struct",
    "yolov3",
}
ONLY_TRAINING_MODE.update(DETECTRON2_MODELS)
# Need higher tolerance on GPU. These models have nondeterministic GPU kernels.
REQUIRE_HIGHER_TOLERANCE = {
    "alexnet",
    "attention_is_all_you_need_pytorch",
    "densenet121",
    "hf_Albert",
    "vgg16",
    "mobilenet_v3_large",
    "nvidia_deeprecommender",
    "timm_efficientdet",
    "vision_maskrcnn",
}
# These models need >1e-3 tolerance
REQUIRE_EVEN_HIGHER_TOLERANCE = {
    "soft_actor_critic",
    "tacotron2",
}
REQUIRE_COSINE_TOLERANCE = {
    # Intentionally empty; kept as a placeholder in case it's needed in the
    # future.
}
# Non-deterministic output / can't check correctness
NONDETERMINISTIC = set()
# These benchmarks took >600s on an i9-11900K CPU
VERY_SLOW_BENCHMARKS = {
    "hf_BigBird",  # 3339s
    "hf_Longformer",  # 3062s
    "hf_T5",  # 930s
}
# These benchmarks took >60s on an i9-11900K CPU
SLOW_BENCHMARKS = {
    *VERY_SLOW_BENCHMARKS,
    "BERT_pytorch",  # 137s
    "demucs",  # 116s
    "fastNLP_Bert",  # 242s
    "hf_Albert",  # 221s
    "hf_Bart",  # 400s
    "hf_Bert",  # 334s
    "hf_DistilBert",  # 187s
    "hf_GPT2",  # 470s
    "hf_Reformer",  # 141s
    "speech_transformer",  # 317s
    "vision_maskrcnn",  # 99s
}
TRT_NOT_YET_WORKING = {
    "alexnet",
    "resnet18",
    "resnet50",
    "mobilenet_v2",
    "mnasnet1_0",
    "squeezenet1_1",
    "shufflenetv2_x1_0",
    "vgg16",
    "resnext50_32x4d",
}
DYNAMIC_SHAPES_NOT_YET_WORKING = {
    "demucs",
    "timm_nfnet",
}
DONT_CHANGE_BATCH_SIZE = {
    "demucs",
    "pytorch_struct",
    "pyhpc_turbulent_kinetic_energy",
}
SKIP_ACCURACY_CHECK_MODELS = {
    # Models too large to hold the eager, dynamo, and fp64 reference numbers
    # in memory simultaneously, even on a 40 GB machine. We have tested
    # accuracy on smaller versions of these models.
    "hf_GPT2_large",
    "hf_T5_large",
    "timm_vision_transformer_large",
}
MAX_BATCH_SIZE_FOR_ACCURACY_CHECK = {
    "hf_GPT2": 2,
    "pytorch_unet": 2,
}

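# The tables above are consumed by TorchBenchmarkRunner below: load_model()
# applies the batch-size overrides and ONLY_TRAINING_MODE, the skip/slow sets
# are exposed through the properties for common.BenchmarkRunner, and the
# tolerance sets drive get_tolerance_and_cosine_flag().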
class TorchBenchmarkRunner(BenchmarkRunner):
    def __init__(self):
        super().__init__()
        self.suite_name = "torchbench"

    @property
    def skip_models(self):
        return SKIP

    @property
    def slow_models(self):
        return SLOW_BENCHMARKS

    @property
    def very_slow_models(self):
        return VERY_SLOW_BENCHMARKS

    @property
    def non_deterministic_models(self):
        return NONDETERMINISTIC

    @property
    def skip_not_suitable_for_training_models(self):
        return SKIP_TRAIN

    @property
    def failing_fx2trt_models(self):
        return TRT_NOT_YET_WORKING

    @property
    def failing_dynamic_shape_models(self):
        return DYNAMIC_SHAPES_NOT_YET_WORKING

    @property
    def skip_accuracy_checks_large_models_dashboard(self):
        if self.args.dashboard or self.args.accuracy:
            return SKIP_ACCURACY_CHECK_MODELS
        return set()

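    # Illustration (hypothetical values): load_model("cuda", "resnet50") would
    # return a ("cuda", "resnet50", model, example_inputs, batch_size) tuple,
    # with the model in eval mode (no --training) and batch_size taken from
    # the benchmark instance.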
    def load_model(
        self,
        device,
        model_name,
        batch_size=None,
        part=None,
    ):
        is_training = self.args.training
        use_eval_mode = self.args.use_eval_mode
        dynamic_shapes = self.args.dynamic_shapes
        try:
            module = importlib.import_module(f"torchbenchmark.models.{model_name}")
        except ModuleNotFoundError:
            module = importlib.import_module(f"torchbenchmark.models.fb.{model_name}")
        benchmark_cls = getattr(module, "Model", None)
        if benchmark_cls is None:
            raise NotImplementedError(f"{model_name} does not define a Model class")
        if not hasattr(benchmark_cls, "name"):
            benchmark_cls.name = model_name

        cant_change_batch_size = (
            not getattr(benchmark_cls, "ALLOW_CUSTOMIZE_BSIZE", True)
            or model_name in DONT_CHANGE_BATCH_SIZE
        )
        if cant_change_batch_size:
            batch_size = None
        if batch_size is None and is_training and model_name in USE_SMALL_BATCH_SIZE:
            batch_size = USE_SMALL_BATCH_SIZE[model_name]

        # Control the memory footprint for a few models
        if self.args.accuracy and model_name in MAX_BATCH_SIZE_FOR_ACCURACY_CHECK:
            max_batch_size = MAX_BATCH_SIZE_FOR_ACCURACY_CHECK[model_name]
            # batch_size may still be None at this point; fall back to the cap
            batch_size = (
                max_batch_size if batch_size is None else min(batch_size, max_batch_size)
            )

        # workaround "RuntimeError: not allowed to set torch.backends.cudnn flags";
        # use setattr so the dunder attribute is not name-mangled inside this class
        setattr(torch.backends, "__allow_nonbracketed_mutation_flag", True)

        extra_args = []
        if part:
            extra_args = ["--part", part]
        if is_training:
            benchmark = benchmark_cls(
                test="train",
                device=device,
                jit=False,
                batch_size=batch_size,
                extra_args=extra_args,
            )
        else:
            benchmark = benchmark_cls(
                test="eval",
                device=device,
                jit=False,
                batch_size=batch_size,
                extra_args=extra_args,
            )
        model, example_inputs = benchmark.get_module()

        # Models that must be in train mode while training
        if is_training and (not use_eval_mode or model_name in ONLY_TRAINING_MODE):
            model.train()
        else:
            model.eval()
        gc.collect()
        batch_size = benchmark.batch_size

        # Torchbench has a quite different setup for yolov3, so directly pass
        # the right example_inputs
        if model_name == "yolov3":
            example_inputs = (torch.rand(batch_size, 3, 384, 512).to(device),)
        # global current_name, current_device
        # current_device = device
        # current_name = benchmark.name
        self.validate_model(model, example_inputs)
        return device, benchmark.name, model, example_inputs, batch_size

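    # Note: args.filter and args.exclude are lists of regexes that get joined
    # with "|" below, so a hypothetical "--filter hf_Bert hf_GPT2" keeps any
    # model whose name matches either pattern, case-insensitively, unless the
    # model is also excluded or listed in SKIP.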
    def iter_model_names(self, args):
        from torchbenchmark import _list_model_paths

        models = _list_model_paths()
        start, end = self.get_benchmark_indices(len(models))
        for index, model_path in enumerate(models):
            if index < start or index >= end:
                continue
            model_name = os.path.basename(model_path)
            if (
                not re.search("|".join(args.filter), model_name, re.I)
                or re.search("|".join(args.exclude), model_name, re.I)
                or model_name in SKIP
            ):
                continue
            yield model_name

    def pick_grad(self, name, is_training):
        if is_training or name in ("maml",):
            return torch.enable_grad()
        else:
            return torch.no_grad()

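    # Worked examples of the rules below, using names from the tables above:
    # tacotron2 training on cuda -> (8e-2, cosine); alexnet training on cuda
    # -> (1e-3, cosine); any --float16/--amp run -> (1e-3, cosine).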
    def get_tolerance_and_cosine_flag(self, is_training, current_device, name):
        tolerance = 1e-4
        cosine = self.args.cosine
        # Increase the tolerance for torch.allclose
        if self.args.float16 or self.args.amp:
            return 1e-3, cosine
        if is_training and current_device == "cuda":
            tolerance = 1e-3
            if name in REQUIRE_COSINE_TOLERANCE:
                cosine = True
            elif name in REQUIRE_HIGHER_TOLERANCE:
                tolerance = 1e-3
            elif name in REQUIRE_EVEN_HIGHER_TOLERANCE:
                tolerance = 8 * 1e-2
        return tolerance, cosine

    def compute_loss(self, pred):
        return reduce_to_scalar_loss(pred)

    def forward_pass(self, mod, inputs, collect_outputs=True):
        return mod(*inputs)

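    # The training step below follows the standard AMP recipe: the forward
    # pass and loss are computed under autocast, then backward() runs on the
    # grad-scaled loss outside the autocast region before the optimizer step.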
    def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
        cloned_inputs = clone_inputs(inputs)
        self.optimizer_zero_grad(mod)
        with self.autocast():
            pred = mod(*cloned_inputs)
            loss = self.compute_loss(pred)
        self.grad_scaler.scale(loss).backward()
        self.optimizer_step()
        if collect_outputs:
            return collect_results(mod, pred, loss, cloned_inputs)
        return None

if __name__ == "__main__":
    original_dir = setup_torchbench_cwd()
    logging.basicConfig(level=logging.WARNING)
    warnings.filterwarnings("ignore")
    main(TorchBenchmarkRunner(), original_dir)