| #!/usr/bin/env python3 |
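"""TorchBench front end for the TorchDynamo benchmark harness.

Finds a local torchbenchmark checkout, loads individual models with the
per-model batch-size, tolerance, and skip overrides defined below, and
delegates the actual benchmarking loop to BenchmarkRunner / main from
common.py.
"""
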
| import gc |
| import importlib |
| import logging |
| import os |
| import re |
| import sys |
| import warnings |
| from os.path import abspath, exists |
| |
| import torch |
| |
| try: |
| from .common import BenchmarkRunner, main |
| except ImportError: |
| from common import BenchmarkRunner, main |
| |
| from torch._dynamo.testing import collect_results, reduce_to_scalar_loss |
| from torch._dynamo.utils import clone_inputs |
| |
# We are primarily interested in the TF32 datatype
| torch.backends.cuda.matmul.allow_tf32 = True |
| |
| |
| def setup_torchbench_cwd(): |
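    """Make a local torchbenchmark checkout importable.

    Probes a handful of likely relative paths, switches the working
    directory to the first one that exists, and appends it to sys.path.
    Returns the original working directory so the caller can refer back
    to (or restore) it.
    """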
| original_dir = abspath(os.getcwd()) |
| |
| os.environ["KALDI_ROOT"] = "/tmp" # avoids some spam |
| for torchbench_dir in ( |
| "./torchbenchmark", |
| "../torchbenchmark", |
| "../torchbench", |
| "../benchmark", |
| "../../torchbenchmark", |
| "../../torchbench", |
| "../../benchmark", |
| ): |
| if exists(torchbench_dir): |
| break |
| |
| if exists(torchbench_dir): |
| torchbench_dir = abspath(torchbench_dir) |
| os.chdir(torchbench_dir) |
| sys.path.append(torchbench_dir) |
| |
| return original_dir |
| |
| |
# Some models have datasets too large to fit in memory. Lower the batch
# size when checking accuracy.
| USE_SMALL_BATCH_SIZE = { |
| "demucs": 4, |
| "dlrm": 1024, |
| "densenet121": 4, |
| "hf_Reformer": 4, |
| "timm_efficientdet": 1, |
| } |
| |
| DETECTRON2_MODELS = { |
| "detectron2_fasterrcnn_r_101_c4", |
| "detectron2_fasterrcnn_r_101_dc5", |
| "detectron2_fasterrcnn_r_101_fpn", |
| "detectron2_fasterrcnn_r_50_c4", |
| "detectron2_fasterrcnn_r_50_dc5", |
| "detectron2_fasterrcnn_r_50_fpn", |
| "detectron2_maskrcnn_r_101_c4", |
| "detectron2_maskrcnn_r_101_fpn", |
| "detectron2_maskrcnn_r_50_fpn", |
| } |
| |
| SKIP = { |
| # https://github.com/pytorch/torchdynamo/issues/101 |
| "detectron2_maskrcnn", |
| # https://github.com/pytorch/torchdynamo/issues/145 |
| "fambench_xlmr", |
| } |
| |
| # Additional models that are skipped in training |
| SKIP_TRAIN = { |
| # not designed for training |
| "pyhpc_equation_of_state", |
| "pyhpc_isoneutral_mixing", |
| "pyhpc_turbulent_kinetic_energy", |
| # Unusual training setup |
| "opacus_cifar10", |
| "maml", |
| } |
| SKIP_TRAIN.update(DETECTRON2_MODELS) |
| |
# These models support only train mode, so accuracy checking can't be done in
# eval mode.
| ONLY_TRAINING_MODE = { |
| "tts_angular", |
| "tacotron2", |
| "demucs", |
| "hf_Reformer", |
| "pytorch_struct", |
| "yolov3", |
| } |
| ONLY_TRAINING_MODE.update(DETECTRON2_MODELS) |
| |
# These models need a higher tolerance on GPU because some of their GPU
# kernels are non-deterministic.
| REQUIRE_HIGHER_TOLERANCE = { |
| "alexnet", |
| "attention_is_all_you_need_pytorch", |
| "densenet121", |
| "hf_Albert", |
| "vgg16", |
| "mobilenet_v3_large", |
| "nvidia_deeprecommender", |
| "timm_efficientdet", |
| "vision_maskrcnn", |
| } |
| |
| # These models need >1e-3 tolerance |
| REQUIRE_EVEN_HIGHER_TOLERANCE = { |
| "soft_actor_critic", |
| "tacotron2", |
| } |
| |
REQUIRE_COSINE_TOLERANCE = {
    # Deliberately empty for now; kept around in case we need it in the future.
}
| |
# Non-deterministic output, so correctness can't be checked.
| NONDETERMINISTIC = set() |
| |
| # These benchmarks took >600s on an i9-11900K CPU |
| VERY_SLOW_BENCHMARKS = { |
| "hf_BigBird", # 3339s |
| "hf_Longformer", # 3062s |
| "hf_T5", # 930s |
| } |
| |
| # These benchmarks took >60s on an i9-11900K CPU |
| SLOW_BENCHMARKS = { |
| *VERY_SLOW_BENCHMARKS, |
| "BERT_pytorch", # 137s |
| "demucs", # 116s |
| "fastNLP_Bert", # 242s |
| "hf_Albert", # 221s |
| "hf_Bart", # 400s |
| "hf_Bert", # 334s |
| "hf_DistilBert", # 187s |
| "hf_GPT2", # 470s |
| "hf_Reformer", # 141s |
| "speech_transformer", # 317s |
| "vision_maskrcnn", # 99s |
| } |
| |
| TRT_NOT_YET_WORKING = { |
| "alexnet", |
| "resnet18", |
| "resnet50", |
| "mobilenet_v2", |
| "mnasnet1_0", |
| "squeezenet1_1", |
| "shufflenetv2_x1_0", |
| "vgg16", |
| "resnext50_32x4d", |
| } |
| |
| DYNAMIC_SHAPES_NOT_YET_WORKING = { |
| "demucs", |
| "timm_nfnet", |
| } |
| |
| DONT_CHANGE_BATCH_SIZE = { |
| "demucs", |
| "pytorch_struct", |
| "pyhpc_turbulent_kinetic_energy", |
| } |
| |
| |
| SKIP_ACCURACY_CHECK_MODELS = { |
    # Models too large to keep the eager, dynamo, and fp64 copies in memory
    # simultaneously, even on a 40 GB machine. Accuracy has been verified on
    # smaller versions of these models.
| "hf_GPT2_large", |
| "hf_T5_large", |
| "timm_vision_transformer_large", |
| } |
| |
| |
| MAX_BATCH_SIZE_FOR_ACCURACY_CHECK = { |
| "hf_GPT2": 2, |
| "pytorch_unet": 2, |
| } |
| |
| |
| class TorchBenchmarkRunner(BenchmarkRunner): |
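    """BenchmarkRunner implementation for the TorchBench model suite."""
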
| def __init__(self): |
        super().__init__()
| self.suite_name = "torchbench" |
| |
| @property |
| def skip_models(self): |
| return SKIP |
| |
| @property |
| def slow_models(self): |
| return SLOW_BENCHMARKS |
| |
| @property |
| def very_slow_models(self): |
| return VERY_SLOW_BENCHMARKS |
| |
| @property |
| def non_deterministic_models(self): |
| return NONDETERMINISTIC |
| |
| @property |
| def skip_not_suitable_for_training_models(self): |
| return SKIP_TRAIN |
| |
| @property |
| def failing_fx2trt_models(self): |
| return TRT_NOT_YET_WORKING |
| |
| @property |
| def failing_dynamic_shape_models(self): |
| return DYNAMIC_SHAPES_NOT_YET_WORKING |
| |
| @property |
| def skip_accuracy_checks_large_models_dashboard(self): |
| if self.args.dashboard or self.args.accuracy: |
| return SKIP_ACCURACY_CHECK_MODELS |
| return set() |
| |
| def load_model( |
| self, |
| device, |
| model_name, |
| batch_size=None, |
| part=None, |
| ): |
| |
| is_training = self.args.training |
| use_eval_mode = self.args.use_eval_mode |
| dynamic_shapes = self.args.dynamic_shapes |
| try: |
| module = importlib.import_module(f"torchbenchmark.models.{model_name}") |
| except ModuleNotFoundError: |
| module = importlib.import_module(f"torchbenchmark.models.fb.{model_name}") |
| benchmark_cls = getattr(module, "Model", None) |
| if not hasattr(benchmark_cls, "name"): |
| benchmark_cls.name = model_name |
| |
| cant_change_batch_size = ( |
| not getattr(benchmark_cls, "ALLOW_CUSTOMIZE_BSIZE", True) |
| or model_name in DONT_CHANGE_BATCH_SIZE |
| ) |
| if cant_change_batch_size: |
| batch_size = None |
| if batch_size is None and is_training and model_name in USE_SMALL_BATCH_SIZE: |
| batch_size = USE_SMALL_BATCH_SIZE[model_name] |
| |
        # Control the memory footprint for a few models. batch_size may still
        # be None here, so fall back to the cap instead of calling min() on None.
        if self.args.accuracy and model_name in MAX_BATCH_SIZE_FOR_ACCURACY_CHECK:
            max_batch_size = MAX_BATCH_SIZE_FOR_ACCURACY_CHECK[model_name]
            batch_size = (
                max_batch_size if batch_size is None else min(batch_size, max_batch_size)
            )
| |
| # workaround "RuntimeError: not allowed to set torch.backends.cudnn flags" |
| torch.backends.__allow_nonbracketed_mutation_flag = True |
| extra_args = [] |
| if part: |
| extra_args = ["--part", part] |
| if is_training: |
| benchmark = benchmark_cls( |
| test="train", |
| device=device, |
| jit=False, |
| batch_size=batch_size, |
| extra_args=extra_args, |
| ) |
| else: |
| benchmark = benchmark_cls( |
| test="eval", |
| device=device, |
| jit=False, |
| batch_size=batch_size, |
| extra_args=extra_args, |
| ) |
| model, example_inputs = benchmark.get_module() |
| |
| # Models that must be in train mode while training |
| if is_training and (not use_eval_mode or model_name in ONLY_TRAINING_MODE): |
| model.train() |
| else: |
| model.eval() |
| gc.collect() |
| batch_size = benchmark.batch_size |
| |
        # Torchbench has a quite different setup for yolov3, so pass the
        # correct example_inputs directly.
| if model_name == "yolov3": |
| example_inputs = (torch.rand(batch_size, 3, 384, 512).to(device),) |
| # global current_name, current_device |
| # current_device = device |
| # current_name = benchmark.name |
| self.validate_model(model, example_inputs) |
| return device, benchmark.name, model, example_inputs, batch_size |
| |
| def iter_model_names(self, args): |
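        """Yield the torchbench model names selected by the arguments.

        Applies the filter and exclude regexes from args, the SKIP set,
        and the index range returned by get_benchmark_indices.
        """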
| from torchbenchmark import _list_model_paths |
| |
| models = _list_model_paths() |
| start, end = self.get_benchmark_indices(len(models)) |
| for index, model_path in enumerate(models): |
| if index < start or index >= end: |
| continue |
| |
| model_name = os.path.basename(model_path) |
| if ( |
| not re.search("|".join(args.filter), model_name, re.I) |
| or re.search("|".join(args.exclude), model_name, re.I) |
| or model_name in SKIP |
| ): |
| continue |
| |
| yield model_name |
| |
| def pick_grad(self, name, is_training): |
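        # maml runs an inner training loop inside its forward pass, so it needs
        # gradients enabled even when not training.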
| if is_training or name in ("maml",): |
| return torch.enable_grad() |
| else: |
| return torch.no_grad() |
| |
| def get_tolerance_and_cosine_flag(self, is_training, current_device, name): |
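        """Return (tolerance, use_cosine_similarity) for the accuracy check.

        The default tolerance is relaxed per model based on the REQUIRE_*
        sets above, on float16/AMP, and on whether we are training on CUDA.
        """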
| tolerance = 1e-4 |
| cosine = self.args.cosine |
| # Increase the tolerance for torch allclose |
| if self.args.float16 or self.args.amp: |
| return 1e-3, cosine |
| if is_training and current_device == "cuda": |
| tolerance = 1e-3 |
            if name in REQUIRE_COSINE_TOLERANCE:
| cosine = True |
| elif name in REQUIRE_HIGHER_TOLERANCE: |
| tolerance = 1e-3 |
| elif name in REQUIRE_EVEN_HIGHER_TOLERANCE: |
| tolerance = 8 * 1e-2 |
| return tolerance, cosine |
| |
| def compute_loss(self, pred): |
| return reduce_to_scalar_loss(pred) |
| |
| def forward_pass(self, mod, inputs, collect_outputs=True): |
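        # collect_outputs is part of the runner interface; plain inference has
        # nothing extra to collect.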
| return mod(*inputs) |
| |
| def forward_and_backward_pass(self, mod, inputs, collect_outputs=True): |
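        """Run one full training step on a cloned copy of the inputs.

        Clones the inputs so the originals are not mutated, then runs
        forward, loss reduction, a (possibly grad-scaled) backward, and an
        optimizer step. Returns collected results unless collect_outputs
        is False.
        """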
| cloned_inputs = clone_inputs(inputs) |
| self.optimizer_zero_grad(mod) |
| with self.autocast(): |
| pred = mod(*cloned_inputs) |
| loss = self.compute_loss(pred) |
| self.grad_scaler.scale(loss).backward() |
| self.optimizer_step() |
| if collect_outputs: |
| return collect_results(mod, pred, loss, cloned_inputs) |
| return None |
| |
| |
| if __name__ == "__main__": |
| |
| original_dir = setup_torchbench_cwd() |
| logging.basicConfig(level=logging.WARNING) |
| warnings.filterwarnings("ignore") |
| main(TorchBenchmarkRunner(), original_dir) |