| #!/usr/bin/env python3 |
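"""Microbenchmark simple pointwise/reduction kernels across devices and input
sizes, comparing eager mode and TorchScript (each wrapped in CUDA graphs)
against TorchInductor, and print the median speedups as a table."""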
| import argparse |
import inspect
import re
| import sys |
| |
| import numpy as np |
| import tabulate |
| import torch |
| |
| import torch._inductor |
| from torch._dynamo.backends.cudagraphs import cudagraphs_inner |
| from torch._dynamo.testing import same |
| from torch._inductor.compile_fx import compile_fx |
| from torch._inductor.utils import timed |
| |
# optional import of the inductor test suite; tti is not used directly below
try:
    import test.test_torchinductor as tti
except ImportError:
    tti = None
| |
| |
| def compute_speedups(args, models, example_inputs): |
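    """Time each model in `models` and return median speedups of models[1:]
    relative to the baseline models[0]."""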
    # sanity check: every model must match the outputs of the baseline models[0]
    expected = models[0](*example_inputs)
| for model in models[1:]: |
| actual = model(*example_inputs) |
| assert same(actual, expected), expected[0] - actual[0] |
| |
| timings = np.zeros((args.repeat, len(models)), np.float64) |
| for rep in range(args.repeat): |
| # interleave the runs to handle frequency scaling and load changes |
| for m, model in enumerate(models): |
| timings[rep, m] = timed(model, example_inputs) |
| median = np.median(timings, axis=0) |
| return (median[0] / median[1:]).tolist() |
| |
| |
| def microbenchmark(args, model, example_inputs): |
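    """Build TorchScript and inductor versions of `model` and return their
    speedups over eager; eager and TorchScript are wrapped in CUDA graphs
    via cudagraphs_inner."""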
| compiled_fn = compile_fx(torch.fx.symbolic_trace(model), example_inputs) |
| cudagraphs_eager = cudagraphs_inner(model, example_inputs, copy_outputs=False) |
| cudagraphs_jit = cudagraphs_inner( |
| torch.jit.trace(model, example_inputs), example_inputs, copy_outputs=False |
| ) |
| return compute_speedups( |
| args, |
| [cudagraphs_eager, cudagraphs_jit, compiled_fn], |
| example_inputs, |
| ) |
| |
| |
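# Example nn.Module workloads (not benchmarked by main() below).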
| class MyModel1(torch.nn.Module): |
| def __init__(self): |
| super().__init__() |
| self.model = torch.nn.Sequential( |
| torch.nn.Linear(1024, 1024), |
| torch.nn.ReLU(), |
| ) |
| |
| def forward(self, input): |
| # return (self.model(input) + 1,) |
| return (self.model(input),) |
| |
| |
| class MyModel2(torch.nn.Module): |
| def forward(self, x, y): |
| # return x / (torch.abs(x) + 1.0), |
| return (x + y,) |
| |
| |
| class MicroBenchmarks: |
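    """Simple pointwise/reduction kernels to benchmark; each returns a 1-tuple."""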
| @staticmethod |
| def add(a, b): |
| return (a + b,) |
| |
| @staticmethod |
| def scale(x, m, d): |
| return ((x - m) / torch.clip(d, 1e-4),) |
| |
| @staticmethod |
| def abs_norm(x): |
| return (x / (torch.abs(x) + 1),) |
| |
| @staticmethod |
| def add_relu_softmax(x, a): |
| return (torch.softmax(torch.relu(x + a), -1),) |
| |
| @staticmethod |
| def sum(a, b): |
| return ((a + b).sum(),) |
| |
| |
| def main(): |
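    """Parse CLI args, configure the fuser and inductor, run the selected
    microbenchmarks, and print a table of median speedups."""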
| parser = argparse.ArgumentParser() |
| parser.add_argument( |
| "--filter", "-k", action="append", help="filter benchmarks with regexp" |
| ) |
| parser.add_argument( |
| "--exclude", "-x", action="append", help="filter benchmarks with regexp" |
| ) |
| parser.add_argument("--devices", "-d", action="append", help="cpu or cuda") |
| parser.add_argument("--size", "-s", action="append", help="cpu or cuda") |
| parser.add_argument( |
| "--repeat", "-n", type=int, default=30, help="number of timing runs" |
| ) |
| parser.add_argument( |
| "--threads", "-t", type=int, help="number of threads to use for eager" |
| ) |
| parser.add_argument( |
| "--verbose", "-v", action="store_true", help="enable verbose debug printouts" |
| ) |
| parser.add_argument( |
| "--nvfuser", action="store_true", help="enable nvfuser globally" |
| ) |
| parser.add_argument("--transpose", action="store_true", help="transpose one input") |
| parser.add_argument("--broadcast", action="store_true", help="broadcast one input") |
| args = parser.parse_args() |
| |
| # defaults |
| args.devices = args.devices or ["cpu", "cuda"] |
| args.filter = args.filter or [r"."] |
| args.exclude = args.exclude or [r"^$"] |
| args.size = args.size or [64, 256, 1024, 4096, 8192] |
| |
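    # Configure the TorchScript fuser: route fusion through nvFuser when
    # requested, otherwise use the TensorExpr fuser (with LLVM-backed CPU
    # fusion when available) and keep nvFuser disabled.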
| if args.nvfuser: |
| torch._C._jit_override_can_fuse_on_cpu(False) |
| torch._C._jit_override_can_fuse_on_gpu(False) |
| torch._C._jit_set_texpr_fuser_enabled(False) |
| torch._C._jit_set_nvfuser_enabled(True) |
| else: |
| torch._C._jit_override_can_fuse_on_cpu(torch._C._llvm_enabled()) |
| torch._C._jit_override_can_fuse_on_gpu(True) |
| torch._C._jit_set_texpr_fuser_enabled(True) |
| if torch.cuda.is_available(): |
| torch._C._jit_set_nvfuser_enabled(False) |
| |
| if args.threads: |
| torch.set_num_threads(args.threads) |
| torch._inductor.config.cpp.threads = args.threads |
| |
| if args.verbose: |
| torch._inductor.config.debug = True |
| |
    # enable inductor's autotuning of pointwise Triton kernels
    torch._inductor.config.triton.autotune_pointwise = True
| |
    rows = []
    # run each MicroBenchmarks kernel selected by the --filter/--exclude regexps
    for name, model in inspect.getmembers(MicroBenchmarks, inspect.isfunction):
        if not any(re.search(p, name) for p in args.filter):
            continue
        if any(re.search(p, name) for p in args.exclude):
            continue
| nargs = len(inspect.signature(model).parameters) |
| for device in args.devices: |
| for n in args.size: |
| n = int(n) |
| sys.stdout.write(f"{model.__name__:10} {device:4} {n:5} ") |
| sys.stdout.flush() |
| inputs = [torch.rand((n, n), device=device) for _ in range(nargs)] |
| if args.broadcast: |
| inputs[-1] = torch.rand((1, n), device=device) |
| if args.transpose: |
| inputs[-1] = inputs[-1].transpose(0, 1) |
| result = microbenchmark(args, model, inputs) |
| rows.append([model.__name__, device, str(n)] + result) |
| print(" ".join(f"{v:.2f}x" for v in result)) |
| |
    # "ts" and "inductor" columns are median speedups over the eager baseline
    print(
| tabulate.tabulate( |
| rows, |
| headers=[ |
| "model", |
| "dev", |
| "n", |
| "ts", |
| "inductor", |
| ], |
| ) |
| ) |
| |
| |
| if __name__ == "__main__": |
| main() |