import copy
import functools
import io
import logging
import os
import subprocess
import tempfile
import numpy as np
import torch
from ..utils import identity
from .subgraph import SubGraph
log = logging.getLogger(__name__)
BACKENDS = dict()
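# Maps a backend name to its compile function; entries are added by the
# register_backend/create_backend decorators below.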
_NP_DTYPE = {
torch.float16: np.float16,
torch.float32: np.float32,
torch.float64: np.float64,
torch.uint8: np.uint8,
torch.int8: np.int8,
torch.int16: np.int16,
torch.int32: np.int32,
torch.int64: np.longlong,
torch.bool: np.bool_,
}
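# Torch-to-numpy dtype mapping used by the ONNX Runtime backends below when
# binding tensor inputs/outputs via io_binding.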
def register_backend(fn):
@functools.wraps(fn)
def inner(gm, example_inputs, **kwargs):
return fn(gm, example_inputs, **kwargs)
BACKENDS[fn.__name__] = inner
return inner
def create_backend(fn):
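    """Register `fn` in BACKENDS, wrapping it so it always receives a SubGraph
    (one is built in a temporary directory when given a raw module) and so any
    exception is logged and turned into a None return (fall back to eager)."""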
@functools.wraps(fn)
def inner(model, example_inputs=None, **kwargs):
if model is None:
return None
if not isinstance(model, SubGraph):
with tempfile.TemporaryDirectory() as tmp:
return inner(SubGraph(model, example_inputs, tmp), **kwargs)
else:
assert example_inputs is None
try:
return fn(model, **kwargs)
except KeyboardInterrupt:
raise
except Exception:
log.exception(f"{fn.__name__} error")
return None
BACKENDS[fn.__name__] = inner
return inner
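# Hedged usage sketch (the real call site lives elsewhere in torchdynamo):
# registered backends are looked up by name and called with an fx.GraphModule
# plus example inputs, roughly:
#
#     compiled_fn = BACKENDS["nnc"](gm, example_inputs)
#     out = compiled_fn(*example_inputs) if compiled_fn is not None else gm(*example_inputs)
#
# A None return signals "could not compile, fall back to the eager module".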
@create_backend
def eager(subgraph):
return subgraph.model
@create_backend
def ts(subgraph):
return subgraph.scripted
def reload_jit_model(subgraph, opt_fn=identity):
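    """Round-trip the scripted module through torch.jit.save/load so a fresh copy
    is compiled under the currently active fuser, apply `opt_fn`, then run a few
    warmup iterations to populate the fuser's compilation cache."""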
tmp = io.BytesIO()
torch.jit.save(subgraph.scripted, tmp)
tmp.seek(0)
model = torch.jit.load(tmp)
model = opt_fn(model)
# populate cache
for _ in range(3):
model(*subgraph.example_inputs)
return model
def reload_jit_model_ofi(subgraph):
return reload_jit_model(subgraph, torch.jit.optimize_for_inference)
@create_backend
def nnc(subgraph):
with torch.jit.fuser("fuser1"):
return reload_jit_model(subgraph)
@create_backend
def nnc_ofi(subgraph):
with torch.jit.fuser("fuser1"):
return reload_jit_model_ofi(subgraph)
@create_backend
def nvfuser(subgraph):
with torch.jit.fuser("fuser2"):
return reload_jit_model(subgraph)
@create_backend
def nvfuser_ofi(subgraph):
with torch.jit.fuser("fuser2"):
return reload_jit_model_ofi(subgraph)
@create_backend
def onednn(subgraph):
with torch.jit.fuser("fuser3"):
return reload_jit_model(subgraph)
@create_backend
def ofi(subgraph):
return torch.jit.optimize_for_inference(subgraph.scripted)
@create_backend
def static_runtime(subgraph):
scripted = subgraph.scripted
if hasattr(scripted, "_c"):
static_module = torch._C._jit_to_static_module(scripted._c)
else:
static_module = torch._C._jit_to_static_module(scripted.graph)
return subgraph.wrap_returns(static_module)
def onnxrt_common(subgraph, provider, onnx_filename=None):
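    """Create an ONNX Runtime InferenceSession for the subgraph's ONNX export and
    return a callable that binds torch tensors in place via io_binding."""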
import onnxruntime
assert provider in onnxruntime.get_available_providers()
session = onnxruntime.InferenceSession(
onnx_filename or subgraph.onnx_filename, providers=[provider]
)
input_names = subgraph.input_names
output_names = subgraph.output_names
create_outputs = subgraph.empty_outputs_factory()
is_cpu = subgraph.is_cpu
def _call(*args):
binding = session.io_binding()
args = [a.contiguous() for a in args]
for name, value in zip(input_names, args):
dev = value.device
binding.bind_input(
name,
dev.type,
dev.index or 0,
_NP_DTYPE[value.dtype],
value.size(),
value.data_ptr(),
)
outputs = create_outputs()
for name, value in zip(output_names, outputs):
dev = value.device
binding.bind_output(
name,
dev.type,
dev.index or 0,
_NP_DTYPE[value.dtype],
value.size(),
value.data_ptr(),
)
session.run_with_iobinding(binding)
if is_cpu:
binding.copy_outputs_to_cpu()
return outputs
return subgraph.wrap_returns(_call)
@create_backend
def onnxrt_cpu(subgraph):
return onnxrt_common(subgraph, provider="CPUExecutionProvider")
@create_backend
def onnxrt_cuda(subgraph):
return onnxrt_common(subgraph, provider="CUDAExecutionProvider")
@create_backend
def onnx2tensorrt(subgraph):
if subgraph.will_tensorrt_barf():
# TensorRT fails violently with an abort() on this
return None
return onnxrt_common(subgraph, provider="TensorrtExecutionProvider")
@create_backend
def onnxrt_cpu_numpy(subgraph, provider="CPUExecutionProvider"):
"""Alternate version that integrates via numpy"""
import onnxruntime
assert provider in onnxruntime.get_available_providers()
ort_session = onnxruntime.InferenceSession(
subgraph.onnx_filename, providers=[provider]
)
def to_numpy(x):
try:
return x.numpy()
except RuntimeError:
return x.detach().numpy()
def _call(*args):
res = ort_session.run(
None, {f"i{i}": to_numpy(arg) for i, arg in enumerate(args)}
)
res = [torch.from_numpy(x) for x in res]
return res
return subgraph.wrap_returns(_call)
@create_backend
def onnxrt(subgraph):
if subgraph.is_cuda:
return onnxrt_cuda(subgraph)
else:
return onnxrt_cpu(subgraph)
@functools.lru_cache(None)
def _init_tensorflow():
import tensorflow as tf
# prevent tensorflow from eating all the GPU memory
gpus = tf.config.list_physical_devices("GPU")
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
return tf
@create_backend
def onnx2tf(subgraph):
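    """Convert the subgraph's ONNX export to a TensorFlow SavedModel with onnx-tf,
    wrap it in a jit-compiled tf.function, and exchange tensors via DLPack."""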
import onnx
from onnx_tf.backend import prepare
tf = _init_tensorflow()
filename = subgraph.filename("tensorflow")
input_names = subgraph.input_names
output_names = subgraph.output_names
device = "/CPU:0" if subgraph.is_cpu else f"/GPU:{subgraph.device_index}"
with tf.device(device):
if not os.path.exists(filename):
prepare(onnx.load(subgraph.onnx_filename)).export_graph(filename)
tf_module = tf.saved_model.load(filename)
tf_module = tf.function(tf_module, jit_compile=True)
def run(*args):
args = [a.contiguous() for a in args]
with tf.device(device):
outs = tf_module(
**{
name: tf.experimental.dlpack.from_dlpack(
torch.utils.dlpack.to_dlpack(args[idx])
)
for idx, name in enumerate(input_names)
}
)
return [
torch.utils.dlpack.from_dlpack(
tf.experimental.dlpack.to_dlpack(outs[name])
)
for name in output_names
]
return subgraph.wrap_returns(run)
@create_backend
def taso(subgraph):
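    """Optimize the ONNX export with TASO (run in a separate conda environment),
    then execute the optimized model through ONNX Runtime's CUDA provider."""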
taso_filename = subgraph.filename("taso")
subprocess.check_call(
[
os.path.expanduser("~/conda/envs/taso/bin/python"),
"-c",
"import taso,onnx; onnx.save(taso.export_onnx(taso.optimize("
f"taso.load_onnx('{subgraph.onnx_filename}'))), '{taso_filename}')",
]
)
return onnxrt_common(
subgraph, provider="CUDAExecutionProvider", onnx_filename=taso_filename
)
@create_backend
def ipex(subgraph, **kwargs):
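    """Optimize the model with Intel Extension for PyTorch (float32 or bfloat16
    depending on kwargs["datatype"]), then try to JIT-trace and freeze it; on
    trace failure, return the ipex-optimized eager model."""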
import intel_extension_for_pytorch as ipex
inputs = subgraph.example_inputs
model = subgraph.model
with torch.no_grad():
model.eval()
if kwargs["datatype"] == "bf16":
model = ipex.optimize(model, dtype=torch.bfloat16)
else:
model = ipex.optimize(model, dtype=torch.float32)
try:
traced_model = torch.jit.trace(model, inputs).eval()
traced_model = torch.jit.freeze(traced_model)
return traced_model
except Exception:
log.warning("JIT trace failed during the 'ipex' optimize process.")
return model
def _raise_timeout(signum, frame):
raise TimeoutError()
@create_backend
def fx2trt(subgraph, **kwargs):
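    """Lower the FX graph to TensorRT via torch_tensorrt.fx: normalize and trace
    with acc_tracer, split supported and unsupported ops with TRTSplitter, and
    replace each accelerated submodule with a compiled TRTModule."""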
if subgraph.will_tensorrt_barf():
# TensorRT fails violently with an abort() on this
return None
from torch_tensorrt.fx.fx2trt import InputTensorSpec, TRTInterpreter
from torch_tensorrt.fx.passes.lower_basic_pass import transform_setitem
from torch_tensorrt.fx.tools.trt_splitter import TRTSplitter, TRTSplitterSetting
from torch_tensorrt.fx.tracer.acc_tracer import acc_tracer
from torch_tensorrt.fx.trt_module import TRTModule
from torch_tensorrt.fx.utils import LowerPrecision
from .normalize import normalize_ir
try:
model = subgraph.model
inputs = subgraph.example_inputs
# normalize
model = normalize_ir(model, inputs)
# pass rewrite
model = transform_setitem(model, inputs)
acc_model = acc_tracer.trace(model, inputs)
# Split out unsupported ops
splitter_setting = TRTSplitterSetting()
splitter_setting.use_implicit_batch_dim = False
splitter = TRTSplitter(acc_model, inputs, settings=splitter_setting)
splitter.node_support_preview()
split_mod = splitter()
num_piece = 0
for name, _ in split_mod.named_children():
print(f"graph is split into {name}")
num_piece += 1
        # If the graph module is split into more than 8 pieces, we consider its
        # perf to be poor and fall back to the non-TRT module.
        if num_piece > 8:
            print(
                f"The graph module is split into {num_piece} pieces, which is larger "
                "than the threshold of 8. Falling back to non-TRT module."
            )
return None
if "fp16_mode" in kwargs and kwargs["fp16_mode"]:
precision = LowerPrecision.FP16
else:
precision = LowerPrecision.FP32
def get_submod_inputs(mod, submod, inputs):
acc_inputs = None
def get_input(self, inputs):
nonlocal acc_inputs
acc_inputs = inputs
handle = submod.register_forward_pre_hook(get_input)
mod(*inputs)
handle.remove()
return acc_inputs
for name, _ in split_mod.named_children():
if "_run_on_acc" in name:
submod = getattr(split_mod, name)
# print("acc=",submod.code)
# Get submodule inputs for fx2trt
acc_inputs = get_submod_inputs(split_mod, submod, inputs)
# fx2trt replacement
interp = TRTInterpreter(
submod,
InputTensorSpec.from_tensors(acc_inputs),
explicit_batch_dimension=True,
)
r = interp.run(
max_workspace_size=20 << 30,
lower_precision=precision,
# profiling_verbosity=trt.ProfilingVerbosity.DETAILED, #For profile
)
# For profile
# from fx2trt_oss.fx.tools.trt_profiler_sorted import profile_trt_module
# profile_trt_module("", trt_mod, acc_inputs)
trt_mod = TRTModule(*r)
setattr(split_mod, name, trt_mod)
else:
submod = getattr(split_mod, name)
# print("gpu=",submod.code)
return subgraph.wrap_returns(split_mod)
except Exception:
log.exception("FX2TRT conversion error")
return None
@create_backend
def torch2trt(subgraph):
if subgraph.will_tensorrt_barf():
# TensorRT fails violently with an abort() on this
return None
from torch2trt import torch2trt
inputs = subgraph.example_inputs
trt_mod = torch2trt(
subgraph.model,
inputs,
max_batch_size=len(inputs[0]),
strict_type_constraints=True,
)
return subgraph.wrap_returns(trt_mod)
@create_backend
def tensorrt(subgraph):
if subgraph.will_tensorrt_barf():
# TensorRT fails violently with an abort() on this
return None
model = onnx2tensorrt(subgraph)
if model is None:
model = torch2trt(subgraph)
return model
@create_backend
def onnx2tensorrt_alt(subgraph):
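    """Build a TensorRT engine directly from the subgraph's ONNX export using the
    TensorRT builder API and wrap it in a TRTModule."""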
if subgraph.will_tensorrt_barf():
# TensorRT fails violently with an abort() on this
return None
import tensorrt as trt
from torch.fx.experimental.fx2trt.trt_module import TRTModule
inputs = subgraph.example_inputs
logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(logger)
config = builder.create_builder_config()
assert isinstance(inputs, (list, tuple))
inputs = tuple(inputs)
input_names = subgraph.input_names
output_names = subgraph.output_names
network = builder.create_network(
1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
)
parser = trt.OnnxParser(network, logger)
    with open(subgraph.onnx_filename, "rb") as f:
        success = parser.parse(f.read())
for idx in range(parser.num_errors):
print(parser.get_error(idx))
assert success
config.max_workspace_size = 1 << 25
config.set_flag(trt.BuilderFlag.STRICT_TYPES)
builder.max_batch_size = len(inputs[0])
engine = builder.build_engine(network, config)
assert engine
trt_mod = TRTModule(engine, input_names, output_names)
return subgraph.wrap_returns(trt_mod)
@create_backend
def cudagraphs(subgraph):
model = subgraph.model
inputs = subgraph.example_inputs
assert subgraph.is_cuda
return subgraph.wrap_returns(cudagraphs_inner(model, inputs))
@create_backend
def cudagraphs_ts(subgraph):
assert subgraph.is_cuda
model = subgraph.scripted
inputs = subgraph.example_inputs
# warmup
for _ in range(3):
model(*inputs)
return subgraph.wrap_returns(cudagraphs_inner(model, inputs))
@create_backend
def cudagraphs_ts_ofi(subgraph):
assert subgraph.is_cuda
model = torch.jit.optimize_for_inference(torch.jit.freeze(subgraph.scripted))
inputs = subgraph.example_inputs
# warmup
for _ in range(3):
model(*inputs)
return subgraph.wrap_returns(cudagraphs_inner(model, inputs))
def cudagraphs_inner(model, inputs, copy_outputs=True):
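    """Capture `model` into a CUDA graph using static input buffers; the returned
    runner copies new inputs into those buffers, replays the graph, and (optionally)
    clones the static outputs."""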
assert isinstance(inputs, (list, tuple))
static_inputs = [torch.zeros_like(x) for x in inputs]
# warmup
torch.cuda.synchronize()
stream = torch.cuda.Stream()
stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(stream):
model(*inputs)
stream.synchronize()
torch.cuda.current_stream().wait_stream(stream)
torch.cuda.synchronize()
# record
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, stream=stream):
static_outputs = model(*static_inputs)
if not isinstance(static_outputs, (list, tuple)):
static_outputs = (static_outputs,)
def run(*new_inputs):
assert len(static_inputs) == len(new_inputs)
for dst, src in zip(static_inputs, new_inputs):
dst.copy_(src)
graph.replay()
if copy_outputs:
return [x.clone() for x in static_outputs]
else:
return static_outputs
return run
@create_backend
def aot_autograd(subgraph, **kwargs):
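    """Compile the subgraph with functorch's aot_module_simplified using the given
    fw/bw compilers; the backward compiler is wrapped in torchdynamo's disable() so
    the generated backward pass is not recompiled."""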
def _wrapped_bw_compiler(*args, **kwargs):
# stop TorchDynamo from trying to compile our generated backwards pass
return disable(bw_compiler(*args, **kwargs))
bw_compiler = kwargs.get("bw_compiler") or kwargs["fw_compiler"]
kwargs["bw_compiler"] = _wrapped_bw_compiler
from functorch.compile import aot_module_simplified
from .. import disable
return aot_module_simplified(subgraph.model, **kwargs)
def tvm_compile(jit_mod, example_inputs, log_file=None, **kwargs):
if jit_mod is None:
return None
try:
return tvm_compile_inner(jit_mod, example_inputs, None, log_file, **kwargs)
except Exception as e:
if log_file and os.path.exists(log_file):
os.unlink(log_file)
if isinstance(e, KeyboardInterrupt):
raise
log.exception("tvm error")
return None
@create_backend
def tvm(subgraph):
return subgraph.wrap_returns(
tvm_compile_inner(
subgraph.scripted,
subgraph.example_inputs,
tuning_option=None,
cuda=subgraph.is_cuda,
)
)
@create_backend
def ansor(subgraph):
"""
WARNING: this backend takes hours or days to train and
often produces a slower result than the default schedule.
"""
return subgraph.wrap_returns(
tvm_compile_inner(
subgraph.scripted,
subgraph.example_inputs,
tuning_option="auto_scheduler",
log_file=subgraph.filename("ansor"),
cuda=subgraph.is_cuda,
)
)
@create_backend
def tvm_meta_schedule(subgraph):
return subgraph.wrap_returns(
tvm_compile_inner(
subgraph.scripted,
subgraph.example_inputs,
tuning_option="meta_schedule",
trials=20000,
cuda=subgraph.is_cuda,
)
)
@functools.lru_cache(None)
def llvm_target():
if "avx512" in open("/proc/cpuinfo").read():
return "llvm -mcpu=skylake-avx512"
return "llvm -mcpu=core-avx2"
def tvm_compile_inner(
jit_mod, example_inputs, tuning_option=None, log_file=None, trials=20000, cuda=False
):
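    """Import the TorchScript module into TVM Relay and build it for CPU or CUDA,
    optionally auto-tuning with auto_scheduler (Ansor) or meta_schedule; returns a
    callable that feeds torch tensors to the TVM graph executor."""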
try:
import tvm
from tvm import relay
from tvm.contrib import graph_executor
shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
mod, params = relay.frontend.from_pytorch(jit_mod, shape_list)
if cuda:
dev = tvm.cuda(0)
target = tvm.target.cuda()
else:
dev = tvm.cpu(0)
target = tvm.target.Target(llvm_target())
if tuning_option == "auto_scheduler":
from tvm import auto_scheduler
            if log_file is None:
                # The path-based checks below need a filename, not a file object;
                # remove the placeholder file so it reads as "no tuning records yet".
                log_file = tempfile.NamedTemporaryFile(suffix=".json", delete=False).name
                os.unlink(log_file)
            tasks, task_weights = [], []
            if not os.path.exists(log_file):
                tasks, task_weights = auto_scheduler.extract_tasks(
                    mod["main"], params, target
                )
                for task in tasks:
                    print(task.compute_dag)
                if not tasks:
                    print("No tasks")
if len(tasks) != 0:
tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
if not os.path.exists(log_file):
assert trials > 0
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=trials,
measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
early_stopping=2000,
)
try:
tuner.tune(tune_option)
except Exception:
if os.path.exists(log_file):
os.unlink(log_file)
raise
with auto_scheduler.ApplyHistoryBest(log_file):
with tvm.transform.PassContext(
opt_level=3, config={"relay.backend.use_auto_scheduler": True}
):
lib = relay.build(mod, target=target, params=params)
elif tuning_option == "meta_schedule":
from os import path as osp
from tvm.contrib.torch import optimize_torch
with tempfile.TemporaryDirectory() as work_dir:
if log_file is not None:
assert osp.isdir(
log_file
), "TVM's meta_schedule requires a directory for storing log files."
work_dir = log_file
lib = optimize_torch(
jit_mod,
example_inputs,
                    max_trials_global=trials,
work_dir=work_dir,
target=target,
max_trials_per_task=64,
)
elif tuning_option is None:
# no autotuning (for debugging)
with tvm.transform.PassContext(opt_level=10):
lib = relay.build(mod, target=target, params=params)
else:
            raise NotImplementedError(
                "This tuning option is invalid/not implemented for torchdynamo's TVM-related backend. "
                "The available options are None, 'auto_scheduler', and 'meta_schedule'."
            )
        if tuning_option != "meta_schedule":
m = graph_executor.GraphModule(lib["default"](dev))
def to_torch_tensor(nd_tensor):
"""A helper function to transfer a NDArray to torch.tensor."""
if nd_tensor.dtype == "bool":
                # DLPack does not support booleans, so the array can't go through
                # torch.utils.dlpack.from_dlpack. Work around it by going through
                # numpy, although this adds an extra data copy.
return torch.from_numpy(nd_tensor.numpy())
return torch.utils.dlpack.from_dlpack(nd_tensor.to_dlpack())
def exec_tvm(*args):
args = [a.contiguous() for a in args]
for idx, arg in enumerate(args, 0):
if arg.dim() != 0:
if arg.requires_grad:
arg = arg.detach()
m.set_input(
f"inp_{idx}",
tvm.nd.array(arg.numpy(), dev),
)
m.run()
return [
to_torch_tensor(m.get_output(i)) for i in range(m.get_num_outputs())
]
else:
def exec_tvm(*args):
args = [a.contiguous() for a in args]
return lib(*args)
return exec_tvm
except Exception:
log.exception("tvm error")
return jit_mod # explicit fall back to eager
@functools.lru_cache(None)
def _init_ltc():
try:
import torch._lazy.extract_compiled_graph
from torch._lazy.ts_backend import init as init_ts_backend
        # hopefully changing this line to something like _ltc_init_xla_backend in the
        # future will enable XLA
init_ts_backend()
return torch._lazy
except ModuleNotFoundError as e:
print(f"ltc backend fails. Can not import {e.name}")
raise
def ltc_reuse_graph(gm: torch.fx.GraphModule, example_inputs):
ltc = _init_ltc()
return ltc.extract_compiled_graph.extract_compiled_graph(gm, example_inputs)
def ltc_trivial(gm: torch.fx.GraphModule, example_inputs):
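    """Run a deep copy of the module on the Lazy Tensor 'lazy' device; inputs are
    moved to the lazy device on each call and outputs are moved back to the
    original input device."""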
ltc = _init_ltc()
lazy_model = copy.deepcopy(gm).to(device="lazy")
ltc.extract_compiled_graph.force_lazy_device(lazy_model)
def ltc_model(*inputs):
orig_device = inputs[0].device if len(inputs) > 0 else "cuda"
lazy_inputs = tuple(inp.to(device="lazy") for inp in inputs)
lazy_out = lazy_model(*lazy_inputs)
out = tuple(out.to(device=orig_device) for out in lazy_out)
return out
return ltc_model
def ipex_fp32(gm: torch.fx.GraphModule, example_inputs):
kwargs_ipex = {"datatype": "fp32"}
return BACKENDS["ipex"](gm, example_inputs, **kwargs_ipex)
def ipex_bf16(gm: torch.fx.GraphModule, example_inputs):
kwargs_ipex = {"datatype": "bf16"}
return BACKENDS["ipex"](gm, example_inputs, **kwargs_ipex)
def fx2trt_compiler_fp16(gm: torch.fx.GraphModule, example_inputs):
kwargs_fx2trt = {"fp16_mode": True}
trt_compiled = BACKENDS["fx2trt"](gm, example_inputs, **kwargs_fx2trt)
if trt_compiled is not None:
return trt_compiled
else:
        print(
            "FX2TRT conversion failed on the subgraph. Returning GraphModule forward instead."
        )
return gm.forward
def fx2trt_compiler(gm: torch.fx.GraphModule, example_inputs):
kwargs_fx2trt = {"fp16_mode": False}
trt_compiled = BACKENDS["fx2trt"](gm, example_inputs, **kwargs_fx2trt)
if trt_compiled is not None:
return trt_compiled
else:
        print(
            "FX2TRT conversion failed on the subgraph. Returning GraphModule forward instead."
        )
return gm.forward