import argparse
import itertools
import os

# from . import conv  # noqa: F401
# from . import normalization  # noqa: F401
# from . import pooling  # noqa: F401
from . import (  # noqa: F401
    attention,
    benchmark,
    broadcast,
    concat,
    elementwise,
    matmul,
    reduction,
    rnn_eltwise,
    softmax,
    swish,
    tensor_engine,
)
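# The submodules above are imported only for their side effects: importing
# each one registers its benchmark classes (collected later through
# benchmark.benchmark_classes), which is why the otherwise-unused names
# carry `noqa: F401`.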


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="""Benchmark operators in specific shapes.
Works only with Python 3. A few examples:
  * benchmark.py: runs all benchmarks with their default configs.
  * benchmark.py reduce: runs all default configs of every benchmark whose
    name contains 'reduce'.
  * benchmark.py layernorm_fwd_cpu_128_32_128_128: runs that particular
    benchmark in that config.""",
    )
    parser.add_argument(
        "benchmark_names",
        type=str,
        default=None,
        nargs="*",
        help="name of the benchmark to run",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cpu,cuda",
        help="a comma separated list of device names",
    )
    parser.add_argument(
        "--mode",
        type=str,
        default="fwd,both",
        help="a comma separated list of running modes",
    )
    parser.add_argument(
        "--dtype",
        type=str,
        default="float32",
        help="a comma separated list of data types: {float32[default], float16}",
    )
    parser.add_argument(
        "--input-iter",
        type=str,
        default=None,
        help="a comma separated list of tensor dimension specs, each a start, "
        "stop, and increment where the increment is a constant, 'pow2', or "
        "'pow2+1': {start:stop:inc,start:stop:pow2}",
    )
    parser.add_argument(
        "--engine",
        type=str,
        default="pt",
        help="the underlying tensor engine; only 'pt' for now",
    )
    parser.add_argument(
        "--jit-mode",
        "--jit_mode",
        type=str,
        default="trace",
        help="the jit mode to use: one of {trace, none}",
    )
    parser.add_argument(
        "--cuda-pointwise-loop-levels",
        "--cuda_pointwise_loop_levels",
        type=int,
        default=None,
        help="number of loop levels for CUDA pointwise operations: 2 or 3",
    )
    parser.add_argument(
        "--cuda-pointwise-block-count",
        "--cuda_pointwise_block_count",
        type=int,
        default=None,
        help="number of blocks for CUDA pointwise operations",
    )
    parser.add_argument(
        "--cuda-pointwise-block-size",
        "--cuda_pointwise_block_size",
        type=int,
        default=None,
        help="block size for CUDA pointwise operations",
    )
    parser.add_argument(
        "--cuda-fuser",
        "--cuda_fuser",
        type=str,
        default="te",
        help="the CUDA fuser backend to use: one of {te, nvf, old, none}",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="stdout",
        help="the output format of the benchmark run: {stdout[default], json}",
    )
    parser.add_argument(
        "--print-ir",
        action="store_true",
        help="print the IR graph of the fusion",
    )
    parser.add_argument(
        "--print-kernel",
        action="store_true",
        help="print generated kernel(s)",
    )
    parser.add_argument(
        "--no-dynamic-shape",
        action="store_true",
        help="disable shape randomization in dynamic benchmarks",
    )
    parser.add_argument(
        "--cpu-fusion",
        "--cpu_fusion",
        default=False,
        action="store_true",
        help="enable CPU fusion",
    )
    parser.add_argument(
        "--cat-wo-conditionals",
        "--cat_wo_conditionals",
        default=False,
        action="store_true",
        help="enable cat (concatenation) without conditionals",
    )

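    # A few illustrative invocations combining the flags above:
    #   benchmark.py --device cpu --mode fwd reduce
    #   benchmark.py --device cuda --cuda-fuser nvf --dtype float32,float16
    #   benchmark.py --input-iter 64:256:pow2 <benchmark_name>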
    args = parser.parse_args()

    # torch is needed on every path below; import it once here, after argument
    # parsing, so that `--help` does not pay the import cost.
    import torch

    if args.cuda_fuser == "te":
        # TensorExpr fuser
        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_texpr_fuser_enabled(True)
        torch._C._jit_override_can_fuse_on_gpu(True)
        # despite the name, passing a value also sets the flag
        torch._C._get_graph_executor_optimize(True)
    elif args.cuda_fuser == "old":
        # legacy fuser
        torch._C._jit_set_profiling_executor(False)
        torch._C._jit_set_texpr_fuser_enabled(False)
        torch._C._jit_override_can_fuse_on_gpu(True)
    elif args.cuda_fuser == "nvf":
        # nvFuser
        torch._C._jit_set_profiling_executor(True)
        torch._C._jit_set_texpr_fuser_enabled(False)
        torch._C._jit_set_nvfuser_enabled(True)
        torch._C._get_graph_executor_optimize(True)
    elif args.cuda_fuser != "none":
        raise ValueError(f"Undefined fuser: {args.cuda_fuser}")

    torch._C._jit_override_can_fuse_on_cpu(args.cpu_fusion)
    torch._C._jit_cat_wo_conditionals(args.cat_wo_conditionals)

    def set_global_threads(num_threads):
        # pin every backing runtime's thread pool to the same size
        os.environ["OMP_NUM_THREADS"] = str(num_threads)
        os.environ["MKL_NUM_THREADS"] = str(num_threads)
        os.environ["TVM_NUM_THREADS"] = str(num_threads)
        os.environ["NNC_NUM_THREADS"] = str(num_threads)

    devices = args.device.split(",")
    # accept 'gpu' as an alias for the 'cuda' device
    devices = ["cuda" if device == "gpu" else device for device in devices]
    cpu_count = 0
    for index, device in enumerate(devices):
        if device.startswith("cpu"):
            cpu_count += 1
            if cpu_count > 1:
                raise ValueError(
                    f"more than one CPU device is not allowed: {cpu_count}"
                )
            if device == "cpu":
                continue
            num_threads_str = device[3:]
            try:
                # see if the device is in 'cpu1' or 'cpu4' format
                num_threads = int(num_threads_str)
                set_global_threads(num_threads)
                devices[index] = "cpu"
            except ValueError:
                continue
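    # e.g. --device cpu4,gpu yields devices == ["cpu", "cuda"], with the CPU
    # thread pools pinned to 4 threads via set_global_threads above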

    modes = args.mode.split(",")

    datatypes = args.dtype.split(",")
    for index, dtype in enumerate(datatypes):
        # default to None so an unknown dtype name trips the check below
        # instead of raising inside getattr
        datatypes[index] = getattr(torch, dtype, None)
        if not datatypes[index]:
            raise AttributeError(f"DataType: {dtype} is not valid!")
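    # e.g. --dtype float32,float16 becomes [torch.float32, torch.float16]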

    tensor_engine.set_engine_mode(args.engine)

    def run_default_configs(bench_cls, allow_skip=True):
        # sweep the full cross product of modes x devices x dtypes x each of
        # the benchmark class's default configs
        for mode, device, dtype, config in itertools.product(
            modes, devices, datatypes, bench_cls.default_configs()
        ):
            bench = bench_cls(mode, device, dtype, *config)
            bench.output_type = args.output
            bench.jit_mode = args.jit_mode
            if not bench.is_supported():
                if allow_skip:
                    continue
                raise ValueError(
                    f"attempted to run an unsupported benchmark: {bench.desc()}"
                )
            bench.run(args)

    def run_with_input_iter(bench_cls, input_iter, allow_skip=True):
        tensor_dim_specs = input_iter.split(",")
        tensor_dim_specs = [dim.split(":") for dim in tensor_dim_specs]

        configs = []
        for start, stop, inc in tensor_dim_specs:
            dim_list = []
            if inc == "pow2":
                # double at every step: start, 2*start, 4*start, ... up to stop
                curr = int(start)
                while curr <= int(stop):
                    dim_list.append(curr)
                    curr <<= 1
            elif inc == "pow2+1":
                # step via curr -> 2 * (curr - 1) + 1, staying one above a
                # power of two (e.g. 3, 5, 9, 17, ...); start must be > 1
                # for the loop to make progress
                curr = int(start)
                while curr <= int(stop):
                    dim_list.append(curr)
                    curr -= 1
                    curr <<= 1
                    curr += 1
            else:
                # constant integer increment; the bound is padded by one step
                # so that stop itself is included when a step lands on it
                dim_list = list(range(int(start), int(stop) + int(inc), int(inc)))
            configs.append(dim_list)
        configs = itertools.product(*configs)
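        # e.g. --input-iter "64:256:pow2,1:3:1" expands to the dimension grid
        # [64, 128, 256] x [1, 2, 3]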

        for mode, device, dtype, config in itertools.product(
            modes, devices, datatypes, list(configs)
        ):
            bench = bench_cls(mode, device, dtype, *config)
            bench.output_type = args.output
            bench.jit_mode = args.jit_mode
            if not bench.is_supported():
                if allow_skip:
                    continue
                raise ValueError(
                    f"attempted to run an unsupported benchmark: {bench.desc()}"
                )
            bench.run(args)
    benchmark_classes = benchmark.benchmark_classes
    if not args.benchmark_names:
        # by default, run all the benchmarks
        for benchmark_cls in benchmark_classes:
            run_default_configs(benchmark_cls, allow_skip=True)
    else:
        for name in args.benchmark_names:
            # if the name is a substring of a benchmark class's module name,
            # run all the benchmarks for that class
            match_class_name = False
            for bench_cls in benchmark_classes:
                if name in bench_cls.module():
                    match_class_name = True
                    if (args.input_iter is not None) and bench_cls.input_iterable():
                        run_with_input_iter(bench_cls, args.input_iter, allow_skip=True)
                    else:
                        if args.input_iter is not None:
                            print(
                                f"WARNING: benchmark class {name} is incompatible "
                                "with the input_iter arg; running default configs"
                            )
                        run_default_configs(bench_cls, allow_skip=True)

            if match_class_name:
                continue

            # if the name doesn't match a class module, parse it as
            # '<module>_<mode>_<device>_<dim0>_<dim1>_...' and run that single
            # config, e.g. 'layernorm_fwd_cpu_128_32_128_128'
            match_class_name = False
            for bench_cls in benchmark_classes:
                cls_module = bench_cls.module()
                if name.startswith(cls_module):
                    match_class_name = True
                    if name[len(cls_module)] != "_":
                        raise ValueError(f"invalid name: {name}")
                    config_str = name[(len(cls_module) + 1) :]
                    config = config_str.split("_")
                    if len(config) < 2:
                        raise ValueError(f"invalid config: {config}")
                    mode, device = config[0:2]
                    # TODO: make sure virtual devices such as 'cpu1' and 'cpu4' are supported.
                    if mode not in ["fwd", "both"]:
                        raise ValueError(f"invalid mode: {mode}")
                    for i, entry in enumerate(config):
                        # convert the numeric fields (tensor dimensions) to ints
                        try:
                            config[i] = int(entry)
                        except ValueError:
                            pass
                    # TODO: output dtype in the config and parse it back from the str
                    bench = bench_cls(config[0], config[1], torch.float32, *config[2:])
                    bench.jit_mode = args.jit_mode
                    bench.output_type = args.output
                    bench.run(args)

            if not match_class_name:
                available_classes = ", ".join(
                    bench_cls.module() for bench_cls in benchmark_classes
                )
                raise ValueError(
                    f"invalid name: {name}\nAvailable benchmark classes:\n{available_classes}"
                )


if __name__ == "__main__":
    main()