benchmarks/gpt_fast/generate.py - platform/external/pytorch - Git at Google

 import dataclasses
 import itertools
 import time
 from typing import Optional, Tuple

 from mixtral_moe_model import ConditionalFeedForward, Transformer as MixtralMoE
 from mixtral_moe_quantize import (
     ConditionalFeedForwardInt8,
     WeightOnlyInt8QuantHandler as MixtralMoEWeightOnlyInt8QuantHandler,
 )
 from model import Transformer as LLaMA
 from quantize import WeightOnlyInt8QuantHandler as LLaMAWeightOnlyInt8QuantHandler

 import torch
 import torch._inductor.config

 torch._inductor.config.coordinate_descent_tuning = True
 torch._inductor.config.triton.unique_kernel_names = True
 torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future
 torch._inductor.config.assert_indirect_indexing = False


 @dataclasses.dataclass
 class GPTModelConfig:
     name: str
     module: type
     mode: Optional[str]
     quantizer: type
     token_per_sec: float
     memory_bandwidth: float
     compilation_time: float


 def device_sync(device):
     if "cuda" in device:
         torch.cuda.synchronize(device)
     elif "cpu" in device:
         pass
     else:
         print(f"device={device} is not yet suppported")


 def multinomial_sample_one_no_sync(
     probs_sort,
 ):  # Does multinomial sampling without a cuda synchronization
     q = torch.empty_like(probs_sort).exponential_(1)
     return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)


 def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
     logits = logits / max(temperature, 1e-5)

     if top_k is not None:
         v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
         pivot = v.select(-1, -1).unsqueeze(-1)
         logits = torch.where(logits < pivot, -float("Inf"), logits)
     probs = torch.nn.functional.softmax(logits, dim=-1)
     return probs


 def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
     probs = logits_to_probs(logits[0, -1], temperature, top_k)
     idx_next = multinomial_sample_one_no_sync(probs)
     return idx_next, probs


 @torch.compile(fullgraph=True)
 def prefill(
     model: torch.nn.Module, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs
 ) -> torch.Tensor:
     # input_pos: [B, S]
     logits = model(x, input_pos)
     return sample(logits, **sampling_kwargs)[0]


 @torch.compile(fullgraph=True, mode="reduce-overhead")
 def decode_one_token(
     model: torch.nn.Module, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs
 ) -> Tuple[torch.Tensor, torch.Tensor]:
     # input_pos: [B, 1]
     assert input_pos.shape[-1] == 1
     logits = model(x, input_pos)
     return sample(logits, **sampling_kwargs)


 def decode_n_tokens(
     model: torch.nn.Module,
     cur_token: torch.Tensor,
     input_pos: torch.Tensor,
     num_new_tokens: int,
     **sampling_kwargs,
 ):
     new_tokens, new_probs = [], []
     for i in range(num_new_tokens):
         with torch.nn.attention.sdpa_kernel(
             torch.nn.attention.SDPBackend.MATH
         ):  # Actually better for Inductor to codegen attention here
             next_token, next_prob = decode_one_token(
                 model, cur_token, input_pos, **sampling_kwargs
             )
             input_pos += 1
             new_tokens.append(next_token.clone())
             new_probs.append(next_prob.clone())
             cur_token = next_token.view(1, -1)

     return new_tokens, new_probs


 @torch.no_grad()
 def generate(
     model: torch.nn.Module, prompt: torch.Tensor, max_new_tokens: int, **sampling_kwargs
 ) -> torch.Tensor:
     device, dtype = prompt.device, prompt.dtype
     T = prompt.size(0)
     T_new = T + max_new_tokens
     max_seq_length = min(T_new, model.config.block_size)

     with torch.device(device):
         model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)

     # create an empty tensor of the expected final shape and fill in the current tokens
     empty = torch.empty(T_new, dtype=dtype, device=device)
     empty[:T] = prompt
     seq = empty
     input_pos = torch.arange(0, T, device=device)

     next_token = prefill(model, prompt.view(1, -1), input_pos, **sampling_kwargs)
     seq[T] = next_token

     input_pos = torch.tensor([T], device=device, dtype=torch.int)

     generated_tokens, _ = decode_n_tokens(
         model, next_token.view(1, -1), input_pos, max_new_tokens - 1, **sampling_kwargs
     )
     seq[T + 1 :] = torch.cat(generated_tokens)
     return seq


 def _load_model(x: GPTModelConfig, device="cuda", precision=torch.bfloat16):
     with torch.device("meta"):
         model = x.module.from_name(x.name)
     model = model.to(dtype=precision)

     if x.mode == "int8":
         print("Using int8 weight-only quantization!")
         model = x.quantizer(model).convert_for_runtime()

     state_dict = model.state_dict()
     for k, v in state_dict.items():
         state_dict[k] = torch.nn.Parameter(
             torch.randn(v.shape, device=device).to(dtype=v.dtype),
             requires_grad=v.requires_grad,
         )
     model.load_state_dict(state_dict, assign=True)
     return model.eval()


 # Only count activated parameters and buffers.
 def _get_model_size(model):
     model_size = 0
     for name, child in model.named_children():
         if not isinstance(child, torch.nn.Embedding):
             model_size += sum(
                 [
                     p.numel() * p.dtype.itemsize
                     for p in itertools.chain(child.parameters(), child.buffers())
                 ]
             )

     # Remove the inactivated experts from the model size if this is mixture of experts
     # architecture, since only activated experts are loaded.
     if hasattr(model.config, "num_experts"):
         config = model.config
         for submodule in model.modules():
             if isinstance(
                 submodule, (ConditionalFeedForward, ConditionalFeedForwardInt8)
             ):
                 model_size -= (
                     sum(
                         [
                             p.numel() * p.dtype.itemsize
                             for p in itertools.chain(
                                 submodule.parameters(), child.buffers()
                             )
                         ]
                     )
                     * (config.num_experts - config.num_activated_experts)
                     / config.num_experts
                 )

     return model_size


 def run_experiment(
     x: GPTModelConfig,
     num_samples: int = 5,
     max_new_tokens: int = 200,
     top_k: int = 200,
     temperature: float = 0.8,
     device: str = "cuda",
 ) -> None:
     print(f"Loading model {x.name}")
     t0 = time.time()
     model = _load_model(x)
     device_sync(device=device)  # MKG
     print(f"Time to load model: {time.time() - t0:.02f} seconds")

     prompt = torch.tensor(
         [1, 15043, 29892, 590, 1024, 338], device=device, dtype=torch.int32
     )
     prompt_length = prompt.size(0)

     torch.manual_seed(1234)
     model_size = _get_model_size(model)

     aggregate_metrics = {"tokens_per_sec": [], "memory_bandwidth": []}
     start = -1
     compilation_time = None

     for i in range(start, num_samples):
         device_sync(device=device)  # MKG

         t0 = time.perf_counter()
         y = generate(
             model, prompt, max_new_tokens, temperature=temperature, top_k=top_k
         )

         if i == -1:
             compilation_time = time.perf_counter() - t0
             print(f"Compilation time: {compilation_time:.2f} seconds")
             continue

         device_sync(device=device)  # MKG
         t = time.perf_counter() - t0
         tokens_generated = y.size(0) - prompt_length
         tokens_sec = tokens_generated / t
         aggregate_metrics["tokens_per_sec"].append(tokens_sec)
         aggregate_metrics["memory_bandwidth"].append(model_size * tokens_sec / 1e9)

     token_per_sec = torch.mean(torch.tensor(aggregate_metrics["tokens_per_sec"])).item()
     memory_bandwidth = torch.mean(
         torch.tensor(aggregate_metrics["memory_bandwidth"])
     ).item()
     print(f"Average tokens/sec: {token_per_sec:.2f} tokens/sec")
     print(f"Average bandwidth achieved: {memory_bandwidth:.02f} GB/s")
     print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
     return token_per_sec, memory_bandwidth, compilation_time


 # token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
 def run_llama2_7b_bf16(device: str = "cuda"):
     from benchmark import Experiment

     model = GPTModelConfig(
         "Llama-2-7b-chat-hf",
         LLaMA,
         "bfloat16",
         LLaMAWeightOnlyInt8QuantHandler,
         94,
         1253,
         162,
     )
     token_per_sec, memory_bandwidth, compilation_time = run_experiment(model)
     return [
         Experiment(
             model.name,
             "token_per_sec",
             model.token_per_sec,
             f"{token_per_sec:.02f}",
             model.mode,
             device,
             True,
         ),
         Experiment(
             model.name,
             "memory_bandwidth(GB/s)",
             model.memory_bandwidth,
             f"{memory_bandwidth:.02f}",
             model.mode,
             device,
             True,
         ),
         Experiment(
             model.name,
             "compilation_time(s)",
             model.compilation_time,
             f"{compilation_time:.02f}",
             model.mode,
             device,
             True,
         ),
     ]


 # token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
 def run_llama2_7b_int8(device: str = "cuda"):
     from benchmark import Experiment

     model = GPTModelConfig(
         "Llama-2-7b-chat-hf",
         LLaMA,
         "int8",
         LLaMAWeightOnlyInt8QuantHandler,
         144,
         957,
         172,
     )
     token_per_sec, memory_bandwidth, compilation_time = run_experiment(model)
     return [
         Experiment(
             model.name,
             "token_per_sec",
             model.token_per_sec,
             f"{token_per_sec:.02f}",
             model.mode,
             device,
             True,
         ),
         Experiment(
             model.name,
             "memory_bandwidth(GB/s)",
             model.memory_bandwidth,
             f"{memory_bandwidth:.02f}",
             model.mode,
             device,
             True,
         ),
         Experiment(
             model.name,
             "compilation_time(s)",
             model.compilation_time,
             f"{compilation_time:.02f}",
             model.mode,
             device,
             True,
         ),
     ]


 # token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
 def run_mixtral_8x7b_int8(device: str = "cuda"):
     from benchmark import Experiment

     # We reduced the original number of layers from 32 to 16 to adapt CI memory limitation.
     model = GPTModelConfig(
         "Mixtral-8x7B-v0.1",
         MixtralMoE,
         "int8",
         MixtralMoEWeightOnlyInt8QuantHandler,
         175,
         1280,
         162,
     )
     token_per_sec, memory_bandwidth, compilation_time = run_experiment(model)
     return [
         Experiment(
             model.name,
             "token_per_sec",
             model.token_per_sec,
             f"{token_per_sec:.02f}",
             model.mode,
             device,
             True,
         ),
         Experiment(
             model.name,
             "memory_bandwidth(GB/s)",
             model.memory_bandwidth,
             f"{memory_bandwidth:.02f}",
             model.mode,
             device,
             True,
         ),
         Experiment(
             model.name,
             "compilation_time(s)",
             model.compilation_time,
             f"{compilation_time:.02f}",
             model.mode,
             device,
             True,
         ),
     ]
	import dataclasses
	import itertools
	import time
	from typing import Optional, Tuple

	from mixtral_moe_model import ConditionalFeedForward, Transformer as MixtralMoE
	from mixtral_moe_quantize import (
	ConditionalFeedForwardInt8,
	WeightOnlyInt8QuantHandler as MixtralMoEWeightOnlyInt8QuantHandler,
	)
	from model import Transformer as LLaMA
	from quantize import WeightOnlyInt8QuantHandler as LLaMAWeightOnlyInt8QuantHandler

	import torch
	import torch._inductor.config

	torch._inductor.config.coordinate_descent_tuning = True
	torch._inductor.config.triton.unique_kernel_names = True
	torch._inductor.config.fx_graph_cache = True # Experimental feature to reduce compilation times, will be on by default in future
	torch._inductor.config.assert_indirect_indexing = False


	@dataclasses.dataclass
	class GPTModelConfig:
	name: str
	module: type
	mode: Optional[str]
	quantizer: type
	token_per_sec: float
	memory_bandwidth: float
	compilation_time: float


	def device_sync(device):
	if "cuda" in device:
	torch.cuda.synchronize(device)
	elif "cpu" in device:
	pass
	else:
	print(f"device={device} is not yet suppported")


	def multinomial_sample_one_no_sync(
	probs_sort,
	): # Does multinomial sampling without a cuda synchronization
	q = torch.empty_like(probs_sort).exponential_(1)
	return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)


	def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
	logits = logits / max(temperature, 1e-5)

	if top_k is not None:
	v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
	pivot = v.select(-1, -1).unsqueeze(-1)
	logits = torch.where(logits < pivot, -float("Inf"), logits)
	probs = torch.nn.functional.softmax(logits, dim=-1)
	return probs


	def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
	probs = logits_to_probs(logits[0, -1], temperature, top_k)
	idx_next = multinomial_sample_one_no_sync(probs)
	return idx_next, probs


	@torch.compile(fullgraph=True)
	def prefill(
	model: torch.nn.Module, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs
	) -> torch.Tensor:
	# input_pos: [B, S]
	logits = model(x, input_pos)
	return sample(logits, **sampling_kwargs)[0]


	@torch.compile(fullgraph=True, mode="reduce-overhead")
	def decode_one_token(
	model: torch.nn.Module, x: torch.Tensor, input_pos: torch.Tensor, **sampling_kwargs
	) -> Tuple[torch.Tensor, torch.Tensor]:
	# input_pos: [B, 1]
	assert input_pos.shape[-1] == 1
	logits = model(x, input_pos)
	return sample(logits, **sampling_kwargs)


	def decode_n_tokens(
	model: torch.nn.Module,
	cur_token: torch.Tensor,
	input_pos: torch.Tensor,
	num_new_tokens: int,
	**sampling_kwargs,
	):
	new_tokens, new_probs = [], []
	for i in range(num_new_tokens):
	with torch.nn.attention.sdpa_kernel(
	torch.nn.attention.SDPBackend.MATH
	): # Actually better for Inductor to codegen attention here
	next_token, next_prob = decode_one_token(
	model, cur_token, input_pos, **sampling_kwargs
	)
	input_pos += 1
	new_tokens.append(next_token.clone())
	new_probs.append(next_prob.clone())
	cur_token = next_token.view(1, -1)

	return new_tokens, new_probs


	@torch.no_grad()
	def generate(
	model: torch.nn.Module, prompt: torch.Tensor, max_new_tokens: int, **sampling_kwargs
	) -> torch.Tensor:
	device, dtype = prompt.device, prompt.dtype
	T = prompt.size(0)
	T_new = T + max_new_tokens
	max_seq_length = min(T_new, model.config.block_size)

	with torch.device(device):
	model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)

	# create an empty tensor of the expected final shape and fill in the current tokens
	empty = torch.empty(T_new, dtype=dtype, device=device)
	empty[:T] = prompt
	seq = empty
	input_pos = torch.arange(0, T, device=device)

	next_token = prefill(model, prompt.view(1, -1), input_pos, **sampling_kwargs)
	seq[T] = next_token

	input_pos = torch.tensor([T], device=device, dtype=torch.int)

	generated_tokens, _ = decode_n_tokens(
	model, next_token.view(1, -1), input_pos, max_new_tokens - 1, **sampling_kwargs
	)
	seq[T + 1 :] = torch.cat(generated_tokens)
	return seq


	def _load_model(x: GPTModelConfig, device="cuda", precision=torch.bfloat16):
	with torch.device("meta"):
	model = x.module.from_name(x.name)
	model = model.to(dtype=precision)

	if x.mode == "int8":
	print("Using int8 weight-only quantization!")
	model = x.quantizer(model).convert_for_runtime()

	state_dict = model.state_dict()
	for k, v in state_dict.items():
	state_dict[k] = torch.nn.Parameter(
	torch.randn(v.shape, device=device).to(dtype=v.dtype),
	requires_grad=v.requires_grad,
	)
	model.load_state_dict(state_dict, assign=True)
	return model.eval()


	# Only count activated parameters and buffers.
	def _get_model_size(model):
	model_size = 0
	for name, child in model.named_children():
	if not isinstance(child, torch.nn.Embedding):
	model_size += sum(
	[
	p.numel() * p.dtype.itemsize
	for p in itertools.chain(child.parameters(), child.buffers())
	]
	)

	# Remove the inactivated experts from the model size if this is mixture of experts
	# architecture, since only activated experts are loaded.
	if hasattr(model.config, "num_experts"):
	config = model.config
	for submodule in model.modules():
	if isinstance(
	submodule, (ConditionalFeedForward, ConditionalFeedForwardInt8)
	):
	model_size -= (
	sum(
	[
	p.numel() * p.dtype.itemsize
	for p in itertools.chain(
	submodule.parameters(), child.buffers()
	)
	]
	)
	* (config.num_experts - config.num_activated_experts)
	/ config.num_experts
	)

	return model_size


	def run_experiment(
	x: GPTModelConfig,
	num_samples: int = 5,
	max_new_tokens: int = 200,
	top_k: int = 200,
	temperature: float = 0.8,
	device: str = "cuda",
	) -> None:
	print(f"Loading model {x.name}")
	t0 = time.time()
	model = _load_model(x)
	device_sync(device=device) # MKG
	print(f"Time to load model: {time.time() - t0:.02f} seconds")

	prompt = torch.tensor(
	[1, 15043, 29892, 590, 1024, 338], device=device, dtype=torch.int32
	)
	prompt_length = prompt.size(0)

	torch.manual_seed(1234)
	model_size = _get_model_size(model)

	aggregate_metrics = {"tokens_per_sec": [], "memory_bandwidth": []}
	start = -1
	compilation_time = None

	for i in range(start, num_samples):
	device_sync(device=device) # MKG

	t0 = time.perf_counter()
	y = generate(
	model, prompt, max_new_tokens, temperature=temperature, top_k=top_k
	)

	if i == -1:
	compilation_time = time.perf_counter() - t0
	print(f"Compilation time: {compilation_time:.2f} seconds")
	continue

	device_sync(device=device) # MKG
	t = time.perf_counter() - t0
	tokens_generated = y.size(0) - prompt_length
	tokens_sec = tokens_generated / t
	aggregate_metrics["tokens_per_sec"].append(tokens_sec)
	aggregate_metrics["memory_bandwidth"].append(model_size * tokens_sec / 1e9)

	token_per_sec = torch.mean(torch.tensor(aggregate_metrics["tokens_per_sec"])).item()
	memory_bandwidth = torch.mean(
	torch.tensor(aggregate_metrics["memory_bandwidth"])
	).item()
	print(f"Average tokens/sec: {token_per_sec:.2f} tokens/sec")
	print(f"Average bandwidth achieved: {memory_bandwidth:.02f} GB/s")
	print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
	return token_per_sec, memory_bandwidth, compilation_time


	# token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
	def run_llama2_7b_bf16(device: str = "cuda"):
	from benchmark import Experiment

	model = GPTModelConfig(
	"Llama-2-7b-chat-hf",
	LLaMA,
	"bfloat16",
	LLaMAWeightOnlyInt8QuantHandler,
	94,
	1253,
	162,
	)
	token_per_sec, memory_bandwidth, compilation_time = run_experiment(model)
	return [
	Experiment(
	model.name,
	"token_per_sec",
	model.token_per_sec,
	f"{token_per_sec:.02f}",
	model.mode,
	device,
	True,
	),
	Experiment(
	model.name,
	"memory_bandwidth(GB/s)",
	model.memory_bandwidth,
	f"{memory_bandwidth:.02f}",
	model.mode,
	device,
	True,
	),
	Experiment(
	model.name,
	"compilation_time(s)",
	model.compilation_time,
	f"{compilation_time:.02f}",
	model.mode,
	device,
	True,
	),
	]


	# token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
	def run_llama2_7b_int8(device: str = "cuda"):
	from benchmark import Experiment

	model = GPTModelConfig(
	"Llama-2-7b-chat-hf",
	LLaMA,
	"int8",
	LLaMAWeightOnlyInt8QuantHandler,
	144,
	957,
	172,
	)
	token_per_sec, memory_bandwidth, compilation_time = run_experiment(model)
	return [
	Experiment(
	model.name,
	"token_per_sec",
	model.token_per_sec,
	f"{token_per_sec:.02f}",
	model.mode,
	device,
	True,
	),
	Experiment(
	model.name,
	"memory_bandwidth(GB/s)",
	model.memory_bandwidth,
	f"{memory_bandwidth:.02f}",
	model.mode,
	device,
	True,
	),
	Experiment(
	model.name,
	"compilation_time(s)",
	model.compilation_time,
	f"{compilation_time:.02f}",
	model.mode,
	device,
	True,
	),
	]


	# token_per_sec and memory_bandwidth target numbers are for A100-40GB, which are different from the typical A100-80GB.
	def run_mixtral_8x7b_int8(device: str = "cuda"):
	from benchmark import Experiment

	# We reduced the original number of layers from 32 to 16 to adapt CI memory limitation.
	model = GPTModelConfig(
	"Mixtral-8x7B-v0.1",
	MixtralMoE,
	"int8",
	MixtralMoEWeightOnlyInt8QuantHandler,
	175,
	1280,
	162,
	)
	token_per_sec, memory_bandwidth, compilation_time = run_experiment(model)
	return [
	Experiment(
	model.name,
	"token_per_sec",
	model.token_per_sec,
	f"{token_per_sec:.02f}",
	model.mode,
	device,
	True,
	),
	Experiment(
	model.name,
	"memory_bandwidth(GB/s)",
	model.memory_bandwidth,
	f"{memory_bandwidth:.02f}",
	model.mode,
	device,
	True,
	),
	Experiment(
	model.name,
	"compilation_time(s)",
	model.compilation_time,
	f"{compilation_time:.02f}",
	model.mode,
	device,
	True,
	),
	]