benchmarks/sparse/benchmark_semi_structured_sparsity.py - platform/external/pytorch - Git at Google

 import argparse
 import random

 import pandas as pd
 from tqdm import tqdm

 import torch
 import torch.utils.benchmark as benchmark
 from torch import nn
 from torch.sparse import SparseSemiStructuredTensor, to_sparse_semi_structured


 torch.set_printoptions(
     precision=2,
     threshold=None,
     edgeitems=16,
     linewidth=480,
     profile=None,
     sci_mode=False,
 )


 # helper model definition for pruner
 class Model(nn.Module):
     def __init__(self, m, k, dtype=None):
         super().__init__()
         # transposed so reversed
         self.linear = nn.Linear(k, m)

     def forward(self, x):
         return self.linear(x)


 def rand_sparse_semi_structured_mask(
     r, c, dtype=torch.float16, device="cuda", choice=None
 ):
     """
     This function returns a 1:2 sparse matrix of size (r, c).
     Note that this means this matrix will also be 2:4 and 4:8 sparse as well.
     """

     choices = [[0, 1], [1, 0]]
     mask_entries = [choice or random.choice(choices) for i in range(r * c // 2)]

     return (
         torch.tensor(mask_entries, dtype=dtype, device=device)
         .reshape(r, c)
         .contiguous()
     )


 def test_linear(m, k, n, dtype, contiguous, backend):
     SparseSemiStructuredTensor._FORCE_CUTLASS = backend == "cutlass"
     mask = rand_sparse_semi_structured_mask(m, k, dtype=dtype)
     sparse_weight = torch.rand(m, k).to(dtype).cuda() * mask
     input_tensor = torch.zeros(n, k).to(dtype).cuda()
     model = Model(m, k).to(dtype).cuda().eval()

     dense_measurement = benchmark.Timer(
         stmt="model(input_tensor)",
         globals=locals(),
     ).blocked_autorange()

     dense_output = model(input_tensor)
     print(dense_output.shape)

     # sparsify weights
     model.linear.weight = nn.Parameter(
         to_sparse_semi_structured(
             sparse_weight,
         )
     )

     sparse_output = model(input_tensor)
     print(sparse_output.shape)

     sparse_measurement = benchmark.Timer(
         stmt="model(input_tensor)",
         globals=locals(),
     ).blocked_autorange()

     correct = torch.allclose(dense_output, sparse_output, rtol=1e-3, atol=1e-3)

     return {
         "test_function": "linear",
         "m": m,
         "k": k,
         "n": n,
         "dtype": str(dtype),
         "backend": backend,
         "sparse_latency (ms)": sparse_measurement.median * 1000,
         "dense_latency (ms)": dense_measurement.median * 1000,
         "speedup (d/s)": dense_measurement.median / sparse_measurement.median,
         "correct": correct,
         "contiguous": sparse_output.is_contiguous(),
     }


 def test_tensor(m, k, n, dtype, contiguous, backend):
     A = rand_sparse_semi_structured_mask(m, k, dtype=dtype)
     B = torch.zeros(k, n).to(dtype).cuda()
     bias = torch.rand(n).to(dtype).cuda()

     sA = to_sparse_semi_structured(A)

     # torch.mm calculation
     if dtype is not torch.int8:
         dense_output = torch.mm(A, B)

         dense_measurement = benchmark.Timer(
             stmt="torch.mm(A, B)",
             globals=locals(),
         ).blocked_autorange()

     else:
         print("int8 baseline not supported")
         dense_output = torch.mm(sA, B)

         dense_measurement = benchmark.Timer(
             stmt="torch.mm(sA, B)",
             globals=locals(),
         ).blocked_autorange()

     sparse_output = torch.mm(sA, B)
     sparse_measurement = benchmark.Timer(
         stmt="torch.mm(sA, B)",
         globals=locals(),
     ).blocked_autorange()

     correct = torch.allclose(dense_output, sparse_output, rtol=1e-3, atol=1e-3)

     return {
         "test_function": "tensor",
         "m": m,
         "k": k,
         "n": n,
         "dtype": str(dtype),
         "backend": backend,
         "sparse_latency (ms)": sparse_measurement.median * 1000,
         "dense_latency (ms)": dense_measurement.median * 1000,
         "speedup (d/s)": dense_measurement.median / sparse_measurement.median,
         "correct": correct,
         "contiguous": sparse_output.is_contiguous(),
     }


 if __name__ == "__main__":
     dtype_lookup = {
         "int8": torch.int8,
         "fp16": torch.float16,
         "bf16": torch.bfloat16,
         "fp32": torch.float32,
     }

     parser = argparse.ArgumentParser(description="Semi-Structured Sparsity Benchmarks")
     parser.add_argument(
         "--mode",
         type=str,
         choices=[
             "nvidia-bert",
             "nvidia-fixed-k",
             "nvidia-fixed-mn",
         ],
     )
     parser.add_argument(
         "--dtype",
         type=str,
         choices=dtype_lookup.keys(),
         default="fp16",
     )
     parser.add_argument(
         "--backend", type=str, choices=["cutlass", "cusparselt"], default="cusparselt"
     )
     parser.add_argument("-contiguous", action="store_true")
     parser.add_argument("-e2e", action="store_true")
     parser.add_argument("-save", action="store_true")
     args = parser.parse_args()

     if args.e2e:
         eval_fn = test_linear
     else:
         eval_fn = test_tensor

     print(f"Started benchmark: {args.mode} | dtype: {args.dtype}")
     dtype = dtype_lookup[args.dtype]

     if args.mode == "nvidia-bert":
         bert_shapes = [
             (3072, 1024, 16384),
             (4096, 1024, 16384),
             (1024, 1024, 16384),
             (1024, 4096, 16384),
         ]
         results = (
             eval_fn(m, k, n, dtype, args.contiguous, args.backend)
             for (m, k, n) in tqdm(bert_shapes)
         )

     elif args.mode == "nvidia-fixed-k":
         mn_vals = [
             3072,
             4096,
             5120,
             6144,
             7168,
             8192,
             9216,
             10240,
             11264,
             12288,
             13312,
             14336,
             15360,
             16384,
             17408,
             18432,
             19456,
             20480,
         ]
         results = (
             eval_fn(mn, 10240, mn, dtype, args.contiguous, args.backend)
             for mn in tqdm(mn_vals)
         )

     elif args.mode == "nvidia-fixed-mn":
         k_vals = [
             2560,
             3840,
             5120,
             6400,
             7680,
             8960,
             10240,
             11520,
             12800,
             14080,
             15360,
             16640,
             17920,
             19200,
             20480,
         ]
         results = (
             eval_fn(10240, k, 10240, dtype, args.contiguous, args.backend)
             for k in tqdm(k_vals)
         )

     df = pd.DataFrame.from_records(results)
     if args.save:
         save_file = f"{args.mode}_{args.dtype}_{args.backend}.csv"
         df.to_csv(save_file)
         print(f"Finished benchmark: {args.mode} saved results to {save_file}")
     print(df)
	import argparse
	import random

	import pandas as pd
	from tqdm import tqdm

	import torch
	import torch.utils.benchmark as benchmark
	from torch import nn
	from torch.sparse import SparseSemiStructuredTensor, to_sparse_semi_structured


	torch.set_printoptions(
	precision=2,
	threshold=None,
	edgeitems=16,
	linewidth=480,
	profile=None,
	sci_mode=False,
	)


	# helper model definition for pruner
	class Model(nn.Module):
	def __init__(self, m, k, dtype=None):
	super().__init__()
	# transposed so reversed
	self.linear = nn.Linear(k, m)

	def forward(self, x):
	return self.linear(x)


	def rand_sparse_semi_structured_mask(
	r, c, dtype=torch.float16, device="cuda", choice=None
	):
	"""
	This function returns a 1:2 sparse matrix of size (r, c).
	Note that this means this matrix will also be 2:4 and 4:8 sparse as well.
	"""

	choices = [[0, 1], [1, 0]]
	mask_entries = [choice or random.choice(choices) for i in range(r * c // 2)]

	return (
	torch.tensor(mask_entries, dtype=dtype, device=device)
	.reshape(r, c)
	.contiguous()
	)


	def test_linear(m, k, n, dtype, contiguous, backend):
	SparseSemiStructuredTensor._FORCE_CUTLASS = backend == "cutlass"
	mask = rand_sparse_semi_structured_mask(m, k, dtype=dtype)
	sparse_weight = torch.rand(m, k).to(dtype).cuda() * mask
	input_tensor = torch.zeros(n, k).to(dtype).cuda()
	model = Model(m, k).to(dtype).cuda().eval()

	dense_measurement = benchmark.Timer(
	stmt="model(input_tensor)",
	globals=locals(),
	).blocked_autorange()

	dense_output = model(input_tensor)
	print(dense_output.shape)

	# sparsify weights
	model.linear.weight = nn.Parameter(
	to_sparse_semi_structured(
	sparse_weight,
	)
	)

	sparse_output = model(input_tensor)
	print(sparse_output.shape)

	sparse_measurement = benchmark.Timer(
	stmt="model(input_tensor)",
	globals=locals(),
	).blocked_autorange()

	correct = torch.allclose(dense_output, sparse_output, rtol=1e-3, atol=1e-3)

	return {
	"test_function": "linear",
	"m": m,
	"k": k,
	"n": n,
	"dtype": str(dtype),
	"backend": backend,
	"sparse_latency (ms)": sparse_measurement.median * 1000,
	"dense_latency (ms)": dense_measurement.median * 1000,
	"speedup (d/s)": dense_measurement.median / sparse_measurement.median,
	"correct": correct,
	"contiguous": sparse_output.is_contiguous(),
	}


	def test_tensor(m, k, n, dtype, contiguous, backend):
	A = rand_sparse_semi_structured_mask(m, k, dtype=dtype)
	B = torch.zeros(k, n).to(dtype).cuda()
	bias = torch.rand(n).to(dtype).cuda()

	sA = to_sparse_semi_structured(A)

	# torch.mm calculation
	if dtype is not torch.int8:
	dense_output = torch.mm(A, B)

	dense_measurement = benchmark.Timer(
	stmt="torch.mm(A, B)",
	globals=locals(),
	).blocked_autorange()

	else:
	print("int8 baseline not supported")
	dense_output = torch.mm(sA, B)

	dense_measurement = benchmark.Timer(
	stmt="torch.mm(sA, B)",
	globals=locals(),
	).blocked_autorange()

	sparse_output = torch.mm(sA, B)
	sparse_measurement = benchmark.Timer(
	stmt="torch.mm(sA, B)",
	globals=locals(),
	).blocked_autorange()

	correct = torch.allclose(dense_output, sparse_output, rtol=1e-3, atol=1e-3)

	return {
	"test_function": "tensor",
	"m": m,
	"k": k,
	"n": n,
	"dtype": str(dtype),
	"backend": backend,
	"sparse_latency (ms)": sparse_measurement.median * 1000,
	"dense_latency (ms)": dense_measurement.median * 1000,
	"speedup (d/s)": dense_measurement.median / sparse_measurement.median,
	"correct": correct,
	"contiguous": sparse_output.is_contiguous(),
	}


	if __name__ == "__main__":
	dtype_lookup = {
	"int8": torch.int8,
	"fp16": torch.float16,
	"bf16": torch.bfloat16,
	"fp32": torch.float32,
	}

	parser = argparse.ArgumentParser(description="Semi-Structured Sparsity Benchmarks")
	parser.add_argument(
	"--mode",
	type=str,
	choices=[
	"nvidia-bert",
	"nvidia-fixed-k",
	"nvidia-fixed-mn",
	],
	)
	parser.add_argument(
	"--dtype",
	type=str,
	choices=dtype_lookup.keys(),
	default="fp16",
	)
	parser.add_argument(
	"--backend", type=str, choices=["cutlass", "cusparselt"], default="cusparselt"
	)
	parser.add_argument("-contiguous", action="store_true")
	parser.add_argument("-e2e", action="store_true")
	parser.add_argument("-save", action="store_true")
	args = parser.parse_args()

	if args.e2e:
	eval_fn = test_linear
	else:
	eval_fn = test_tensor

	print(f"Started benchmark: {args.mode} \| dtype: {args.dtype}")
	dtype = dtype_lookup[args.dtype]

	if args.mode == "nvidia-bert":
	bert_shapes = [
	(3072, 1024, 16384),
	(4096, 1024, 16384),
	(1024, 1024, 16384),
	(1024, 4096, 16384),
	]
	results = (
	eval_fn(m, k, n, dtype, args.contiguous, args.backend)
	for (m, k, n) in tqdm(bert_shapes)
	)

	elif args.mode == "nvidia-fixed-k":
	mn_vals = [
	3072,
	4096,
	5120,
	6144,
	7168,
	8192,
	9216,
	10240,
	11264,
	12288,
	13312,
	14336,
	15360,
	16384,
	17408,
	18432,
	19456,
	20480,
	]
	results = (
	eval_fn(mn, 10240, mn, dtype, args.contiguous, args.backend)
	for mn in tqdm(mn_vals)
	)

	elif args.mode == "nvidia-fixed-mn":
	k_vals = [
	2560,
	3840,
	5120,
	6400,
	7680,
	8960,
	10240,
	11520,
	12800,
	14080,
	15360,
	16640,
	17920,
	19200,
	20480,
	]
	results = (
	eval_fn(10240, k, 10240, dtype, args.contiguous, args.backend)
	for k in tqdm(k_vals)
	)

	df = pd.DataFrame.from_records(results)
	if args.save:
	save_file = f"{args.mode}_{args.dtype}_{args.backend}.csv"
	df.to_csv(save_file)
	print(f"Finished benchmark: {args.mode} saved results to {save_file}")
	print(df)