# Owner(s): ["module: inductor"]
import functools
import pickle
import tempfile
import unittest
from unittest.mock import patch
import torch
from torch._dynamo.test_case import run_tests, TestCase
from torch._dynamo.utils import counters
from torch._inductor import config
from torch._inductor.codecache import (
    AsyncCompile,
    FxGraphCachePickler,
    FxGraphHashDetails,
    TensorMetadata,
    TensorMetadataAndValues,
)
from torch.testing._internal.common_cuda import SM80OrLater
from torch.testing._internal.common_device_type import largeTensorTest
from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
)
from torch.testing._internal.inductor_utils import HAS_CUDA
from torch.utils._triton import has_triton
HAS_TRITON = has_triton()
requires_cuda = functools.partial(unittest.skipIf, not HAS_CUDA, "requires cuda")
requires_triton = functools.partial(unittest.skipIf, not HAS_TRITON, "requires triton")
class MyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(10, 10)

    def forward(self, inp):
        return self.fc1(inp)
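# Helper for the start-method tests below: configure Inductor's parallel-compile
# worker pool with the given start method, warm it up, then compile and run a
# small CUDA model (forward + backward) so codegen goes through the worker pool.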
def _run_codecache_test(start_method):
    torch._inductor.config.worker_start_method = start_method
    torch._inductor.config.compile_threads = 16
    AsyncCompile.warm_pool()

    model = MyModel().cuda()
    model = torch.compile(model)
    inp = torch.rand(10, 10).cuda()
    model(inp).sum().backward()
@requires_cuda()
def test_codecache_spawn():
_run_codecache_test("spawn")
@requires_cuda()
def test_codecache_fork():
_run_codecache_test("fork")
class MyModelConv2d(torch.nn.Module):
    def __init__(self, dim=512):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(3, dim, kernel_size=3, stride=2, bias=False)
        self.conv2 = torch.nn.Conv2d(dim, dim, kernel_size=3, stride=2, bias=False)

    def forward(self, x):
        x = self.conv1(x)
        torch._dynamo.graph_break()
        x = self.conv2(x)
        return x
@instantiate_parametrized_tests
class TestFxGraphCache(TestCase):
    @classmethod
    def setUpClass(cls):
        # Reroute all cache disk activity to a clean temporary directory to
        # ensure isolation (and initial cache misses). Deliberately create the
        # temp dir in setUpClass, however, so that individual test runs reuse
        # the same location. We don't expect different tests to reuse cache
        # entries, so preserving the temp dir provides that additional testing.
        cls.tmpdir = tempfile.TemporaryDirectory()
        cls.cache_dir_patch = patch("torch._inductor.codecache.cache_dir")
        cls.cache_dir_patch.start().return_value = cls.tmpdir.name

    @classmethod
    def tearDownClass(cls):
        cls.cache_dir_patch.stop()
        cls.tmpdir.cleanup()

    def setUp(self):
        counters.clear()

    @requires_triton()
    @config.patch({"fx_graph_cache": True})
    @parametrize("device", ("cuda", "cpu"))
    @parametrize("dtype", (torch.float32, torch.bfloat16))
    @parametrize("dynamic", (False, True))
    def test_cache_load_function(self, device, dtype, dynamic):
        """
        Verify that we can populate and load functions from the cache.
        """
        if device == "cuda" and not HAS_CUDA:
            raise unittest.SkipTest("requires CUDA")
        if device == "cuda" and dtype == torch.bfloat16 and not SM80OrLater:
            raise unittest.SkipTest("requires SM80 or later")

        def fn(x, y):
            return (x * 2, y @ y)

        a = torch.rand(25, dtype=dtype, device=device)
        b = torch.rand(5, 5, dtype=dtype, device=device)
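        # Note: c below is a view of a (aliased storage); the last part of this
        # test relies on that aliasing to force a fresh cache miss.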
        c = a.view(5, 5)

        compiled_fn = torch.compile(fn, dynamic=dynamic)

        # A first call should miss in the cache.
        self.assertEqual(fn(a, b), compiled_fn(a, b))
        self.assertEqual(counters["inductor"]["fxgraph_cache_miss"], 1)
        self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 0)

        # A second call should hit. (First reset so in-memory guards
        # don't prevent compilation).
        torch._dynamo.reset()
        self.assertEqual(fn(a, b), compiled_fn(a, b))
        self.assertEqual(counters["inductor"]["fxgraph_cache_miss"], 1)
        self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)

        # But we expect different code if the tensors are aliased.
        torch._dynamo.reset()
        self.assertEqual(fn(a, c), compiled_fn(a, c))
        self.assertEqual(counters["inductor"]["fxgraph_cache_miss"], 2)
        self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)
    @requires_triton()
    @config.patch({"fx_graph_cache": True})
    @parametrize("device", ("cuda", "cpu"))
    @parametrize("dtype", (torch.float32, torch.float64))
    @parametrize("dynamic", (False, True))
    def test_cache_load_model(self, device, dtype, dynamic):
        """
        Verify that we can populate and load models from the cache.
        """
        if device == "cuda" and not HAS_CUDA:
            raise unittest.SkipTest("requires CUDA")

        def fn(mod, x):
            mod.zero_grad()
            mod(x).sum().backward()
            return [p.grad for p in mod.parameters()]

        compiled_fn = torch.compile(fn, dynamic=dynamic)

        mod = MyModelConv2d().to(device=device, dtype=dtype)
        inp = torch.randn(2, 3, 16, 16, device=device, dtype=dtype)

        # The first call should see all cache misses.
        counters.clear()
        grads1 = compiled_fn(mod, inp)
        self.assertGreater(counters["inductor"]["fxgraph_cache_miss"], 0)
        self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 0)

        # The second should see all hits. (First reset so in-memory guards
        # don't prevent compilation).
        counters.clear()
        torch._dynamo.reset()
        grads2 = compiled_fn(mod, inp)
        self.assertEqual(counters["inductor"]["fxgraph_cache_miss"], 0)
        self.assertGreater(counters["inductor"]["fxgraph_cache_hit"], 0)

        # And the results should be the same.
        self.assertEqual(grads1, grads2)
    @largeTensorTest("64GB", device="cuda")
    @config.patch({"fx_graph_cache": True})
    @parametrize("device", ("cuda",))
    @parametrize("dtype", (torch.float16, torch.bfloat16))
    def test_cache_load_with_guards_int32_bounds(self, device, dtype):
        """
        Test caching the same graph, but under conditions that introduce guards
        for tensor sizes < int32.
        """
        if device == "cuda" and not HAS_CUDA:
            raise unittest.SkipTest("requires CUDA")
        if device == "cuda" and dtype == torch.bfloat16 and not SM80OrLater:
            raise unittest.SkipTest("requires SM80 or later")

        def fn(x, y):
            return (x + x, y + y)

        compiled_fn = torch.compile(fn, dynamic=True)

        # Iterate over different shapes, varying whether the total
        # size is below or above int32. For each combination, we expect
        # different guards around whether the symbolic sizes do or do
        # not exceed int32.
        shapes = (
            ((5, 6), (7, 8)),
            ((5, 6), (47000, 47001)),
            ((47000, 47001), (5, 6)),
        )
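        # For reference: 47000 * 47001 = 2,209,047,000 elements, which exceeds
        # int32 max (2,147,483,647), while the small shapes are far below it.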
        for a_shape, b_shape in shapes:
            a = torch.rand(a_shape, device=device, dtype=dtype)
            b = torch.rand(b_shape, device=device, dtype=dtype)

            # AVOID a dynamo reset here. We expect guards to have been
            # added that will be violated with the new shape. We should
            # see a recompilation (along with a cache miss).
            counters.clear()
            res1 = compiled_fn(a, b)
            self.assertGreater(counters["inductor"]["fxgraph_cache_miss"], 0)
            self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 0)

            # A second call should hit. (Reset here to force compilation).
            counters.clear()
            torch._dynamo.reset()
            res2 = compiled_fn(a, b)
            self.assertEqual(counters["inductor"]["fxgraph_cache_miss"], 0)
            self.assertGreater(counters["inductor"]["fxgraph_cache_hit"], 0)

            self.assertEqual(res1, res2)
    @config.patch({"fx_graph_cache": True})
    @parametrize("device", ("cuda", "cpu"))
    @parametrize("dtype", (torch.float32, torch.bfloat16))
    def test_cache_load_with_guards_static_bounds(self, device, dtype):
        """
        Test caching the same graph, but under conditions that introduce guards
        for static bounds.
        """
        if device == "cuda" and not HAS_CUDA:
            raise unittest.SkipTest("requires CUDA")
        if device == "cuda" and dtype == torch.bfloat16 and not SM80OrLater:
            raise unittest.SkipTest("requires SM80 or later")

        # See lowering; for all of the pooling operators, we always guard and
        # make the height/width static.
        def fn(x):
            return torch.nn.functional.adaptive_avg_pool2d(x, [5, 7])

        compiled_fn = torch.compile(fn, dynamic=True)

        # Iterate over different input shapes. Each new shape should cause
        # a cache miss.
        shapes = ((1, 64, 8, 9), (1, 64, 9, 10), (1, 64, 10, 11))
        for shape in shapes:
            x = torch.rand(shape, device=device, dtype=dtype)

            # AVOID a dynamo reset here. For each cache hit, we expect guards
            # to have been added that will be violated with each new shape.
            # We should see a recompilation (along with a cache miss).
            counters.clear()
            res1 = compiled_fn(x)
            self.assertGreater(counters["inductor"]["fxgraph_cache_miss"], 0)
            self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 0)

            # A second call should hit.
            counters.clear()
            torch._dynamo.reset()
            res2 = compiled_fn(x)
            self.assertEqual(counters["inductor"]["fxgraph_cache_miss"], 0)
            self.assertGreater(counters["inductor"]["fxgraph_cache_hit"], 0)

            self.assertEqual(res1, res2)
    @config.patch({"fx_graph_cache": True})
    def test_cache_clear(self):
        """
        Test clearing the cache.
        """

        def fn(x, y):
            return (x * y,)

        a = torch.rand(5, 5)
        b = torch.rand(5, 5)

        compiled_fn = torch.compile(fn)

        # A first call should miss in the cache.
        self.assertEqual(fn(a, b), compiled_fn(a, b))
        self.assertEqual(counters["inductor"]["fxgraph_cache_miss"], 1)
        self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 0)

        # A second call should hit.
        counters.clear()
        torch._dynamo.reset()
        self.assertEqual(fn(a, b), compiled_fn(a, b))
        self.assertEqual(counters["inductor"]["fxgraph_cache_miss"], 0)
        self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)

        # Clear the cache; now we should miss.
        counters.clear()
        torch._dynamo.reset()
        torch._inductor.codecache.FxGraphCache.clear()
        self.assertEqual(fn(a, b), compiled_fn(a, b))
        self.assertEqual(counters["inductor"]["fxgraph_cache_miss"], 1)
        self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 0)
class TestFxGraphCacheHashing(TestCase):
    def test_tensor_constants(self):
        """
        Test the handling of small vs. large tensor constants.
        """
        data = FxGraphCachePickler.dumps(torch.tensor(list(range(9))))
        self.assertIsInstance(pickle.loads(data), TensorMetadata)
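        # A "small" constant (8 elements here) should pickle to
        # TensorMetadataAndValues, so its actual values become part of the key.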
        data = FxGraphCachePickler.dumps(torch.tensor(list(range(8))))
        self.assertIsInstance(pickle.loads(data), TensorMetadataAndValues)
    def test_hash_fake_tensors(self):
        """
        Test hashing (pickling) FakeTensors with various characteristics.
        """
        with torch._subclasses.FakeTensorMode():
            # Verify that FakeTensors get pickled into a TensorMetadata:
            data = FxGraphCachePickler.dumps(torch.randn(1))
            self.assertIsInstance(pickle.loads(data), TensorMetadata)

            # Different shapes:
            self.assertEqual(
                FxGraphCachePickler.dumps(torch.randn(3)),
                FxGraphCachePickler.dumps(torch.randn(3)),
            )
            self.assertNotEqual(
                FxGraphCachePickler.dumps(torch.randn(3)),
                FxGraphCachePickler.dumps(torch.randn(4)),
            )
            self.assertNotEqual(
                FxGraphCachePickler.dumps(torch.randn(3)),
                FxGraphCachePickler.dumps(torch.randn(3, 3)),
            )

            self.assertEqual(
                FxGraphCachePickler.dumps(torch.randn(3, 3)),
                FxGraphCachePickler.dumps(torch.randn(3, 3)),
            )
            self.assertNotEqual(
                FxGraphCachePickler.dumps(torch.randn(3, 3)),
                FxGraphCachePickler.dumps(torch.randn(3, 4)),
            )
            self.assertNotEqual(
                FxGraphCachePickler.dumps(torch.randn(3, 3)),
                FxGraphCachePickler.dumps(torch.randn(4, 3)),
            )

            # Different strides:
            self.assertEqual(
                FxGraphCachePickler.dumps(torch.randn(3, 3)),
                FxGraphCachePickler.dumps(
                    torch.randn(3, 3).transpose(0, 1).transpose(0, 1)
                ),
            )
            self.assertNotEqual(
                FxGraphCachePickler.dumps(torch.randn(3, 3)),
                FxGraphCachePickler.dumps(torch.randn(3, 3).transpose(0, 1)),
            )

            # Different storage offsets:
            self.assertEqual(
                FxGraphCachePickler.dumps(torch.randn(3)[1:]),
                FxGraphCachePickler.dumps(torch.randn(3)[1:]),
            )
            self.assertNotEqual(
                FxGraphCachePickler.dumps(torch.randn(3)[1:]),
                FxGraphCachePickler.dumps(torch.randn(2)),
            )

            # Different dtypes:
            self.assertEqual(
                FxGraphCachePickler.dumps(torch.randn(3, dtype=torch.float32)),
                FxGraphCachePickler.dumps(torch.randn(3, dtype=torch.float32)),
            )
            self.assertNotEqual(
                FxGraphCachePickler.dumps(torch.randn(3, dtype=torch.float32)),
                FxGraphCachePickler.dumps(torch.randn(3, dtype=torch.float64)),
            )

            # Different 'requires_grad':
            self.assertEqual(
                FxGraphCachePickler.dumps(torch.randn(3, requires_grad=True)),
                FxGraphCachePickler.dumps(torch.randn(3, requires_grad=True)),
            )
            self.assertNotEqual(
                FxGraphCachePickler.dumps(torch.randn(3, requires_grad=True)),
                FxGraphCachePickler.dumps(torch.randn(3, requires_grad=False)),
            )

            # Different memory formats:
            self.assertNotEqual(
                FxGraphCachePickler.dumps(torch.randn(1, 2, 3, 4)),
                FxGraphCachePickler.dumps(
                    torch.randn(1, 2, 3, 4).to(memory_format=torch.channels_last)
                ),
            )

            # Different devices:
            self.assertEqual(
                FxGraphCachePickler.dumps(torch.randn(3, device="meta")),
                FxGraphCachePickler.dumps(torch.randn(3, device="meta")),
            )
            self.assertNotEqual(
                FxGraphCachePickler.dumps(torch.randn(3, device="meta")),
                FxGraphCachePickler.dumps(torch.randn(3, device="cpu")),
            )

            if HAS_CUDA and torch.cuda.device_count() >= 2:
                self.assertEqual(
                    FxGraphCachePickler.dumps(torch.randn(3, device="cuda:1")),
                    FxGraphCachePickler.dumps(torch.randn(3, device="cuda:1")),
                )
                self.assertNotEqual(
                    FxGraphCachePickler.dumps(torch.randn(3, device="cuda:0")),
                    FxGraphCachePickler.dumps(torch.randn(3, device="cuda:1")),
                )
    def test_hash_kwargs(self):
        """
        Test the special handling of the kwargs when hashing, i.e.,
        ordering of the kwargs dict and any set arguments.
        """
        # Dict order of the kwargs should not affect hashes.
        details1 = FxGraphHashDetails(None, [], {"a": 0, "z": 1})
        details2 = FxGraphHashDetails(None, [], {"z": 1, "a": 0})
        self.assertEqual(
            FxGraphCachePickler.dumps(details1),
            FxGraphCachePickler.dumps(details2),
        )

        # Different kwarg values should affect hashes.
        details1 = FxGraphHashDetails(None, [], {"a": 0})
        details2 = FxGraphHashDetails(None, [], {"a": 1})
        self.assertNotEqual(
            FxGraphCachePickler.dumps(details1),
            FxGraphCachePickler.dumps(details2),
        )

        # Set order should not affect hashes. Sets are unordered, but creating
        # a new set from the sorted elements can change the iteration order.
        set1 = {"a", "b", "c", "d", "e", "f", "g"}
        set2 = set(sorted(set1))  # noqa: C414
        details1 = FxGraphHashDetails(None, [], {"a": set1})
        details2 = FxGraphHashDetails(None, [], {"a": set2})
        self.assertEqual(
            FxGraphCachePickler.dumps(details1),
            FxGraphCachePickler.dumps(details2),
        )

        # But different set contents should affect hashes.
        details1 = FxGraphHashDetails(None, [], {"a": {1, 2, 3}})
        details2 = FxGraphHashDetails(None, [], {"a": {1, 2}})
        self.assertNotEqual(
            FxGraphCachePickler.dumps(details1),
            FxGraphCachePickler.dumps(details2),
        )
    def test_hash_config_changes(self):
        """
        Test that different config settings affect hashes.
        """
        with config.patch({"max_autotune": False}):
            details1 = FxGraphHashDetails(None, [], {})
            details2 = FxGraphHashDetails(None, [], {})

        with config.patch({"max_autotune": True}):
            details3 = FxGraphHashDetails(None, [], {})

        self.assertEqual(
            FxGraphCachePickler.dumps(details1),
            FxGraphCachePickler.dumps(details2),
        )
        self.assertNotEqual(
            FxGraphCachePickler.dumps(details1),
            FxGraphCachePickler.dumps(details3),
        )
if __name__ == "__main__":
    run_tests()