| # Owner(s): ["module: nn"] |
| import unittest |
| import random |
| import itertools |
| from itertools import product |
| |
| import torch |
| from torch.testing._internal.common_utils import run_tests, set_default_dtype, skipIfTorchDynamo, \ |
| instantiate_parametrized_tests, parametrize as parametrize_test, _assertGradAndGradgradChecks, IS_JETSON |
| from torch.testing._internal.common_cuda import TEST_CUDA |
| from torch.testing._internal.common_nn import NNTestCase |
| from torch.testing._internal.common_device_type import onlyNativeDeviceTypes, dtypes, \ |
| instantiate_device_type_tests, dtypesIfCUDA, onlyCUDA, \ |
| TEST_WITH_ROCM, skipCUDAIf, skipMeta |
| import torch.nn.functional as F |
| import torch.nn as nn |
| from torch.testing._internal.common_utils import dtype2prec_DONTUSE |
| |
| class TestEmbeddingNN(NNTestCase): |
| _do_cuda_memory_leak_check = True |
| _do_cuda_non_default_stream = True |
| |
| @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") |
| def test_embedding_max_norm_unsorted_repeating_indices(self): |
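        # Duplicate, unsorted indices exercise the CUDA max_norm renorm path;
        # the renormalized output must match the CPU reference exactly.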
| def create_embedding(device): |
| # Seed RNG so we get the same Embedding each time |
| torch.manual_seed(0) |
| return torch.nn.Embedding( |
| num_embeddings=20, |
| embedding_dim=64, |
| max_norm=1.0).to(device) |
| |
| ix = torch.arange(2, device='cpu', dtype=torch.long).repeat(2000) |
| out_cpu = create_embedding('cpu')(ix) |
| |
| ix = ix.to('cuda') |
| out = create_embedding('cuda')(ix) |
| self.assertEqual(out.cpu(), out_cpu) |
| |
| def test_embedding_sparse_basic(self): |
| embedding = nn.Embedding(10, 20, sparse=True) |
| input = torch.tensor([[0, 2, 4, 5], [4, 3, 0, 9]], dtype=torch.long) |
| embedding(input).sum().backward() |
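        # With sparse=True, backward produces a sparse gradient whose dense
        # shape matches the full weight matrix.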
| self.assertTrue(embedding.weight.grad.is_sparse) |
| self.assertEqual(embedding.weight.grad.shape, embedding.weight.shape) |
| |
| def test_embedding_sparse_empty_tensor(self): |
| embedding = nn.Embedding(0, 0, sparse=True) |
| input = torch.tensor([], dtype=torch.int64) |
| embedding(input).sum().backward() |
| self.assertTrue(embedding.weight.grad.is_sparse) |
| self.assertEqual(embedding.weight.grad.shape, embedding.weight.shape) |
| |
| embedding = nn.Embedding(10, 0, sparse=True) |
| input = torch.LongTensor([[0, 2, 4, 5], [4, 3, 0, 9]]) |
| embedding(input).sum().backward() |
| self.assertTrue(embedding.weight.grad.is_sparse) |
| self.assertEqual(embedding.weight.grad.shape, embedding.weight.shape) |
| |
| def test_move_sparse_half_embedding(self): |
| embedding = nn.Embedding(10, 3, sparse=True) |
| self.assertEqual(embedding.weight.device.type, 'cpu') |
| self.assertEqual(embedding.weight.dtype, torch.get_default_dtype()) |
| embedding.to(torch.float16) |
| self.assertEqual(embedding.weight.dtype, torch.float16) |
| self.assertEqual(embedding.embedding_dim, 3) |
| self.assertEqual(embedding.num_embeddings, 10) |
| |
| if torch.cuda.is_available(): |
| embedding.to('cuda') |
| self.assertEqual(embedding.weight.device.type, 'cuda') |
| embedding.to('cpu') |
| self.assertEqual(embedding.weight.device.type, 'cpu') |
| |
| def test_embedding_max_norm(self): |
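        # max_norm renormalizes the looked-up rows during the forward pass, so
        # repeated indices (8) return identical rows and every output row has
        # an L2 norm of at most 1.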
| embedding = nn.Embedding(22, 5, max_norm=1.0) |
| input = torch.tensor([2, 8, 8, 6], dtype=torch.long) |
| output = embedding(input) |
| self.assertEqual(output[1], output[2]) |
| self.assertTrue(output.data.norm(p=2, dim=1).le(1).all()) |
| |
| @parametrize_test("dtype", (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64, torch.float, torch.double)) |
| def test_embedding_from_pretrained(self, dtype): |
| a = torch.tensor([[1., 2., 3.], [4., 5., 6.]], dtype=dtype) |
| embedding = nn.Embedding.from_pretrained(a) |
| self.assertEqual(a, embedding.weight.data) |
| |
| input = torch.LongTensor([0, 1]) |
| output = embedding(input) |
| self.assertEqual(a, output) |
| |
| def test_embedding_bag_from_pretrained(self): |
| a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) |
| embedding = nn.EmbeddingBag.from_pretrained(a) |
| self.assertEqual(a, embedding.weight) |
| |
| input = torch.tensor([0, 1], dtype=torch.long) |
| output = embedding(input, torch.arange(input.size(0))) |
| self.assertEqual(a, output) |
| |
| def test_embedding_from_pretrained_padding_idx(self): |
| padding_idx = 2 |
| padding_vec = torch.ones(3) * 7 |
| embeddings = torch.rand(4, 3, requires_grad=True) |
| with torch.no_grad(): |
| embeddings[padding_idx] = padding_vec |
| embedding_nn = nn.Embedding.from_pretrained(embeddings, padding_idx=padding_idx) |
| self.assertEqual(embedding_nn.weight[padding_idx], padding_vec) |
| |
| def test_embedding_bag_from_pretrained_padding_idx(self): |
| padding_idx = 2 |
| embeddings = torch.rand(4, 3, requires_grad=True) |
| embedding_nn = nn.EmbeddingBag.from_pretrained(embeddings, padding_idx=padding_idx) |
| self.assertEqual(embedding_nn.weight, embeddings) |
| |
| def test_embedding_from_pretrained_options(self): |
| with set_default_dtype(torch.double): |
| a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) |
| opts = { |
| "max_norm": 2., |
| "norm_type": .5, |
| "scale_grad_by_freq": False, |
| "sparse": True |
| } |
| embedding = nn.Embedding.from_pretrained(a, **opts) |
| input = torch.LongTensor([0, 1]) |
| output = embedding(input) |
            # check the output and that the weight matrix was renormalized
| self.assertEqual(a, output) |
| self.assertTrue(a.ne(torch.arange(1, 7, dtype=a.dtype).view(2, 3)).all()) |
| self.assertTrue(output.data.norm(p=opts["norm_type"], dim=1).le(opts["max_norm"]).all()) |
| |
| def test_embedding_functional(self): |
| a = torch.tensor([ |
| [1, 3, 2], |
| [0, 2, 1] |
| ], dtype=torch.long) |
| embeddings = torch.rand(4, 3, requires_grad=True) |
| |
| embed_old = torch.nn.Embedding(4, 3) |
| embed_old.weight.data = embeddings.data |
        # A trivial check in eager mode; this test is mainly useful when running
        # under PYTORCH_TEST_WITH_DYNAMO=1, as it verifies that setattr works correctly.
| self.assertEqual(embed_old.weight.data, embeddings.data) |
| res_old = embed_old(a) |
| |
| res_F = F.embedding(a, embeddings) |
| self.assertEqual(res_old, res_F) |
| |
| embed_old = torch.nn.Embedding(4, 3) |
| embed_old = embed_old.from_pretrained(embeddings, padding_idx=2) |
| res_old = embed_old(a) |
| res_F = F.embedding(a, embeddings, padding_idx=2) |
| |
| self.assertEqual(res_old, res_F) |
| |
| def test_embedding_bag_functional(self): |
| a = torch.tensor([ |
| [1, 3, 2], |
| [0, 2, 1] |
| ], dtype=torch.long) |
| embeddings = torch.rand(4, 3, requires_grad=True) |
| |
| embed_old = torch.nn.EmbeddingBag(4, 3) |
| embed_old.weight = torch.nn.Parameter(embeddings) |
| res_old = embed_old(a) |
| |
| res_F = F.embedding_bag(a, embeddings) |
| self.assertEqual(res_old, res_F) |
| |
| embed_old = torch.nn.EmbeddingBag(4, 3) |
| embed_old = embed_old.from_pretrained(embeddings, padding_idx=2) |
| res_old = embed_old(a) |
| res_F = F.embedding_bag(a, embeddings, padding_idx=2) |
| |
| self.assertEqual(res_old, res_F) |
| |
| # Make sure that error is thrown if padding_idx is out of bounds |
| def test_embedding_bag_padding_idx_error(self): |
| a = torch.tensor([ |
| [1, 3, 2], |
| [0, 2, 1] |
| ], dtype=torch.long) |
| num_embeddings = 4 |
| num_features = 3 |
| embeddings = torch.rand(num_embeddings, num_features, requires_grad=True) |
| |
| functional_err_msg = r'padding_idx must be within the number of embeddings' |
| module_err_msg = r'padding_idx must be within num_embeddings' |
| |
| for padding_idx in range(-(num_embeddings + 2), (num_embeddings + 2)): |
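            # Valid values of padding_idx lie in [-num_embeddings, num_embeddings);
            # anything outside that range must raise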
| if (padding_idx < -num_embeddings) or (padding_idx >= num_embeddings): |
| with self.assertRaisesRegex(RuntimeError, functional_err_msg): |
| F.embedding_bag(a, embeddings, padding_idx=padding_idx) |
| with self.assertRaisesRegex(AssertionError, module_err_msg): |
| torch.nn.EmbeddingBag(num_embeddings, num_features, padding_idx=padding_idx) |
| else: |
| F.embedding_bag(a, embeddings, padding_idx=padding_idx) |
| torch.nn.EmbeddingBag(num_embeddings, num_features, padding_idx=padding_idx) |
| |
| def test_embeddingbag_from_pretrained(self): |
| a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) |
| embeddingbag = nn.EmbeddingBag.from_pretrained(a) |
| self.assertEqual(a, embeddingbag.weight.data) |
| |
| input = torch.LongTensor([[0, 1]]) |
| output = embeddingbag(input) |
| self.assertEqual(a.mean(0, keepdim=True), output) |
| |
| def test_embeddingbag_from_pretrained_options(self): |
| with set_default_dtype(torch.double): |
| a = torch.tensor([[1., 2., 3.], [4., 5., 6.]]) |
| opts = { |
| "max_norm": 2., |
| "norm_type": .5, |
| "scale_grad_by_freq": False, |
| "mode": "max", |
| "sparse": False |
| } |
| embeddingbag = nn.EmbeddingBag.from_pretrained(a, **opts) |
| |
| input = torch.LongTensor([[0, 1]]) |
| output = embeddingbag(input) |
| self.assertEqual(a.max(0, keepdim=True)[0], output) |
| self.assertTrue(a.ne(torch.arange(1, 7, dtype=a.dtype).view(2, 3)).all()) |
| self.assertTrue(a.norm(p=opts["norm_type"], dim=1).le(opts["max_norm"]).all()) |
| |
| def test_embeddingbag_include_last_offset(self): |
| # Test case from https://github.com/pytorch/pytorch/issues/89677 |
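        # Both offset variants below must produce the same two bags,
        # input[0:3] and input[3:4], matching the reference computed from the weights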
| embeddingbag = nn.EmbeddingBag(100, 3, include_last_offset=True, padding_idx=61) |
| input = torch.tensor([0, 1, 2, 3]) |
| out = embeddingbag(input, torch.tensor([0, 3, 3])) |
| out2 = embeddingbag(input, torch.tensor([0, 3, 4])) |
| |
| weight = embeddingbag.weight |
| row0 = weight[0:3].mean(0) |
| row1 = weight[3] |
| ref_out = torch.stack([row0, row1]) |
| |
| self.assertEqual(ref_out, out) |
| self.assertEqual(ref_out, out2) |
| |
| class TestEmbeddingNNDeviceType(NNTestCase): |
| def test_embedding_dense_grad(self, device): |
| with set_default_dtype(torch.double): |
| embd = nn.Embedding(20, 20).to(device) |
| weight = embd.weight |
| |
| def fn_wrapper(device): |
| def fn(weight): |
| inp = torch.tensor([[0, 1, 1, 2], [3, 5, 7, 11]], dtype=torch.long).to(device) |
| return torch.nn.functional.embedding(inp, weight) |
| return fn |
| |
| fn = fn_wrapper(device) |
| _assertGradAndGradgradChecks(self, fn, (weight, )) |
| |
| def test_embedding_scalar_weight_error(self, device): |
| indices = torch.rand(2, 2, device=device).long() |
| weights = [ |
| torch.tensor(1.0, device=device), |
| torch.tensor(1.0, device=device).reshape(1, 1, 1), |
| ] |
| |
| for weight in weights: |
| with self.assertRaisesRegex(RuntimeError, "'weight' must be 2-D"): |
| torch.nn.functional.embedding(indices, weight) |
| |
| @dtypesIfCUDA(torch.float16, torch.float64) |
| @dtypes(torch.float64) |
| def test_embedding_backward(self, device, dtype): |
| embedding = nn.Embedding(10, 3, sparse=True) |
| tensor = torch.tensor([[7, 1, 3]]) |
| ones = torch.tensor(1., dtype=dtype).expand(3, 3) |
| tensorTwice = tensor.repeat(1, 2) |
| onesTwice = torch.cat((ones, ones)) |
| |
| embedding = embedding.to(dtype=dtype).to(device) |
| tensor = tensor.to(device) |
| ones = ones.to(device) |
| tensorTwice = tensorTwice.to(device) |
| onesTwice = onesTwice.to(device) |
| |
| embedding.zero_grad() |
| embedding(tensor[0]).sum().backward() |
| self.assertEqual(embedding.weight.grad._indices(), tensor) |
| self.assertEqual(embedding.weight.grad._values(), ones) |
| |
| embedding.zero_grad() |
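        # Sparse gradients accumulate uncoalesced: a second backward appends
        # another copy of the indices and values.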
| embedding(tensor[0]).sum().backward() |
| embedding(tensor[0]).sum().backward() |
| self.assertEqual(embedding.weight.grad._indices(), tensorTwice) |
| self.assertEqual(embedding.weight.grad._values(), onesTwice) |
| |
| embedding.zero_grad() |
| embedding(tensor[0]).sum().backward() |
| tensor[0, 0] = 8 |
| embedding(tensor[0]).sum().backward() |
| tensorTwice[0, 3] = 8 |
| self.assertEqual(embedding.weight.grad._indices(), tensorTwice) |
| self.assertEqual(embedding.weight.grad._values(), onesTwice) |
| |
| @dtypesIfCUDA(*((torch.float, torch.double, torch.bfloat16, torch.half) |
| if TEST_WITH_ROCM else (torch.float, torch.double, torch.half))) |
| @dtypes(torch.float32) |
| def test_embedding_max_norm_backward(self, device, dtype): |
        # can't use gradcheck since the in-place renorm makes the analytical
        # gradients differ from the computed ones
| weight = torch.randn((4, 4), device=device, dtype=dtype) * 2 |
| weight.requires_grad_() |
| inp_list = [0, 1, 2, 2] |
| inp = torch.tensor(inp_list, device=device) |
| out = nn.functional.embedding(inp, weight, max_norm=1.).sum() |
| out.backward() |
| |
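        # Rows 0 and 1 are gathered once, row 2 twice, and row 3 never,
        # giving per-row gradients of 1, 1, 2, and 0 respectively.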
| expected_grad = torch.tensor([[1., 1., 2., 0.]], device=device, dtype=dtype).transpose(0, 1).expand(4, 4) |
| self.assertEqual(weight.grad, expected_grad) |
| |
| @dtypesIfCUDA(*((torch.float, torch.double, torch.bfloat16, torch.half) |
| if TEST_WITH_ROCM else (torch.float, torch.double, torch.half))) |
| @dtypes(torch.float32) |
| def test_embedding_max_norm_fwd_AD(self, device, dtype): |
| if torch.device(device).type == 'xla': |
| self.skipTest("forward AD doesn't work on xla") |
| |
        # can't use gradcheck since the in-place renorm makes the analytical
        # gradients differ from the computed ones
| weight = torch.randn((4, 4), device=device, dtype=dtype) * 2 |
| tangent = torch.ones((4, 4), device=device, dtype=dtype) |
| inp = torch.tensor([[0, 1], [2, 2]], device=device) |
| with torch.autograd.forward_ad.dual_level(): |
| dual_weight = torch.autograd.forward_ad.make_dual(weight, tangent) |
| out = nn.functional.embedding(inp, dual_weight, max_norm=1.) |
| jvp = torch.autograd.forward_ad.unpack_dual(out).tangent |
| |
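        # With an all-ones tangent on the weight, the JVP of each gathered row
        # is all ones as well.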
| expected_grad = torch.ones((2, 2, 4), device=device, dtype=dtype) |
| self.assertEqual(jvp, expected_grad) |
| |
| @dtypesIfCUDA(*((torch.float, torch.double, torch.bfloat16, torch.half) |
| if TEST_WITH_ROCM else (torch.float, torch.double, torch.half))) |
| @dtypes(torch.float32) |
| def test_embedding_padding_idx(self, device, dtype): |
| embedding = nn.Embedding(10, 20, padding_idx=0).to(device, dtype) |
| input = torch.tensor([[0, 2, 4, 5], [4, 3, 0, 9]], dtype=torch.long).to(device) |
| output = embedding(input) |
| self.assertEqual(output[0][0].sum(), 0) |
| self.assertEqual(output[1][2].sum(), 0) |
| |
| embedding = nn.Embedding(10, 20, padding_idx=0, sparse=True).to(device, dtype) |
| input = torch.tensor([[0, 2, 4, 5], [4, 3, 0, 9]], dtype=torch.long).to(device) |
| output = embedding(input) |
| self.assertEqual(output[0][0].sum(), 0) |
| self.assertEqual(output[1][2].sum(), 0) |
| |
| # negative indexing check for padding_idx |
| # padding_idx=-2, num_embeddings=10 ==> index 8 padded |
| embedding = nn.Embedding(10, 20, padding_idx=-2).to(device, dtype) |
| input = torch.tensor([[0, 2, 8, 5], [4, 8, 0, 9]], dtype=torch.long).to(device) |
| output = embedding(input) |
| self.assertEqual(output[0][2].sum(), 0) |
| self.assertEqual(output[1][1].sum(), 0) |
| |
| embedding = nn.Embedding(10, 20, padding_idx=-2, sparse=True).to(device, dtype) |
| input = torch.tensor([[0, 2, 8, 5], [4, 8, 0, 9]], dtype=torch.long).to(device) |
| output = embedding(input) |
| self.assertEqual(output[0][2].sum(), 0) |
| self.assertEqual(output[1][1].sum(), 0) |
| |
| # change padding vector |
| padding_vector = torch.ones(20, dtype=dtype, device=device) |
| embedding = nn.Embedding(10, 20, padding_idx=2, sparse=True).to(device, dtype) |
| with torch.no_grad(): |
| embedding.weight[2] = padding_vector |
| input = torch.tensor([0, 2], dtype=torch.long).to(device) |
| output = embedding(input) |
| self.assertEqual(output[1], padding_vector) |
| |
| # out of bounds check for padding_idx |
| self.assertRaises(AssertionError, nn.Embedding, num_embeddings=10, embedding_dim=20, padding_idx=25) |
| self.assertRaises(AssertionError, nn.Embedding, num_embeddings=10, embedding_dim=20, padding_idx=-25) |
| |
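        # The gradient with respect to the padding row must stay zero no matter
        # how many times padding_idx appears in the input.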
| padding_idx = 0 |
| embedding = nn.Embedding(5, 2, padding_idx=padding_idx).to(device, dtype) |
| for n in (1, 2, 1000): # Need large N to trigger all the methods we have implemented |
| for other_indices in ([], [1, 3], [2]): |
| indices = torch.tensor(other_indices + [padding_idx] * n, dtype=torch.long).to(device) |
| pre = embedding.weight[padding_idx].clone() |
| embedding(indices).sum().backward() |
| after = (embedding.weight + embedding.weight.grad)[padding_idx] |
| embedding.zero_grad() |
| self.assertEqual(after, pre) |
| |
| # test double backward |
| emb_sum = embedding(indices).sum() |
| emb_grad = torch.autograd.grad(outputs=emb_sum, inputs=list(embedding.parameters()), retain_graph=True) |
| scalar = emb_grad[0].sum() + emb_sum |
| scalar.backward() |
| after = (embedding.weight + embedding.weight.grad)[padding_idx] |
| embedding.zero_grad() |
| self.assertEqual(after, pre) |
| |
| # Check correctness of torch.nn.functional.embedding_bag forward and |
| # backward functions with padding_idx, given a 1D input separated into bags |
| # with an offset array. Compare against an equivalent 2D input that uses |
| # padding indices to fill in the gaps indicated by the offset array |
| |
| @skipIfTorchDynamo("see https://github.com/pytorch/pytorch/pull/95621") |
| @onlyNativeDeviceTypes |
| @dtypes(torch.float32, torch.float64) |
| @dtypesIfCUDA(torch.half, torch.bfloat16) |
| def test_embedding_bag_1D_padding_idx(self, device, dtype): |
| num_features = 3 |
| max_indices_per_bag = 10 |
| num_bags = 10 |
| num_words = 100 |
| |
| def gen_1D_indices_offsets(include_last_offset, allpad): |
| indices = [] |
| offsets = [] |
| cur_offset = 0 |
| |
| # Make one bag full and one bag empty, for extra coverage |
| empty_bag = random.randint(0, num_bags - 1) |
| full_bag = empty_bag |
| while full_bag == empty_bag: |
| full_bag = random.randint(0, num_bags - 1) |
| |
| for bag in range(num_bags): |
| offsets.append(cur_offset) |
| if bag == full_bag: |
| bag_size = max_indices_per_bag |
| elif bag == empty_bag: |
| bag_size = 0 |
| else: |
| bag_size = random.randint(1, max_indices_per_bag - 1) |
| indices += [1 if allpad else random.randint(0, num_words - 1) for _ in range(bag_size)] |
| cur_offset += bag_size |
| |
| # embedding_bag requires first entry of offsets to be 0 |
| assert offsets[0] == 0 |
| |
| indices = torch.tensor(indices, device=device) |
| |
| if include_last_offset: |
| offsets.append(indices.size(0)) |
| |
| offsets = torch.tensor(offsets, device=device) |
| |
| return indices, offsets |
| |
        # Convert a 1-D indices/offsets representation into 2-D. Fill any
        # leftover slots with padding_idx
| def gen_2D_indices_from_1D(indices_1D, offsets, include_last_offset, padding_idx): |
| assert offsets[0] == 0 |
| if include_last_offset: |
| offsets = offsets[:-1] |
| indices_2D = torch.empty(num_bags, max_indices_per_bag, device=device, dtype=torch.long) |
| for bag in range(num_bags): |
| # Determine the start and end position of the bag within indices_1D |
| start = offsets[bag] |
| end = len(indices_1D) if bag + 1 == num_bags else offsets[bag + 1] |
| end = min(len(indices_1D), end) |
| |
| # Pull out the bag's indices from indices_1D, and fill any |
| # remaining space with padding indices |
| indices_in_bag = [] |
| for item_pos in range(0, max_indices_per_bag): |
| if (start + item_pos) < end: |
| indices_in_bag.append(indices_1D[start + item_pos]) |
| else: |
| indices_in_bag.append(padding_idx) |
| indices_2D[bag] = torch.tensor(indices_in_bag, device=device) |
| |
| return indices_2D |
| |
| test_cases = product(['max', 'mean', 'sum'], [False, True], [False, True], [False, True]) |
| |
| for mode, sparse, include_last_offset, allpad in test_cases: |
            # mode='max' supports neither sparse gradients nor bfloat16
| if mode == 'max': |
| if sparse or (dtype == torch.bfloat16): |
| continue |
| indices_1D, offsets = gen_1D_indices_offsets(include_last_offset, allpad) |
| for padding_idx_1D in list(set(indices_1D.tolist())) + [None]: |
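                # Try every index value that actually occurs as the padding
                # index, plus None for the no-padding case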
| msg = ( |
| f"mode: '{mode}', sparse: {sparse}, include_last_offset: {include_last_offset}, " |
| f"padding_idx_1D: {padding_idx_1D}") |
| |
| # If 1D input does not use a padding index, we still need one for the 2D input, |
| # so we can add one dummy word to the weights to act as the padded word |
| padding_idx_2D = padding_idx_1D if padding_idx_1D is not None else num_words |
| num_words_with_padding = num_words if padding_idx_1D is not None else num_words + 1 |
| |
| indices_2D = gen_2D_indices_from_1D( |
| indices_1D, |
| offsets, |
| include_last_offset, |
| padding_idx_2D) |
| |
| weights = torch.randn( |
| num_words_with_padding, |
| num_features, |
| dtype=dtype, |
| device=device, |
| requires_grad=True) |
| weights_check = weights.clone().detach().requires_grad_(True) |
| |
| bag = torch.nn.functional.embedding_bag( |
| indices_1D, |
| weights, |
| offsets, |
| padding_idx=padding_idx_1D, |
| mode=mode, |
| sparse=sparse, |
| include_last_offset=include_last_offset) |
| |
| bag_check = torch.nn.functional.embedding_bag( |
| indices_2D, |
| weights_check, |
| padding_idx=padding_idx_2D, |
| mode=mode, |
| sparse=sparse) |
| self.assertEqual(bag, bag_check, msg=msg) |
| |
| bag.sum().backward() |
| bag_check.sum().backward() |
| |
                # Half-precision gradients can mismatch by a larger margin than
                # those of other dtypes, so use looser tolerances
| if dtype in [torch.half, torch.bfloat16]: |
| atol = 0.01 |
| rtol = 0.01 |
| else: |
| atol = None |
| rtol = None |
| self.assertEqual(weights.grad, weights_check.grad, msg=msg, atol=atol, rtol=rtol) |
| |
| # Check correctness of torch.nn.functional.embedding_bag forward and |
| # backward functions with padding_idx, given a 2D indices input. Compare |
| # against torch.nn.functional.embedding followed by a reduction. |
| @onlyNativeDeviceTypes |
| @dtypes(torch.float32, torch.float64) |
| @dtypesIfCUDA(torch.half, torch.bfloat16) |
| def test_embedding_bag_2D_padding_idx(self, device, dtype): |
| # Use a Python implementation of embedding_bag with padding_idx support |
| # to check torch.nn.functional.embedding_bag correctness |
| def embedding_bag_check(indices, weights, mode, sparse, padding_idx): |
| assert padding_idx is not None |
| embedding = torch.nn.functional.embedding( |
| indices, |
| weights, |
| padding_idx=padding_idx, |
| sparse=sparse) |
| |
| reduction_dim = indices.dim() - 1 |
| |
| if mode == 'sum' or mode == 'mean': |
| # We must avoid including elements at padding_idx in the |
| # sum/mean, so multiply those elements by 0, and multiply |
| # all other elements by 1 |
| per_sample_weights = indices.ne(padding_idx).to(dtype).unsqueeze(-1) |
| res = embedding.mul(per_sample_weights).sum(dim=reduction_dim) |
| |
| if mode == 'mean': |
| weights_sum = per_sample_weights.sum(dim=reduction_dim) |
| res = res.div(weights_sum) |
| |
| elif mode == 'max': |
| # We must avoid allowing elements at padding_idx to be chosen |
| # as the max, so set those elements to negative infinity |
| res = embedding.masked_fill( |
| indices.unsqueeze(-1) == padding_idx, -float('inf') |
| ).amax(dim=reduction_dim) |
| |
| else: |
| raise RuntimeError(f"mode '{mode}' is not available") |
| |
| # If a row is all padding, set its corresponding result row to 0. |
| # This is needed because the above mean and max mode |
| # implementations set these elements to nan and -inf, respectively |
| if mode in ['mean', 'max']: |
| res = res.masked_fill( |
| indices.eq(padding_idx).all(dim=-1).unsqueeze(-1), |
| 0) |
| |
| return res |
| |
| num_features = 3 |
| num_words = 10 |
| indices_dim1 = 10 |
| |
| for mode, sparse, allpad, indices_dim0 in product(['max', 'mean', 'sum'], [False, True], [False, True], [1, 10]): |
            # mode='max' supports neither sparse gradients nor bfloat16
| if mode == 'max': |
| if sparse or (dtype == torch.bfloat16): |
| continue |
| |
| if allpad: |
| indices = torch.empty(indices_dim0, indices_dim1, dtype=torch.long, device=device).fill_(1) |
| else: |
| indices = torch.randint(0, num_words, (indices_dim0, indices_dim1), device=device) |
| |
| if indices_dim0 > 1: |
                # Fill one row with a duplicated index so we can test with a
                # fully padded row
| duplicate_row = random.randint(0, indices_dim0 - 1) |
| indices[duplicate_row] = indices[duplicate_row][0] |
| |
| for padding_idx in list(set(indices.flatten(0, -1).tolist())): |
| weights = torch.randn(num_words, num_features, dtype=dtype, device=device, requires_grad=True) |
| weights_check = weights.clone().detach().requires_grad_(True) |
| |
| msg = ( |
| f"mode: '{mode}', sparse: {sparse}, padding_idx: {padding_idx}, " |
| f"allpad: {allpad}, indices.size(): {indices.size()}") |
| |
| # Check forward with a Python implementation of padding_idx embedding_bag |
| bag_check = embedding_bag_check( |
| indices, |
| weights_check, |
| mode, |
| sparse, |
| padding_idx) |
| bag = torch.nn.functional.embedding_bag( |
| indices, |
| weights, |
| padding_idx=padding_idx, |
| mode=mode, |
| sparse=sparse) |
| |
| self.assertEqual(bag, bag_check, msg=msg) |
| |
| bag_check.sum().backward() |
| grad_check = weights_check.grad |
| |
| bag.sum().backward() |
| grad = weights.grad |
| |
                # Half-precision gradients can mismatch by a larger margin than
                # those of other dtypes, so use looser tolerances
| if dtype in [torch.half, torch.bfloat16]: |
| atol = 0.01 |
| rtol = 0.01 |
| else: |
| atol = None |
| rtol = None |
| self.assertEqual(grad, grad_check, msg=msg, atol=atol, rtol=rtol) |
| |
| @onlyCUDA |
| @dtypes(*((torch.float, torch.double, torch.bfloat16, torch.half) |
| if TEST_WITH_ROCM else (torch.float, torch.double, torch.half))) |
| def test_embedding_max_norm_device(self, device, dtype): |
| embedding = nn.Embedding(22, 5, max_norm=1.0).to(device, dtype=dtype) |
        # nn.Embedding expects integer indices; use LongTensor here
| input = torch.tensor([2, 8, 8, 6], device=device, dtype=torch.long) |
| output = embedding(input) |
| self.assertEqual(output[1], output[2]) |
| self.assertTrue(output.data.norm(p=2, dim=1).le(1).all()) |
| |
| @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long))) |
| def test_embedding_bag_empty_input(self, device, dtypes): |
| m = 4 |
| n = 3 |
| x = torch.tensor([], device=device, dtype=dtypes[0]) |
| for sparse in [True, False]: |
| Embed = torch.nn.EmbeddingBag(m, n, sparse=sparse) |
| Embed.to(device) |
| |
| output = Embed(input=x, offsets=torch.tensor([0], device=device, dtype=dtypes[1])) |
| self.assertEqual(output, torch.zeros_like(output)) |
| |
| output = Embed(input=x, offsets=torch.tensor([0, 0], device=device, dtype=dtypes[1])) |
| self.assertEqual(output, torch.zeros_like(output)) |
| |
| @skipCUDAIf(True, "no out-of-bounds check on CUDA for perf.") |
| @dtypes(*itertools.product((torch.float, torch.double), (torch.int, torch.long))) |
| @parametrize_test("padding_idx", [None, 0]) |
| @parametrize_test("mode", ["sum", "mean", "max"]) |
| def test_embedding_bag_out_of_bounds_idx(self, device, dtypes, padding_idx, mode): |
| w_dtype, idx_dtype = dtypes |
        # negative out-of-bounds index
| idx1 = torch.tensor([[-1, 1]], device=device, dtype=idx_dtype) |
        # positive out-of-bounds index
| idx2 = torch.tensor([[11, 8]], device=device, dtype=idx_dtype) |
| weight = torch.randn(10, 2, device=device, dtype=w_dtype) |
| if mode == 'sum': |
            # Only mode='sum' supports per_sample_weights
| per_sample_weights = (None, torch.randn_like(idx1, device=device, dtype=w_dtype)) |
| else: |
| per_sample_weights = (None,) |
| |
| for p_s_weights, idx in itertools.product(per_sample_weights, (idx1, idx2)): |
| msg = "Expected idx >= 0 && idx < num_embeddings" |
| with self.assertRaisesRegex(RuntimeError, msg): |
| torch.nn.functional.embedding_bag(idx, weight, |
| per_sample_weights=p_s_weights, padding_idx=padding_idx, |
| mode=mode) |
| |
| def test_embedding_bag_dimension_errors(self, device): |
| funcs = ( |
| lambda x, y, z: torch.nn.functional.embedding_bag(y, x, z), |
| torch.embedding_bag, |
| torch._embedding_bag, |
| torch._embedding_bag_forward_only |
| ) |
| for i, f in enumerate(funcs): |
| err_type = (ValueError, RuntimeError) if i == 0 else RuntimeError |
| |
| weight = torch.full((2, 6,), 0, dtype=torch.float64, device=device) |
| indices = torch.full((2, 0, 0, 6, 6,), 2, dtype=torch.int64, device=device) |
| offsets = torch.full((2, 0, 0, 6, 6), 0, dtype=torch.int64, device=device) |
| |
| if i == 0: |
| error_msg = 'input has to be 1D or 2D Tensor' |
| else: |
| error_msg = 'input has to be a 1D or 2D Tensor' |
| torch._dynamo.disable(self.assertRaisesRegex)( |
| err_type, error_msg, lambda: f(weight, indices, offsets) |
| ) |
| |
| weight = torch.full((2, 2), 0, dtype=torch.float64, device=device) |
| indices = torch.full((2,), 1, dtype=torch.int64, device=device) |
| |
| torch._dynamo.disable(self.assertRaisesRegex)( |
| err_type, 'offsets has to be a 1D Tensor', lambda: f(weight, indices, offsets) |
| ) |
| |
| weight = torch.full((2, 2, 2), 0, dtype=torch.float64, device=device) |
| indices = torch.full((2,), 2, dtype=torch.int64, device=device) |
| offsets = torch.full((2,), 0, dtype=torch.int64, device=device) |
| |
| torch._dynamo.disable(self.assertRaisesRegex)( |
| err_type, 'weight has to be a 2D Tensor', lambda: f(weight, indices, offsets) |
| ) |
| |
| @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long))) |
| def test_EmbeddingBag_per_sample_weights_failures(self, device, dtypes): |
| # Failure 1: mismatched embeddings / per_sample_weights dtype |
| es = nn.EmbeddingBag(5, 2, mode='sum').to(dtype=torch.float, device=device) |
| input = torch.tensor([3, 1, 1, 1, 4, 0], dtype=dtypes[0], device=device) |
| offsets = torch.tensor([0, 0, 3, 3, 6], dtype=dtypes[1], device=device) |
| per_sample_weights = torch.randn_like(input, dtype=torch.double, device=device) |
| if device == 'cpu': |
| with self.assertRaisesRegex(RuntimeError, 'have the same type as'): |
| es(input, offsets, per_sample_weights) |
| else: |
| with self.assertRaisesRegex(RuntimeError, 'expected scalar type'): |
| es(input, offsets, per_sample_weights) |
| |
| # Failure 2.1: input/per_sample_weights have different sizes (1d input) |
| input = torch.tensor([3, 1, 1, 1, 4, 0], dtype=dtypes[0], device=device) |
| offsets = torch.tensor([0, 0, 3, 3, 6], dtype=dtypes[1], device=device) |
| per_sample_weights = torch.randn(5, dtype=torch.float, device=device) |
| with self.assertRaisesRegex(ValueError, 'same shape as the input'): |
| es(input, offsets, per_sample_weights) |
| |
| # Failure 2.2: input/per_sample_weights have different sizes (2d input) |
| input = torch.randint(5, (7, 3), dtype=dtypes[0], device=device) |
| offsets = None |
| per_sample_weights = torch.randn(7 * 3, dtype=torch.float, device=device) |
| with self.assertRaisesRegex(ValueError, 'same shape as the input'): |
| es(input, offsets, per_sample_weights) |
| |
| # Failure 3: Unsupported per_sample_weights and mode=('max', 'mean') |
| for unsupported_mode in ('max', 'mean'): |
| es = nn.EmbeddingBag(5, 2, mode=unsupported_mode).to( |
| dtype=torch.float, device=device) |
| input = torch.randint(5, (7, 3), dtype=dtypes[0], device=device) |
| offsets = None |
| per_sample_weights = torch.randn(7, 3, dtype=torch.float, device=device) |
| with self.assertRaisesRegex(NotImplementedError, |
| "only supported for mode='sum'"): |
| es(input, offsets, per_sample_weights) |
| |
| def _embedding_bag_reference_impl(self, input, weight, offsets=None, mode='sum', |
| per_sample_weights=None, include_last_offset=False): |
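        # Pure-Python reference: gather the embedding rows, scale them by
        # per_sample_weights, then reduce each bag according to `mode`.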
| assert mode == 'sum' or per_sample_weights is None |
| assert offsets is not None |
| if per_sample_weights is None: |
| per_sample_weights = torch.ones(input.size()).to( |
| dtype=weight.dtype, device=weight.device |
| ) |
| assert input.numel() == per_sample_weights.numel() |
| |
| bags = [] |
| long_input = input.to(torch.long) |
| embeddings = weight.index_select(0, long_input) * per_sample_weights.unsqueeze(1) |
| if include_last_offset: |
| for index in range(len(offsets) - 1): |
| offset = offsets[index] |
| next_offset = offsets[index + 1] |
| length = next_offset - offset |
| if length == 0: |
| bags.append( |
| torch.tensor([0] * weight.size(1)).to( |
| dtype=embeddings.dtype, device=embeddings.device |
| ) |
| ) |
| else: |
| if mode == 'sum': |
| bags.append(embeddings.narrow(0, offset, length).sum(0)) |
| elif mode == 'mean': |
| bags.append(embeddings.narrow(0, offset, length).sum(0).div(length)) |
| else: |
| assert mode == 'max' |
| bags.append(embeddings.narrow(0, offset, length).max(0)[0]) |
| else: |
| for index, offset in enumerate(offsets): |
| if index + 1 < len(offsets): |
| next_offset = offsets[index + 1] |
| else: |
| next_offset = len(long_input) |
| length = next_offset - offset |
| if length == 0: |
| bags.append( |
| torch.tensor([0] * weight.size(1)).to( |
| dtype=embeddings.dtype, device=embeddings.device |
| ) |
| ) |
| else: |
| if mode == 'sum': |
| bags.append(embeddings.narrow(0, offset, length).sum(0)) |
| elif mode == 'mean': |
| bags.append(embeddings.narrow(0, offset, length).sum(0).div(length)) |
| else: |
| assert mode == 'max' |
| bags.append(embeddings.narrow(0, offset, length).max(0)[0]) |
| return torch.stack(bags) |
| |
| @skipMeta |
| @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), |
| (torch.half, torch.bfloat16, torch.float, torch.double))) |
| @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), |
| (torch.float, torch.double, torch.half))) |
| def test_EmbeddingBag_empty_per_sample_weights_and_offsets(self, device, dtypes): |
        # Test empty input and per-sample weights, and the backward pass. There
        # was a CUDA invalid-configuration bug (see #46572 for more context)
| def test_per_sample_weights(mode, trainable_scale): |
| es = nn.EmbeddingBag(5, 2, mode=mode).to(dtype=dtypes[2], device=device) |
| es.weight.data.copy_( |
| torch.arange(1, 11, device=device).view_as(es.weight).to(dtypes[2])) |
| input = torch.tensor([], device=device, dtype=dtypes[0]) |
| offsets = torch.tensor([0, 0, 0, 0, 0], device=device, dtype=dtypes[1]) |
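            # empty input with all-zero offsets yields five empty bags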
| per_sample_weights = torch.randn_like(input, dtype=dtypes[2]) \ |
| .requires_grad_(trainable_scale) |
| ref_per_sample_weights = \ |
| per_sample_weights.detach().requires_grad_(trainable_scale) |
| reference_weights = es.weight.detach().requires_grad_() |
| |
| expected = self._embedding_bag_reference_impl( |
| input, reference_weights, offsets, mode, ref_per_sample_weights) |
| result = es(input, offsets, per_sample_weights) |
| self.assertEqual(result, expected, atol=dtype2prec_DONTUSE[dtypes[2]], rtol=0) |
| |
| grad = torch.randn_like(expected) |
| result.backward(grad) |
            # the reference impl has no grad fn for empty input, but the grad
            # should simply be a zero tensor
| ref_weights_grad = torch.zeros_like(es.weight) |
| self.assertEqual(es.weight.grad, ref_weights_grad, |
| atol=dtype2prec_DONTUSE[dtypes[2]], rtol=0) |
| if trainable_scale: |
| ref_per_sample_weights_grad = torch.empty_like(per_sample_weights) |
| self.assertEqual(per_sample_weights.grad, ref_per_sample_weights_grad, |
| atol=dtype2prec_DONTUSE[dtypes[2]], rtol=0) |
| |
| modes = ('sum',) |
| trainable_scale = (True, False) |
| for mode, trainable in itertools.product(modes, trainable_scale): |
| test_per_sample_weights(mode, trainable) |
| |
| @skipMeta |
| @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), |
| (torch.float, torch.double, torch.half, torch.bfloat16))) |
| @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), |
| (torch.float, torch.double, torch.half))) |
| def test_EmbeddingBag_per_sample_weights_and_offsets(self, device, dtypes): |
| def test_per_sample_weights(mode, trainable_scale): |
| es = nn.EmbeddingBag(5, 2, mode=mode).to(dtype=dtypes[2], device=device) |
| es.weight.data.copy_( |
| torch.arange(1, 11, device=device).view_as(es.weight).to(dtypes[2])) |
| input = torch.tensor([3, 1, 1, 1, 4, 0], device=device, dtype=dtypes[0]) |
| offsets = torch.tensor([0, 0, 3, 3, 6], device=device, dtype=dtypes[1]) |
| per_sample_weights = torch.randn_like(input, dtype=dtypes[2]) \ |
| .requires_grad_(trainable_scale) |
| ref_per_sample_weights = \ |
| per_sample_weights.detach().requires_grad_(trainable_scale) |
| reference_weights = es.weight.detach().requires_grad_() |
| |
| expected = self._embedding_bag_reference_impl( |
| input, reference_weights, offsets, mode, ref_per_sample_weights) |
| result = es(input, offsets, per_sample_weights) |
| self.assertEqual(result, expected, atol=dtype2prec_DONTUSE[dtypes[2]], rtol=0) |
| |
| grad = torch.randn_like(expected).to(dtype=dtypes[2], device=device) |
| result.backward(grad) |
| expected.backward(grad) |
| self.assertEqual(es.weight.grad, reference_weights.grad, |
| atol=dtype2prec_DONTUSE[dtypes[2]], rtol=0) |
| if trainable_scale: |
| self.assertEqual(per_sample_weights.grad, ref_per_sample_weights.grad, |
| atol=dtype2prec_DONTUSE[dtypes[2]], rtol=0) |
| |
| modes = ('sum',) |
| trainable_scale = (True, False) |
| for mode, trainable in itertools.product(modes, trainable_scale): |
| test_per_sample_weights(mode, trainable) |
| |
| @skipMeta |
| @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), |
| (torch.float, torch.double, torch.half, torch.bfloat16))) |
| @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), |
| (torch.float, torch.double, torch.half))) |
| def test_EmbeddingBag_per_sample_weights_and_new_offsets(self, device, dtypes): |
| def test_per_sample_weights_new_offsets(mode, trainable_scale, include_last_offset, has_weight=True): |
| es = nn.EmbeddingBag(5, 2, mode=mode, include_last_offset=include_last_offset).to(dtype=dtypes[2], device=device) |
| es.weight.data.copy_( |
| torch.arange(1, 11, device=device).view_as(es.weight).to(dtypes[2])) |
| input = torch.tensor([3, 1, 1, 1, 4, 0], device=device, dtype=dtypes[0]) |
| offsets = torch.tensor([0, 0, 3, 3, 6], device=device, dtype=dtypes[1]) |
| |
| if include_last_offset: |
| offsets = torch.cat((offsets, torch.tensor([input.size(0)], device=device, dtype=dtypes[1])), 0) |
| |
| if has_weight: |
| per_sample_weights = torch.randn_like(input, device=device, dtype=dtypes[2]) \ |
| .requires_grad_(trainable_scale) |
| ref_per_sample_weights = \ |
| per_sample_weights.detach().requires_grad_(trainable_scale) |
| else: |
| per_sample_weights = None |
| ref_per_sample_weights = None |
| |
| reference_weights = es.weight.detach().requires_grad_() |
| |
| expected = self._embedding_bag_reference_impl( |
| input, reference_weights, offsets, mode, ref_per_sample_weights, include_last_offset) |
| result = es(input, offsets, per_sample_weights) |
| self.assertEqual(result, expected, atol=dtype2prec_DONTUSE[dtypes[2]], rtol=0) |
| |
| grad = torch.randn_like(expected) |
| result.backward(grad) |
| expected.backward(grad) |
| self.assertEqual(es.weight.grad, reference_weights.grad, |
| atol=dtype2prec_DONTUSE[dtypes[2]], rtol=0) |
| if has_weight and trainable_scale: |
| self.assertEqual(per_sample_weights.grad, ref_per_sample_weights.grad, |
| atol=dtype2prec_DONTUSE[dtypes[2]], rtol=0) |
| |
| trainable_scale = (True, False) |
| include_last_offset_list = (True, False) |
| modes = (('sum', False), ('sum', True), ('max', False), ('mean', False)) |
| for (mode, has_weight), trainable, include_last_offset in itertools.product( |
| modes, trainable_scale, include_last_offset_list |
| ): |
| test_per_sample_weights_new_offsets( |
| mode, trainable, include_last_offset, has_weight |
| ) |
| |
| def _test_EmbeddingBag_vs_Embedding(self, N, D, B, L, max_norm=None, |
| mode='mean', |
| device='cpu', |
| wdtype=torch.float, |
| dtype=torch.long, |
| test_per_sample_weights=False, |
| trainable_per_sample_weights=False, |
| sparse=False, |
| test_backward=True, |
| backward_prec=None): |
| es = nn.EmbeddingBag(N, D, mode=mode, sparse=sparse, max_norm=max_norm).to(device, wdtype) |
| e = nn.Embedding(N, D, max_norm=max_norm).to(device, wdtype) |
| e.weight.data.copy_(es.weight) |
| input = torch.randint(N, (B, L), device=device, dtype=dtype) |
| offsets = torch.arange(0, B, device=device, dtype=dtype).mul_(L) |
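        # Constant bag length: offsets [0, L, 2L, ...] split the flattened
        # input into B bags of L indices each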
| grad_output = torch.rand(B, D, device=device, dtype=wdtype) |
| |
| if test_per_sample_weights: |
| # To prevent large gradients, weights should sum to 1 for each bag |
| per_sample_weights = \ |
| torch.randn(B, L, device=device, dtype=wdtype).softmax(dim=-1) |
| per_sample_weights_reference = \ |
| per_sample_weights.clone().requires_grad_(trainable_per_sample_weights) |
| per_sample_weights.requires_grad_(trainable_per_sample_weights) |
| output = es(input.view(-1), offsets, per_sample_weights.view(-1)) |
| else: |
| output = es(input.view(-1), offsets) |
| per_sample_weights = None |
| per_sample_weights_reference = None |
| |
| if mode == 'sum': |
| if test_per_sample_weights: |
| ref_output = (e(input) * per_sample_weights_reference.unsqueeze(-1)).sum(1) |
| else: |
| ref_output = e(input).sum(1) |
| elif mode == 'mean': |
| assert not test_per_sample_weights |
| ref_output = e(input).mean(1) |
| elif mode == 'max': |
| assert not test_per_sample_weights |
| ref_output = e(input).max(1)[0] |
| |
| self.assertEqual(output, ref_output, atol=dtype2prec_DONTUSE[wdtype], rtol=0) |
| |
| if not test_backward: |
| return |
| |
| output.backward(grad_output) |
| ref_output.backward(grad_output) |
| es_weight_grad = es.weight.grad |
| if sparse: |
| es_weight_grad = es.weight.grad.to_dense() |
| |
        # There is more floating-point error here because we are dealing with larger numbers
| if backward_prec is None: |
| needed_prec = dtype2prec_DONTUSE[wdtype] * 5 |
| rtol = 0.02 if wdtype == torch.half else 0 |
| else: |
| needed_prec = backward_prec |
| rtol = 0 |
| |
| self.assertEqual(es_weight_grad, e.weight.grad, atol=needed_prec, rtol=rtol) |
| |
| if test_per_sample_weights and trainable_per_sample_weights: |
| self.assertEqual(per_sample_weights.grad, per_sample_weights_reference.grad, |
| atol=dtype2prec_DONTUSE[wdtype], rtol=0) |
| |
| @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.half, torch.float, torch.double))) |
| @dtypes(*itertools.product((torch.int, torch.long), (torch.float, torch.double))) |
| def test_EmbeddingBag_per_sample_weights_and_no_offsets(self, device, dtypes): |
| def run_tests(mode, sparse, trainable_per_sample_weights): |
| kwargs = dict(test_per_sample_weights=True, device=device, |
| mode=mode, wdtype=dtypes[1], dtype=dtypes[0], sparse=sparse, |
| trainable_per_sample_weights=trainable_per_sample_weights) |
| |
| # Simple case |
| self._test_EmbeddingBag_vs_Embedding(2, 3, 5, 7, **kwargs) |
| |
| # B * L > 1000 |
| self._test_EmbeddingBag_vs_Embedding(2, 5, 53, 23, **kwargs) |
| |
            # Large num_embeddings
| self._test_EmbeddingBag_vs_Embedding(101, 5, 3, 7, **kwargs) |
| |
| # Large embedding_dim |
| self._test_EmbeddingBag_vs_Embedding(2, 101, 3, 7, **kwargs) |
| |
| modes = ('sum',) |
| sparsity = (True, False) |
| trainable_scale = (True, False) |
| for mode, sparse, trainable_per_sample_weights in \ |
| itertools.product(modes, sparsity, trainable_scale): |
| run_tests(mode, sparse, trainable_per_sample_weights) |
| |
        # Also test the CUDA dense path with half precision
| if device == 'cuda': |
| modes = ('sum',) |
| sparsity = (False,) |
| trainable_scale = (True, False) |
| for mode, sparse, trainable_per_sample_weights in \ |
| itertools.product(modes, sparsity, trainable_scale): |
| run_tests(mode, sparse, trainable_per_sample_weights) |
| |
| def _test_EmbeddingBag( |
| self, |
| device, |
| mode, |
| sparse, |
| wdtype=torch.double, |
| dtype=torch.long, |
| odtype=torch.long, |
| test_backward=True, |
| ): |
| # check a known test example |
| es = nn.EmbeddingBag(5, 2, mode=mode, sparse=sparse).to(device, wdtype) |
| es.weight.data.copy_(torch.arange(1, 11, device=device).view_as(es.weight).to(wdtype)) |
| input = torch.tensor([3, 1, 1, 1, 4, 0], device=device, dtype=dtype) |
| offsets = torch.tensor([0, 0, 3, 3, 6], device=device, dtype=odtype) |
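        # offsets [0, 0, 3, 3, 6] define five bags over the six indices;
        # bags 0, 2, and 4 are empty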
| |
| grad_output = torch.tensor( |
| [1, 2, |
| 3, 4], device=device, dtype=wdtype).view(2, 2) |
| grad_output_with_empty = torch.tensor( |
| [99, 99, |
| 1, 2, |
| 99, 99, |
| 3, 4, |
| 99, 99], device=device, dtype=wdtype).view(5, 2) |
| |
| if mode == "sum" or mode == "mean": |
| denominator = 1 if mode == "sum" else 3 |
| expected_output = torch.tensor( |
| [[13, 16], |
| [13, 16]], device=device, dtype=wdtype) / denominator |
| |
| expected_output_with_empty = torch.tensor( |
| [[0, 0], |
| [13, 16], |
| [0, 0], |
| [13, 16], |
| [0, 0]], device=device, dtype=wdtype) / denominator |
| |
| expected_grad_weight = torch.tensor( |
| [[3, 4], |
| [5, 8], |
| [0, 0], |
| [1, 2], |
| [3, 4]], device=device, dtype=wdtype) / denominator |
| elif mode == "max": |
| expected_output = torch.tensor( |
| [[7, 8], |
| [9, 10]], device=device, dtype=wdtype) |
| |
| expected_output_with_empty = torch.tensor( |
| [[0, 0], |
| [7, 8], |
| [0, 0], |
| [9, 10], |
| [0, 0]], device=device, dtype=wdtype) |
| |
| expected_grad_weight = torch.tensor( |
| [[0, 0], |
| [0, 0], |
| [0, 0], |
| [1, 2], |
| [3, 4]], device=device, dtype=wdtype) |
| output = es(input, offsets) |
| output.backward(grad_output_with_empty) |
| |
| es_weight_grad = es.weight.grad |
| if sparse: |
| es_weight_grad = es.weight.grad.to_dense() |
| self.assertEqual(output, expected_output_with_empty) |
| self.assertEqual(es_weight_grad, expected_grad_weight, atol=dtype2prec_DONTUSE[wdtype], rtol=0) |
| |
| # check same example except as 2D (2 x 3) |
| input = input.view(2, -1) |
| es.zero_grad() |
| output = es(input) |
| output.backward(grad_output) |
| |
| es_weight_grad = es.weight.grad |
| if sparse: |
| es_weight_grad = es.weight.grad.to_dense() |
| self.assertEqual(output, expected_output) |
| self.assertEqual(es_weight_grad, expected_grad_weight, atol=dtype2prec_DONTUSE[wdtype], rtol=0) |
| |
| # test all empty bags |
| es.zero_grad() |
| inputs = torch.tensor([], dtype=dtype, device=device) |
| offsets = torch.tensor([0, 0, 0, 0], dtype=odtype, device=device) |
| es(inputs, offsets).sum().backward() |
| dense_grad = es.weight.grad |
| if dense_grad.is_sparse: |
| dense_grad = dense_grad.to_dense() |
| self.assertEqual(dense_grad, torch.zeros_like(es.weight)) |
| |
| # now compare EmbeddingBag vs Embedding + Sum/Mean, for constant bag length |
| N, D, B, L = random.randint(1, 100), random.randint(1, 100), random.randint(1, 50), random.randint(1, 50) |
| kwargs = dict(mode=mode, sparse=sparse, device=device, wdtype=wdtype, dtype=dtype, test_backward=test_backward) |
| self._test_EmbeddingBag_vs_Embedding(N, D, B, L, **kwargs) |
| for max_norm in (None, 3): |
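            # exhaustively sweep tiny (N, D, B, L) combinations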
| for p in itertools.product([1, 2], repeat=4): |
| self._test_EmbeddingBag_vs_Embedding(*p, max_norm=max_norm, **kwargs) |
| |
        # check that illegal input combinations raise errors
| es = nn.EmbeddingBag(10, 20, mode=mode, sparse=sparse) |
| input = torch.ones(3, 4, dtype=dtype) |
| offset = torch.arange(0, 3, dtype=odtype) |
| torch._dynamo.disable(self.assertRaises)(ValueError, lambda: es(input, offset)) |
| torch._dynamo.disable(self.assertRaises)(ValueError, lambda: es(input.view(-1))) |
| offset[0] = 1 |
| if self.device_type == "cpu": |
| torch._dynamo.disable(self.assertRaises)(RuntimeError, lambda: es(input.view(-1), offset)) |
| offset[0] = 0 |
| offset[-1] = 100 |
| torch._dynamo.disable(self.assertRaises)(RuntimeError, lambda: es(input.view(-1), offset)) |
| |
| @skipMeta |
| @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), |
| (torch.float, torch.double, torch.half, torch.bfloat16))) |
| @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), |
| (torch.float, torch.double, torch.half))) |
| def test_embedding_bag_device(self, device, dtypes): |
| if IS_JETSON and torch.bfloat16 in dtypes and device == "cpu": |
| self.skipTest("bfloat16 not supported with Jetson cpu") |
| with set_default_dtype(torch.double): |
| self._test_EmbeddingBag(device, 'sum', False, wdtype=dtypes[2], dtype=dtypes[0], odtype=dtypes[1]) |
| self._test_EmbeddingBag(device, 'mean', False, wdtype=dtypes[2], dtype=dtypes[0], odtype=dtypes[1]) |
| self._test_EmbeddingBag(device, 'max', False, wdtype=dtypes[2], dtype=dtypes[0], odtype=dtypes[1]) |
| |
| test_backward = False |
| if self.device_type == 'cuda': |
            # skip the backward test for half precision; see the TODO below.
| test_backward = dtypes[2] is not torch.float16 |
| elif self.device_type == 'cpu': |
| # TODO: figure out why precision on sparse embeddings isn't the |
| # same as for dense. |
| test_backward = dtypes[2] is not torch.float and dtypes[2] is not torch.float16 |
| |
| self._test_EmbeddingBag( |
| device, |
| 'sum', |
| True, |
| wdtype=dtypes[2], |
| dtype=dtypes[0], |
| odtype=dtypes[1], |
| test_backward=test_backward, |
| ) |
| self._test_EmbeddingBag( |
| device, |
| 'mean', |
| True, |
| wdtype=dtypes[2], |
| dtype=dtypes[0], |
| odtype=dtypes[1], |
| test_backward=test_backward, |
| ) |
| |
| @skipMeta |
| @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long), |
| (torch.float, torch.double, torch.half, torch.bfloat16))) |
| @dtypesIfCUDA(*itertools.product((torch.int, torch.long), (torch.int, torch.long), |
| (torch.float, torch.double, torch.half))) |
| def test_embedding_bag_non_contiguous_weight(self, device, dtypes): |
| weight_tensor = torch.randn(3, 4, dtype=dtypes[2], device=device) |
| |
| weight_tensor_non_contig = weight_tensor[:, :3] # This is non-contiguous strided. |
| weight_tensor_contig = weight_tensor_non_contig.clone().contiguous() # Contig-strided. |
| |
| index = torch.tensor([0, 1, 2], dtype=dtypes[0], device=device) |
| offsets = torch.tensor([0, 2], dtype=dtypes[1], device=device) |
| for mode in ['sum', 'mean', 'max']: |
| output_non_contig = F.embedding_bag( |
| input=index, |
| weight=weight_tensor_non_contig, |
| offsets=offsets, |
| mode=mode, |
| ) |
| output_contig = F.embedding_bag( |
| input=index, |
| weight=weight_tensor_contig, |
| offsets=offsets, |
| mode=mode, |
| ) |
| self.assertEqual(output_non_contig, output_contig) |
| |
| @onlyNativeDeviceTypes # currently fails on XLA |
| @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long))) |
| def test_embedding_bag_bfloat16(self, device, dtypes): |
| with set_default_dtype(torch.double): |
| self._test_EmbeddingBag(device, 'sum', True, |
| wdtype=torch.bfloat16, dtype=dtypes[0], |
| odtype=dtypes[1], test_backward=True) |
| self._test_EmbeddingBag(device, 'mean', True, |
| wdtype=torch.bfloat16, dtype=dtypes[0], |
| odtype=dtypes[1], test_backward=True) |
| |
| @onlyNativeDeviceTypes # currently fails on XLA |
| @dtypes(*itertools.product((torch.int, torch.long), (torch.int, torch.long))) |
| def test_embedding_bag_half(self, device, dtypes): |
| self._test_EmbeddingBag(device, 'sum', True, wdtype=torch.float16, dtype=dtypes[0], odtype=dtypes[1], test_backward=True) |
| |
| |
| instantiate_device_type_tests(TestEmbeddingNNDeviceType, globals()) |
| instantiate_parametrized_tests(TestEmbeddingNN) |
| |
| if __name__ == '__main__': |
| run_tests() |