| # Owner(s): ["module: nn"] |
| |
import contextlib
import math
import random
import unittest
import io
import itertools
import warnings
import pickle
import re
from copy import deepcopy
from itertools import product
from functools import partial
from collections import OrderedDict
from unittest import SkipTest

import torch
from torch import inf, nan
import torch.autograd.forward_ad as fwAD
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from torch.nn.utils import clip_grad_norm_, clip_grad_value_
from torch.nn.utils import parameters_to_vector, vector_to_parameters
from torch.nn.utils.fusion import fuse_conv_bn_weights
from torch.nn.utils.fusion import fuse_linear_bn_weights
from torch.nn import Buffer, Parameter
from torch.nn.parallel._functions import Broadcast
from torch.testing._internal.common_dtype import integral_types, get_all_math_dtypes, floating_types
from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \
    TEST_NUMPY, TEST_SCIPY, TEST_WITH_ROCM, \
    download_file, get_function_arglist, load_tests, skipIfMps, \
    IS_PPC, \
    parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \
    skipIfTorchDynamo, gcIfJetson, set_default_dtype
from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, PLATFORM_SUPPORTS_FLASH_ATTENTION
from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \
    module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \
    ctcloss_reference, new_module_tests, single_batch_reference_fn, _test_bfloat16_ops, _test_module_empty_input
from torch.testing._internal.common_device_type import dtypesIfMPS, instantiate_device_type_tests, dtypes, \
    dtypesIfCUDA, precisionOverride, skipCUDAIfCudnnVersionLessThan, onlyCUDA, onlyCPU, \
    skipCUDAIfRocm, skipCUDAIf, skipCUDAIfNotRocm, \
    onlyNativeDeviceTypes, deviceCountAtLeast, largeTensorTest, expectedFailureMeta, expectedFailureMPS, \
    skipMeta, get_all_device_types

from hypothesis import given
import torch.testing._internal.hypothesis_utils as hu
from torch.testing._internal.common_utils import _assertGradAndGradgradChecks, gradcheck, gradgradcheck, \
    GRADCHECK_NONDET_TOL
from torch.testing._internal.common_utils import dtype2prec_DONTUSE
from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32, tf32_off, tf32_on
from torch.types import _TensorOrTensors
from torch.testing._internal.common_mkldnn import bf32_on_and_off
| |
# Module-level test configuration.
#
# TEST_WITH_ROCM, TEST_SCIPY and TEST_NUMPY are feature flags that must be
# imported from torch.testing._internal.common_utils at the top of the file;
# without that import this section raises NameError when the module loads.

# Truthy when TEST_WITH_ROCM is set or when tf32_is_not_fp32() reports True.
AMPERE_OR_ROCM = TEST_WITH_ROCM or tf32_is_not_fp32()

# load_tests from common_utils is used to automatically filter tests for
# sharding on sandcastle. This line silences flake warnings
load_tests = load_tests

# Optional dependencies: only imported when the matching flag is set.
if TEST_SCIPY:
    import scipy.signal
    import scipy.ndimage

if TEST_NUMPY:
    import numpy as np
| |
| |
| # WARNING: If you add a new top-level test case to this file, you MUST |
| # update test/run_test.py to list it, otherwise it will NOT be run in |
| # CI. |
| |
| class TestNN(NNTestCase): |
| _do_cuda_memory_leak_check = True |
| _do_cuda_non_default_stream = True |
| |
def _forward(self, module, input: _TensorOrTensors):
    """Call ``module`` on ``input`` inside a ``freeze_rng_state()`` context.

    A tuple ``input`` is unpacked into positional arguments; any other value
    is passed as the single positional argument. Returns the module output.
    """
    with freeze_rng_state():
        call_args = input if isinstance(input, tuple) else (input,)
        return module(*call_args)
| |
| def _backward(self, module, input: _TensorOrTensors, output, grad_output, create_graph=False): |
| output.backward(grad_output, retain_graph=True, create_graph=create_graph) |
| if isinstance(input, tuple): |
| return tuple(i.grad.data if i.grad is not None else None for i in input) |
| else: |
| return input.grad.data if input.grad is not None else None |
| |
| def _forward_criterion(self, criterion, input, target, extra_args=None): |
| if extra_args is None: |
| extra_args = () |
| if isinstance(input, tuple): |
| args = input + (target,) + extra_args |
| output = criterion(*args) |
| else: |
| output = criterion(input, target, *extra_args) |
| return output |
| |
| def _backward_criterion(self, criterion, input, output, target, gradOutput=None, extra_args=None): |
| if extra_args is None: |
| extra_args = () |
| input_tuple = input if isinstance(input, tuple) else (input,) |
| output_tuple = output if isinstance(output, tuple) else (output,) |
| for i in input_tuple: |
| if i.grad is not None: |
| i.grad.data.zero_() |
| args = input_tuple + (target,) + extra_args |
| if gradOutput is None: |
| gradOutput = torch.ones(()) |
| criterion(*args).backward(gradOutput.to(output_tuple[0])) |
| if isinstance(input, tuple): |
| return tuple(i.grad.data for i in input) |
| else: |
| return input.grad.data |
| |
| def _zero_grad_parameters(self, module): |
| for p in module.parameters(): |
| if p.grad is not None: |
| with torch.no_grad(): |
| p.grad.zero_() |
| p.grad.detach_() |
| |
| def _get_parameters(self, module): |
| params = [] |
| d_params = [] |
| for p in module.parameters(): |
| params.append(p) |
| d_params.append(p.grad) |
| return params, d_params |
| |
| def test_parse_to(self): |
| # Test for buggy use of THPMemoryFormat_New |
| self.assertEqual( |
| repr(torch._C._nn._parse_to(memory_format=torch.contiguous_format)[3]), |
| "torch.contiguous_format" |
| ) |
| |
def test_requires_grad_(self):
    """``Module.requires_grad_`` toggles parameters only and returns the module.

    Buffers must keep ``requires_grad == False`` for both settings.
    """
    m = _create_basic_net()[-1]
    buffers = list(m.buffers())
    parameters = list(m.parameters())

    # Sanity-check the fixture. ``all(...)`` already yields a bool, so it is
    # asserted directly rather than being compared against 0.
    assert len(buffers) > 0, 'invalid test'
    assert all(not b.requires_grad for b in buffers), 'invalid test'
    assert len(parameters) > 0, 'invalid test'
    assert all(p.requires_grad for p in parameters), 'invalid test'

    for requires_grad in (False, True):
        self.assertIs(m.requires_grad_(requires_grad), m)
        for p in m.parameters():
            self.assertEqual(p.requires_grad, requires_grad)
        for b in m.buffers():
            self.assertFalse(b.requires_grad)
| |
def test_module_backcompat(self):
    """A legacy pickled ``Linear`` module still loads and runs forward."""
    from torch.serialization import SourceChangeWarning

    checkpoint_url = 'https://download.pytorch.org/test_data/linear.pt'
    path = download_file(checkpoint_url)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', SourceChangeWarning)
        # weights_only=False as this is legacy code that saves the model
        # NOTE: this unpickles arbitrary objects from the downloaded file.
        legacy_module = torch.load(path, weights_only=False)
    sample = torch.randn(2, 3, dtype=torch.float)
    self.assertEqual(legacy_module(sample).size(), (2, 5))
| |
| def test_module_super_init(self): |
| class MyMixin: |
| def __init__(self, *a, **kw): |
| super().__init__(*a, **kw) |
| self.mixin_init = True |
| |
| class MyModuleWithMixinBefore(MyMixin, nn.Module): |
| pass |
| |
| class MyModuleWithMixinAfter(nn.Module, MyMixin): |
| pass |
| |
| self.assertTrue(hasattr(MyModuleWithMixinBefore(), 'mixin_init')) |
| self.assertFalse(hasattr(MyModuleWithMixinAfter(), 'mixin_init')) |
| |
| nn.Module.call_super_init = True |
| self.assertTrue(hasattr(MyModuleWithMixinBefore(), 'mixin_init')) |
| self.assertTrue(hasattr(MyModuleWithMixinAfter(), 'mixin_init')) |
| nn.Module.call_super_init = False |
| |
| MyModuleWithMixinBefore.call_super_init = True |
| MyModuleWithMixinAfter.call_super_init = True |
| self.assertTrue(hasattr(MyModuleWithMixinBefore(), 'mixin_init')) |
| self.assertTrue(hasattr(MyModuleWithMixinAfter(), 'mixin_init')) |
| MyModuleWithMixinBefore.call_super_init = False |
| MyModuleWithMixinAfter.call_super_init = False |
| |
| def test_share_memory(self): |
| class Net(nn.Module): |
| def __init__(self) -> None: |
| super().__init__() |
| self.p = nn.Parameter(torch.eye(5)) |
| self.par = nn.ParameterList() |
| self.par.append(nn.Parameter(torch.randn(10))) |
| |
| def forward(self, inp): |
| # NB: dead code |
| return inp.clone() |
| |
| net = Net() |
| for p in net.parameters(): |
| self.assertFalse(p.storage().is_shared()) |
| for b in net.buffers(): |
| self.assertFalse(b.storage().is_shared()) |
| net.share_memory() |
| for p in net.parameters(): |
| self.assertTrue(p.storage().is_shared()) |
| for b in net.buffers(): |
| self.assertTrue(b.storage().is_shared()) |
| |
| def test_to(self): |
| m = nn.Linear(3, 5) |
| self.assertIs(m, m.to('cpu')) |
| self.assertIs(m, m.to('cpu', dtype=torch.float32)) |
| self.assertEqual(m.double(), m.to(torch.float64)) |
| self.assertRaises(RuntimeError, lambda: m.to('cpu', copy=True)) |
| |
| if torch.cuda.is_available(): |
| for cuda in ['cuda', 'cuda:0' if torch.cuda.device_count() == 1 else 'cuda:1']: |
| m2 = m.cuda(device=cuda) |
| self.assertIs(m2, m2.to(cuda)) |
| self.assertEqual(m, m2.to('cpu')) |
| self.assertEqual(m2, m.to(cuda)) |
| self.assertIs(m2, m2.to(dtype=torch.float32)) |
| self.assertEqual(m2.double(), m2.to(dtype=torch.float64)) |
| |
def test_zero_grad(self):
    """``Module.zero_grad`` resets grads to ``None`` by default, or to zeros with ``set_to_none=False``."""
    i = torch.randn(2, 5, requires_grad=True)
    module = nn.Linear(5, 5)
    weight, bias = module.weight, module.bias

    def accumulate():
        module(i).sum().backward()

    def grad_mass(param):
        return param.grad.data.abs().sum()

    # zero_grad must tolerate a module with no trainable parameters.
    for p in module.parameters():
        p.requires_grad = False
    module.zero_grad()

    weight.requires_grad = True
    module.zero_grad()
    self.assertIsNone(weight.grad)  # uninitialized grad

    accumulate()
    self.assertIsNotNone(weight.grad)
    self.assertGreater(grad_mass(weight), 0)
    module.zero_grad()
    self.assertIsNone(weight.grad)

    bias.requires_grad = True
    module.zero_grad()
    self.assertIsNone(weight.grad)
    self.assertIsNone(bias.grad)
    accumulate()
    self.assertIsNotNone(weight.grad)
    self.assertIsNotNone(bias.grad)
    self.assertGreater(grad_mass(weight), 0)
    self.assertGreater(grad_mass(bias), 0)
    module.zero_grad(set_to_none=False)  # Force set to zeros.
    self.assertEqual(weight.grad.data, weight.data.clone().zero_())
    self.assertEqual(bias.grad.data, bias.data.clone().zero_())

    module.zero_grad()
    self.assertIsNone(weight.grad)
    self.assertIsNone(bias.grad)
| |
| def test_no_grad(self): |
| for dtype in [torch.bfloat16, torch.float, torch.double]: |
| module = nn.Conv2d(2, 5, kernel_size=3, padding=1).to(dtype) |
| input = torch.randn(1, 2, 10, 10).to(dtype) |
| x = input |
| y = input.clone() |
| |
| output = module(x) |
| self.assertTrue(output.requires_grad) |
| output.backward(torch.ones(1, 5, 10, 10)) |
| |
| with torch.no_grad(): |
| output2 = module(y) |
| self.assertFalse(output2.requires_grad) |
| self.assertRaises(RuntimeError, lambda: output2.backward(torch.ones(1, 5, 10, 10))) |
| |
def test_parameters_and_named_parameters(self):
    """``parameters()`` and ``named_parameters()`` agree on count and names, with and without recursion."""
    l, n, s = _create_basic_net()

    # (module, kwargs for the parameter iterators, expected names in order)
    cases = [
        (l, {}, ['layer_dummy_param']),
        (n, {}, ['dummy_param', 'l1.layer_dummy_param']),
        (n, {'recurse': False}, ['dummy_param']),
        (s, {}, ['0.dummy_param', '0.l1.layer_dummy_param']),
    ]
    for net, kwargs, expected_names in cases:
        self.assertEqual(len(list(net.parameters(**kwargs))), len(expected_names))
        found_names = [name for name, _ in net.named_parameters(**kwargs)]
        self.assertEqual(found_names, expected_names)
| |
| def test_named_parameters_remove_duplicate(self): |
| def names(named_parameters): |
| return [k for k, _ in named_parameters] |
| |
| class M1(nn.Module): |
| def __init__(self) -> None: |
| super().__init__() |
| self.param1 = nn.Parameter(torch.empty(3, 3)) |
| self.param2 = self.param1 |
| |
| m1 = M1() |
| self.assertEqual(names(m1.named_parameters()), |
| ["param1"]) |
| self.assertEqual(names(m1.named_parameters(remove_duplicate=False)), |
| ["param1", "param2"]) |
| |
| class M2(nn.Module): |
| def __init__(self) -> None: |
| super().__init__() |
| self.mod1 = nn.Linear(3, 4, bias=False) |
| self.mod2 = self.mod1 |
| |
| m2 = M2() |
| self.assertEqual(names(m2.named_parameters()), |
| ["mod1.weight"]) |
| self.assertEqual(names(m2.named_parameters(remove_duplicate=False)), |
| ["mod1.weight", "mod2.weight"]) |
| |
def test_buffers_and_named_buffers(self):
    """``buffers()`` and ``named_buffers()`` agree on count and names, including duplicate handling."""
    def buffer_names(module, **kwargs):
        return [name for name, _ in module.named_buffers(**kwargs)]

    l, n, s = _create_basic_net()

    # (module, kwargs for the buffer iterators, expected names in order)
    cases = [
        (l, {}, ['layer_dummy_buf']),
        (n, {}, ['dummy_buf', 'l1.layer_dummy_buf']),
        (n, {'recurse': False}, ['dummy_buf']),
        (s, {}, ['0.dummy_buf', '0.l1.layer_dummy_buf']),
    ]
    for net, kwargs, expected_names in cases:
        self.assertEqual(len(list(net.buffers(**kwargs))), len(expected_names))
        self.assertEqual(buffer_names(net, **kwargs), expected_names)

    # test remove_duplicate
    class M(nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.buffer1 = Buffer(torch.empty(3, 5))
            self.buffer2 = self.buffer1

    m = M()
    self.assertEqual(buffer_names(m), ["buffer1"])
    self.assertEqual(buffer_names(m, remove_duplicate=False), ["buffer1", "buffer2"])
| |
| def test_buffer_bad_module_subclass(self): |
| class MyBadModule(nn.Linear): |
| def __init__(self) -> None: |
| super().__init__(2, 2) |
| self.bar = Buffer(torch.rand(2, 2)) |
| |
| def register_buffer(self, name, value): |
| # persistent is explicitly missing! |
| super().register_buffer(name, value, True) |
| |
| foo = MyBadModule() |
| self.assertIsNotNone(foo.bar) |
| |
| def test_call_supports_python_dict_output(self): |
| class Net(nn.Module): |
| def __init__(self) -> None: |
| super().__init__() |
| self.l1 = nn.Linear(10, 20) |
| self.register_backward_hook(self.hook) |
| self.check_backward_hook_flag = False |
| |
| def hook(self, module, grad_out, grad_in): |
| self.check_backward_hook_flag = True |
| |
| def forward(self, inputs): |
| return {"output": self.l1(inputs).sum()} |
| |
| net = Net() |
| model_output = net(torch.randn([5, 10])) |
| model_output["output"].backward() |
| self.assertTrue(net.check_backward_hook_flag) |
| |
| def test_children(self): |
| l1 = nn.Linear(2, 2) |
| l2 = nn.Linear(2, 2) |
| l3 = nn.Linear(2, 2) |
| l4 = nn.Linear(2, 2) |
| subnet = nn.Sequential(l3, l4) |
| s = nn.Sequential(l1, l2, l1, l2, subnet) |
| self.assertEqual(list(s.children()), [l1, l2, subnet]) |
| |
| def test_train_errors_for_invalid_mode(self): |
| class SubclassNet(nn.Module): |
| def __init__(self) -> None: |
| super().__init__() |
| self.l1 = nn.Linear(2, 2) |
| |
| def forward(self, inputs): |
| return self.l1(inputs) |
| |
| subclass_net = SubclassNet() |
| sequential_net = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2)) |
| |
| error_modes = ["invalid_str", torch.device('cpu')] |
| modules_to_check = [subclass_net, sequential_net] |
| |
| for error_mode, module in itertools.product(error_modes, modules_to_check): |
| with self.assertRaises(ValueError): |
| module.train(error_mode) |
| |
| def test_dir(self): |
| linear = nn.Linear(2, 2) |
| linear._test_submodule = nn.Linear(2, 2) |
| linear._test_parameter = Parameter(torch.empty(2, 2)) |
| linear._test_buffer = Buffer(torch.empty(2, 2)) |
| keys = dir(linear) |
| self.assertIn('_test_submodule', keys) |
| self.assertIn('_test_parameter', keys) |
| self.assertIn('_test_buffer', keys) |
| |
| for key in keys: |
| self.assertTrue(hasattr(linear, key)) |
| |
| def test_repr(self): |
| # no extra information or sub-modules |
| empty_sequential = nn.Sequential() |
| expected_repr_empty = 'Sequential()' |
| self.assertEqual(repr(empty_sequential), expected_repr_empty) |
| |
| # one liner extra information |
| linear = nn.Linear(1, 1) |
| expected_repr_linear = 'Linear(in_features=1, out_features=1, bias=True)' |
| self.assertEqual(repr(linear), expected_repr_linear) |
| |
| # sub-modules repr |
| sequential = nn.Sequential(linear) |
| expected_repr_sequential = 'Sequential(\n' \ |
| ' (0): Linear(in_features=1, out_features=1, bias=True)\n' \ |
| ')' |
| self.assertEqual(repr(sequential), expected_repr_sequential) |
| |
| def test_dir_digit(self): |
| model = nn.Sequential(nn.Linear(2, 2)) |
| keys = dir(model) |
| self.assertNotIn('0', keys) |
| |
| def test_named_children(self): |
| l1 = nn.Linear(2, 2) |
| l2 = nn.Linear(2, 2) |
| l3 = nn.Linear(2, 2) |
| l4 = nn.Linear(2, 2) |
| subnet = nn.Sequential(l3, l4) |
| s = nn.Sequential() |
| with self.assertRaises(KeyError): |
| s.add_module('', l1) |
| with self.assertRaises(KeyError): |
| s.add_module('name.with.dot', l1) |
| s.add_module('layer1', l1) |
| s.add_module('layer2', l2) |
| s.add_module('layer3', l1) |
| s.add_module('layer4', l2) |
| s.add_module('subnet', subnet) |
| self.assertEqual(list(s.named_children()), [('layer1', l1), ('layer2', l2), ('subnet', subnet)]) |
| |
| def test_modules(self): |
| class Net(nn.Module): |
| def __init__(self) -> None: |
| super().__init__() |
| self.l1 = l |
| self.l2 = l |
| self.param = torch.empty(3, 5) |
| |
| l = nn.Linear(10, 20) |
| n = Net() |
| s = nn.Sequential(n, n, n, n) |
| self.assertEqual(list(s.modules()), [s, n, l]) |
| |
| def test_named_modules(self): |
| class Net(nn.Module): |
| def __init__(self) -> None: |
| super().__init__() |
| self.l1 = l |
| self.l2 = l |
| self.param = torch.empty(3, 5) |
| self.block = block |
| l = nn.Linear(10, 20) |
| l1 = nn.Linear(10, 20) |
| l2 = nn.Linear(10, 20) |
| block = nn.Sequential() |
| block.add_module('linear1', l1) |
| block.add_module('linear2', l2) |
| n = Net() |
| s = nn.Sequential(n, n) |
| self.assertEqual(list(s.named_modules()), [('', s), ('0', n), ('0.l1', l), |
| ('0.block', block), ('0.block.linear1', l1), |
| ('0.block.linear2', l2)]) |
| # test the option to not remove duplicate module instances |
| self.assertEqual(list(s.named_modules(remove_duplicate=False)), [ |
| ('', s), ('0', n), ('0.l1', l), ('0.l2', l), |
| ('0.block', block), ('0.block.linear1', l1), |
| ('0.block.linear2', l2), |
| ('1', n), ('1.l1', l), ('1.l2', l), |
| ('1.block', block), ('1.block.linear1', l1), |
| ('1.block.linear2', l2)]) |
| |
| def test_register_buffer_raises_error_if_name_is_not_string(self): |
| m = nn.Module() |
| expected_error = 'buffer name should be a string. Got ' |
| with self.assertRaisesRegex(TypeError, expected_error + 'int'): |
| m.register_buffer(1, torch.rand(5)) |
| with self.assertRaisesRegex(TypeError, expected_error + 'NoneType'): |
| m.register_buffer(None, torch.rand(5)) |
| |
| def test_register_buffer_raises_error_if_attr_exists(self): |
| m = nn.Module() |
| m.attribute_name = 5 |
| with self.assertRaises(KeyError): |
| m.register_buffer('attribute_name', torch.rand(5)) |
| |
| with self.assertRaises(KeyError): |
| m.attribute_name = Buffer(torch.rand(5)) |
| |
| del m.attribute_name |
| m.register_parameter('attribute_name', nn.Parameter()) |
| with self.assertRaises(KeyError): |
| m.register_buffer('attribute_name', torch.rand(5)) |
| |
| del m.attribute_name |
| m.add_module('attribute_name', nn.Module()) |
| with self.assertRaises(KeyError): |
| m.register_buffer('attribute_name', torch.rand(5)) |
| |
| def test_register_buffer_raises_error_if_not_tensor(self): |
| m = nn.Module() |
| with self.assertRaises(TypeError): |
| m.register_buffer('attribute_name', 5) |
| |
def test_register_buffer_allows_overwriting_with_same_name(self):
    """An existing buffer can be replaced via ``register_buffer`` or ``Buffer`` assignment, including with ``None``."""
    m = nn.Module()
    buffer1 = torch.rand(5)
    buffer2 = buffer1 + 5
    buffer3 = None
    replacements = (buffer1, buffer2, buffer3)

    # Overwrite through the explicit registration API.
    for value in replacements:
        m.register_buffer('buffer_name', value)
        self.assertEqual(m.buffer_name, value)

    # Overwrite through attribute assignment of a Buffer.
    for value in replacements:
        m.buffer_name = Buffer(value)
        self.assertEqual(m.buffer_name, Buffer(value))
| |
def test_get_buffer(self):
    """``get_buffer`` returns buffers set by ``Buffer`` assignment and by ``register_buffer``."""
    m = nn.Module()
    assigned = torch.randn(2, 3)
    registered = torch.randn(4, 5)
    m.foo = Buffer(assigned)
    m.register_buffer('bar', registered)
    for expected, buffer_name in ((assigned, 'foo'), (registered, 'bar')):
        self.assertEqual(expected, m.get_buffer(buffer_name))
| |
def test_get_buffer_from_submodules(self):
    """``get_buffer`` resolves dotted paths through nested submodules."""
    class SubSub(nn.Module):
        def __init__(self, bar):
            super().__init__()
            self.bar = Buffer(bar)

    class Sub(nn.Module):
        def __init__(self, foo, bar):
            super().__init__()
            self.foo = Buffer(foo)
            self.subsub = SubSub(bar)

    class MyModule(nn.Module):
        def __init__(self, foo, bar):
            super().__init__()
            self.sub = Sub(foo, bar)

    foo = torch.randn(2, 3)
    bar = torch.randn(4, 5)
    root = MyModule(foo, bar)
    self.assertEqual(foo, root.get_buffer('sub.foo'))
    self.assertEqual(bar, root.get_buffer('sub.subsub.bar'))
| |
| def test_buffer_not_persistent(self): |
| m = nn.Module() |
| m.buf = nn.Buffer(torch.rand(5), persistent=False) |
| self.assertTrue(len(list(m.buffers())) == 1) |
| self.assertTrue(len(m.state_dict()) == 0) |
| |
| def test_buffer_not_persistent_del(self): |
| m = nn.Module() |
| m.buf = nn.Buffer(torch.rand(5), persistent=False) |
| del m.buf |
| self.assertTrue(len(list(m.buffers())) == 0) |
| |
| def test_buffer_not_persistent_overwrite(self): |
| m = nn.Module() |
| m.buf = nn.Buffer(torch.rand(5), persistent=False) |
| m.buf = nn.Buffer(torch.rand(5)) |
| |
| # can we overwrite a non-persistent buffer with a persistent one? |
| self.assertTrue(len(list(m.buffers())) == 1) |
| self.assertTrue(len(m.state_dict()) == 1) |
| |
| # can we overwrite a persistent buffer with a non-persistent one? |
| m.buf = nn.Buffer(torch.rand(5), persistent=False) |
| self.assertTrue(len(list(m.buffers())) == 1) |
| self.assertTrue(len(m.state_dict()) == 0) |
| |
| def test_buffer_not_persistent_assign(self): |
| m = nn.Module() |
| m.buf = nn.Buffer(torch.rand(5), persistent=False) |
| self.assertTrue(len(list(m.buffers())) == 1) |
| self.assertTrue(len(m.state_dict()) == 0) |
| |
| # Assigning None removes the buffer but if we then assign a new Tensor |
| # to the same property, it should still be marked as a buffer. |
| m.buf = None |
| self.assertTrue(len(list(m.buffers())) == 0) |
| self.assertTrue(len(m.state_dict()) == 0) |
| m.buf = torch.rand(5) |
| self.assertTrue(len(list(m.buffers())) == 1) |
| self.assertTrue(len(m.state_dict()) == 0) |
| |
| # Assigning a Parameter removes the buffer. |
| m.buf = nn.Parameter(torch.rand(5)) |
| self.assertTrue(len(list(m.buffers())) == 0) |
| self.assertTrue(len(m.state_dict()) == 1) |
| |
| def test_buffer_not_persistent_load(self): |
| m = nn.Module() |
| m.buf = nn.Buffer(torch.rand(5), persistent=False) |
| m.load_state_dict({}) |
| |
| def test_register_parameter_raises_error_if_name_is_not_string(self): |
| m = nn.Module() |
| expected_error = 'parameter name should be a string. Got ' |
| with self.assertRaisesRegex(TypeError, expected_error + 'int'): |
| m.register_parameter(1, nn.Parameter()) |
| with self.assertRaisesRegex(TypeError, expected_error + 'NoneType'): |
| m.register_parameter(None, nn.Parameter()) |
| |
| def test_register_parameter_raises_error_if_attr_exists(self): |
| m = nn.Module() |
| m.attribute_name = 5 |
| with self.assertRaises(KeyError): |
| m.register_parameter('attribute_name', nn.Parameter()) |
| |
| del m.attribute_name |
| m.register_buffer('attribute_name', torch.rand(5)) |
| with self.assertRaises(KeyError): |
| m.register_parameter('attribute_name', nn.Parameter()) |
| |
| del m.attribute_name |
| m.attribute_name = Buffer(torch.rand(5)) |
| with self.assertRaises(KeyError): |
| m.register_parameter('attribute_name', nn.Parameter()) |
| |
| del m.attribute_name |
| m.add_module('attribute_name', nn.Module()) |
| with self.assertRaises(KeyError): |
| m.register_parameter('attribute_name', nn.Parameter()) |
| |
def test_register_parameter_allows_overwriting_with_same_name(self):
    """``register_parameter`` may replace an existing parameter, including with ``None``."""
    m = nn.Module()
    param1 = nn.Parameter(torch.rand(5))
    param2 = nn.Parameter(param1.data + 5)
    param3 = None
    for replacement in (param1, param2, param3):
        m.register_parameter('param_name', replacement)
        self.assertEqual(m.param_name, replacement)
| |
| def test_add_module_raises_error_if_attr_exists(self): |
| methods_to_test = ['add_module', 'register_module'] |
| for fn in methods_to_test: |
| m = nn.Module() |
| m.attribute_name = 5 |
| with self.assertRaises(KeyError): |
| getattr(m, fn)('attribute_name', nn.Module()) |
| |
| del m.attribute_name |
| m.register_buffer('attribute_name', torch.rand(5)) |
| with self.assertRaises(KeyError): |
| getattr(m, fn)('attribute_name', nn.Module()) |
| |
| del m.attribute_name |
| m.register_parameter('attribute_name', nn.Parameter()) |
| with self.assertRaises(KeyError): |
| getattr(m, fn)('attribute_name', nn.Module()) |
| |
| @unittest.expectedFailure |
| def test_getattr_with_property(self): |
| class Model(nn.Module): |
| @property |
| def some_property(self): |
| return self.something_that_doesnt_exist |
| |
| model = Model() |
| |
| with self.assertRaisesRegex( |
| AttributeError, |
| r"'Model' object has no attribute 'something_that_doesnt_exist'"): |
| model.some_property |
| |
def test_Sequential_getitem(self):
    """``Sequential`` supports int, 0-dim tensor and slice indexing."""
    layers = [
        nn.Linear(10, 20),
        nn.Linear(20, 30),
        nn.Linear(30, 40),
        nn.Linear(40, 50),
    ]
    n = nn.Sequential(*layers)
    for position, layer in enumerate(layers):
        self.assertIs(n[position], layer)
    self.assertIs(n[torch.tensor(3, dtype=torch.int64)], layers[3])
    # Slicing must mirror slicing of the underlying list of layers.
    self.assertEqual(n[1:], nn.Sequential(*layers[1:]))
    self.assertEqual(n[3:], nn.Sequential(*layers[3:]))
    self.assertEqual(n[:-1], nn.Sequential(*layers[:-1]))
    self.assertEqual(n[:-3], nn.Sequential(*layers[:-3]))
    self.assertEqual(n[::-1], nn.Sequential(*layers[::-1]))
| |
| def test_Sequential_setitem(self): |
| l1 = nn.Linear(10, 20) |
| l2 = nn.Linear(20, 30) |
| l3 = nn.Linear(30, 40) |
| l4 = nn.Linear(40, 50) |
| n = nn.Sequential(l1, l2, l3) |
| n[0] = l4 |
| n[-1] = l4 |
| n[torch.tensor(1, dtype=torch.int16)] = l1 |
| self.assertIs(n[0], l4) |
| self.assertIs(n[1], l1) |
| self.assertIs(n[2], l4) |
| |
| def test_Sequential_setitem_named(self): |
| l1 = nn.Linear(10, 20) |
| l2 = nn.Linear(20, 30) |
| l3 = nn.Linear(30, 40) |
| l4 = nn.Linear(40, 50) |
| n = nn.Sequential(OrderedDict([ |
| ('linear1', l1), |
| ('linear2', l2), |
| ('linear3', l3), |
| ])) |
| |
| n[0] = l4 |
| n[-1] = l4 |
| self.assertEqual(n.linear1, l4) |
| self.assertEqual(n.linear3, l4) |
| |
def test_Sequential_delitem(self):
    """``del`` on a ``Sequential`` works with a single index and with a stepped slice."""
    l1, l2, l3, l4 = (
        nn.Linear(n_in, n_out)
        for n_in, n_out in ((10, 20), (20, 30), (30, 40), (40, 50))
    )
    n = nn.Sequential(l1, l2, l3, l4)

    del n[-1]
    without_last = nn.Sequential(l1, l2, l3)
    self.assertEqual(n, without_last)

    del n[1::2]
    without_odd = nn.Sequential(l1, l3)
    self.assertEqual(n, without_odd)
| |
def test_Sequential_add(self):
    """``Sequential + Sequential`` concatenates the two containers in order."""
    l1, l2, l3, l4 = (nn.Linear(width, width + 1) for width in range(1, 5))
    n = nn.Sequential(l1, l2)
    other = nn.Sequential(l3, l4)
    combined = n + other
    self.assertEqual(combined, nn.Sequential(l1, l2, l3, l4))
| |
def test_Sequential_iadd(self):
    """``+=`` extends a ``Sequential`` in place with another container's modules."""
    l1, l2, l3, l4 = (
        nn.Linear(n_in, n_out)
        for n_in, n_out in ((10, 20), (20, 30), (30, 40), (40, 50))
    )
    n = nn.Sequential(l1, l2, l3)
    n2 = nn.Sequential(l4)
    n += n2
    # n already ends with l4 here, so n2 gains it a second time.
    n2 += n
    self.assertEqual(n, nn.Sequential(l1, l2, l3, l4))
    self.assertEqual(n2, nn.Sequential(l4, l1, l2, l3, l4))
| |
def test_Sequential_mul(self):
    """``Sequential * int`` repeats the module sequence."""
    layers = [
        nn.Linear(10, 20),
        nn.Linear(20, 30),
        nn.Linear(30, 40),
        nn.Linear(40, 50),
    ]
    n = nn.Sequential(*layers)
    doubled = n * 2
    self.assertEqual(doubled, nn.Sequential(*(layers + layers)))
| |
def test_Sequential_rmul(self):
    """``int * Sequential`` repeats the module sequence."""
    layers = [
        nn.Linear(10, 20),
        nn.Linear(20, 30),
        nn.Linear(30, 40),
        nn.Linear(40, 50),
    ]
    n = nn.Sequential(*layers)
    doubled = 2 * n
    self.assertEqual(doubled, nn.Sequential(*(layers + layers)))
| |
def test_Sequential_imul(self):
    """``*=`` repeats a ``Sequential`` in place, compounding across calls."""
    layers = [
        nn.Linear(10, 20),
        nn.Linear(20, 30),
        nn.Linear(30, 40),
        nn.Linear(40, 50),
    ]
    n = nn.Sequential(*layers)

    n *= 2
    self.assertEqual(n, nn.Sequential(*(layers + layers)))

    # Doubling the already doubled container gives four repetitions.
    n *= 2
    self.assertEqual(n, nn.Sequential(*(layers + layers + layers + layers)))
| |
def test_Sequential_append(self):
    """``append`` adds to the end of a ``Sequential`` and its result supports chaining."""
    l1, l2, l3, l4 = (
        nn.Linear(n_in, n_out)
        for n_in, n_out in ((10, 20), (20, 30), (30, 40), (40, 50))
    )
    n = nn.Sequential(l1, l2, l3)
    n2 = n.append(l4)
    for container in (n, n2):
        self.assertEqual(container, nn.Sequential(l1, l2, l3, l4))
    chained = nn.Sequential(l1).append(l2).append(l4)
    self.assertEqual(chained, nn.Sequential(l1, l2, l4))
| |
def test_Sequential_pop(self):
    """``pop`` returns the removed module and leaves the remaining indices consistent."""
    l1, l2, l3, l4 = (nn.Linear(width, width + 1) for width in range(1, 5))
    n1 = nn.Sequential(l1, l2, l3, l4)
    self.assertEqual(l4, n1.pop(3))
    n2 = nn.Sequential(l1, l2, l3)
    self.assertEqual(n1, n2)
    # check order of the index
    for k, mod in enumerate(n1):
        self.assertIs(n1[k], mod)
| |
def test_Sequential_insert(self):
    """``insert`` places a module at a positive or negative index."""
    l1, l2, l3 = (nn.Linear(width, width + 1) for width in range(1, 4))

    n1 = nn.Sequential(l1, l2, l3)
    module_1 = nn.Linear(4, 5)
    n2 = nn.Sequential(l1, module_1, l2, l3)
    inserted_forward = n1.insert(1, module_1)
    self.assertEqual(inserted_forward, n2)

    # test for negative support
    n3 = nn.Sequential(l1, l2, l3)
    module_2 = nn.Linear(5, 6)
    n4 = nn.Sequential(l1, module_2, l2, l3)
    inserted_backward = n3.insert(-2, module_2)
    self.assertEqual(inserted_backward, n4)
| |
| def test_Sequential_insert_fail_case(self): |
| l1 = nn.Linear(1, 2) |
| l2 = nn.Linear(2, 3) |
| l3 = nn.Linear(3, 4) |
| |
| module = nn.Linear(5, 6) |
| |
| # test for error case |
| n1 = nn.Sequential(l1, l2, l3) |
| with self.assertRaises(IndexError): |
| n1.insert(-5, module) |
| |
| with self.assertRaises(AssertionError): |
| n1.insert(1, [nn.Linear(6, 7)]) |
| |
def test_Sequential_extend(self):
    """``extend`` gives the same result as appending each module one by one."""
    l1, l2, l3, l4 = (
        nn.Linear(n_in, n_out)
        for n_in, n_out in ((10, 20), (20, 30), (30, 40), (40, 50))
    )
    appended = nn.Sequential(l1, l2)
    source = nn.Sequential(l3, l4)
    extended = nn.Sequential(l1, l2)
    for layer in source:
        appended.append(layer)
    extended.extend(source)
    self.assertEqual(extended, appended)
| |
def test_ModuleList(self):
    """Exercise ``nn.ModuleList`` against a plain Python list kept in lockstep.

    Every mutation is applied to both ``modules`` (a list) and
    ``module_list`` (the ``ModuleList``); ``check()`` then verifies that the
    two agree by length, by iteration, by ``children()`` and by index.
    """
    modules = [nn.ReLU(), nn.Linear(5, 5)]
    module_list = nn.ModuleList(modules)

    def check():
        # Reads the *current* bindings of ``modules`` and ``module_list`` from
        # the enclosing scope, so it keeps working after they are rebound.
        self.assertEqual(len(module_list), len(modules))
        for m1, m2 in zip(modules, module_list):
            self.assertIs(m1, m2)
        for m1, m2 in zip(modules, module_list.children()):
            self.assertIs(m1, m2)
        for i in range(len(modules)):
            self.assertIs(module_list[i], modules[i])

    check()
    # In-place concatenation with a list.
    modules += [nn.Conv2d(3, 4, 3)]
    module_list += [modules[-1]]
    check()
    # Out-of-place concatenation; both names are rebound to new objects.
    modules = modules + [nn.Conv2d(3, 4, 3, bias=False), nn.GELU()]
    module_list = module_list + nn.ModuleList(modules[-2:])
    check()
    modules.insert(1, nn.Linear(3, 2))
    module_list.insert(1, modules[1])
    check()
    modules.append(nn.Tanh())
    module_list.append(modules[-1])
    check()
    next_modules = [nn.Linear(5, 5), nn.Sigmoid()]
    modules.extend(next_modules)
    module_list.extend(next_modules)
    check()
    # Item assignment with positive, negative and tensor indices.
    modules[2] = nn.Conv2d(5, 3, 2)
    module_list[2] = modules[2]
    check()
    modules[-1] = nn.Conv2d(5, 2, 1)
    module_list[-1] = modules[-1]
    check()
    idx = torch.tensor(2, dtype=torch.int32)
    modules[2] = nn.Conv2d(5, 3, 2)
    module_list[idx] = modules[2]
    self.assertIs(module_list[idx], modules[2])
    check()
    # Slicing is compared against a ModuleList built from the same list slice.
    self.assertEqual(module_list[1:], nn.ModuleList(modules[1:]))
    self.assertEqual(module_list[3:], nn.ModuleList(modules[3:]))
    self.assertEqual(module_list[:-1], nn.ModuleList(modules[:-1]))
    self.assertEqual(module_list[:-3], nn.ModuleList(modules[:-3]))
    self.assertEqual(module_list[::-1], nn.ModuleList(modules[::-1]))
    # Deletion by index and by stepped slice (``modules`` itself is not mutated here).
    del module_list[-1]
    self.assertEqual(module_list, nn.ModuleList(modules[:-1]))
    del module_list[1::2]
    self.assertEqual(module_list, nn.ModuleList(modules[:-1][0::2]))

    # A bare module (not an iterable of modules) is rejected.
    with self.assertRaises(TypeError):
        module_list += nn.ReLU()
    with self.assertRaises(TypeError):
        module_list.extend(nn.ReLU())

    l1 = nn.Linear(1, 2)
    l2 = nn.Linear(2, 3)
    l3 = nn.Linear(3, 2)
    l4 = nn.Linear(2, 3)
    subnet = nn.Sequential(l3, l4)
    s = nn.Sequential(
        OrderedDict([
            ("layer1", l1),
            ("layer2", l2),
            ("layer3", l3),
            ("layer4", l4),
            ("subnet_layer", subnet)
        ])
    )
    # ``extend`` must also accept the iterator returned by ``s.modules()``.
    modules = list(s.modules())
    module_list = nn.ModuleList()
    module_list.extend(s.modules())
    check()

    modules = [nn.ReLU(), nn.Linear(5, 5), nn.Conv2d(3, 4, 3)]
    module_list = nn.ModuleList(modules)
    self.assertEqual(modules.pop(1), module_list.pop(1))
    self.assertEqual(modules, module_list)
    # check order of the index
    for k, mod in zip(range(len(module_list)), module_list):
        self.assertIs(module_list[k], mod)

    # verify the right exception is thrown when trying to "forward" through a ModuleList
    self.assertRaises(NotImplementedError, module_list)
    self.assertRaises(NotImplementedError, module_list, torch.rand(1, 3))
| |
| def test_ModuleDict(self): |
| modules = OrderedDict([ |
| ('act', nn.ReLU()), |
| ('conv', nn.Conv2d(10, 10, 5)), |
| ('fc', nn.Linear(5, 5)), |
| ]) |
| |
| module_dict = nn.ModuleDict(modules) |
| |
| def check(): |
| self.assertEqual(len(module_dict), len(modules)) |
| for k1, m2 in zip(modules, module_dict.children()): |
| self.assertIs(modules[k1], m2) |
| for k1, k2 in zip(modules, module_dict): |
| self.assertIs(modules[k1], module_dict[k2]) |
| for k in module_dict: |
| self.assertIs(module_dict[k], modules[k]) |
| for k in module_dict.keys(): |
| self.assertIs(module_dict[k], modules[k]) |
| for k, v in module_dict.items(): |
| self.assertIs(modules[k], v) |
| for k1, m2 in zip(modules, module_dict.values()): |
| self.assertIs(modules[k1], m2) |
| for k in modules.keys(): |
| self.assertTrue(k in module_dict) |
| check() |
| |
| modules['conv'] = nn.Conv2d(3, 4, 3) |
| module_dict['conv'] = modules['conv'] |
| check() |
| |
| next_modules = [ |
| ('fc2', nn.Linear(5, 5)), |
| ('act', nn.Sigmoid()), |
| ] |
| modules.update(next_modules) |
| module_dict.update(next_modules) |
| check() |
| |
| next_modules = OrderedDict([ |
| ('fc3', nn.Linear(5, 5)), |
| ('act2', nn.Sigmoid()), |
| ]) |
| modules.update(next_modules) |
| module_dict.update(next_modules) |
| check() |
| |
| next_modules = { |
| 'fc4': nn.Linear(5, 5), |
| 'act3': nn.Sigmoid() |
| } |
| modules.update(next_modules.items()) |
| module_dict.update(next_modules) |
| check() |
| |
| next_modules = nn.ModuleDict([ |
| ('fc5', nn.Linear(5, 5)), |
| ('act4', nn.Sigmoid()), |
| ]) |
| modules.update(next_modules) |
| module_dict.update(next_modules) |
| check() |
| |
| del module_dict['fc'] |
| del modules['fc'] |
| check() |
| |
| with self.assertRaises(TypeError): |
| module_dict.update(nn.ReLU()) |
| |
| with self.assertRaises(TypeError): |
| module_dict.update([nn.ReLU()]) |
| |
| with self.assertRaises(ValueError): |
| module_dict.update([[nn.ReLU()]]) |
| |
| with self.assertRaises(TypeError): |
| module_dict[1] = nn.ReLU() |
| |
| s = nn.Sequential(modules) |
| module_dict = nn.ModuleDict(s.named_children()) |
| check() |
| |
| c = module_dict.pop('conv') |
| self.assertIs(c, modules['conv']) |
| modules.pop('conv') |
| check() |
| |
| module_dict.clear() |
| self.assertEqual(len(module_dict), 0) |
| modules.clear() |
| check() |
| |
| # verify the right exception is thrown when trying to "forward" through a ModuleDict |
| self.assertRaises(NotImplementedError, module_dict) |
| self.assertRaises(NotImplementedError, module_dict, torch.rand(1, 3)) |
| |
@skipIfTorchDynamo()
def test_ParameterList(self):
    # Mirrors list operations on a plain Python list (``parameters``) and on
    # an nn.ParameterList (``param_list``); ``check()`` confirms after each
    # step that both hold the same objects.
    def make_param():
        return Parameter(torch.randn(2, 2))
    parameters = [make_param(), make_param()]
    param_list = nn.ParameterList(parameters)

    def check():
        self.assertEqual(len(parameters), len(param_list))
        for p1, p2 in zip(parameters, param_list):
            self.assertIs(p1, p2)
        # Entries that are not Parameters (strings are added near the end of
        # this test) are filtered out before comparing with .parameters().
        for p1, p2 in zip(filter(lambda x: isinstance(x, Parameter), parameters), param_list.parameters()):
            self.assertIs(p1, p2)
        for i in range(len(parameters)):
            self.assertIs(parameters[i], param_list[i])

    check()
    # In-place concatenation, append and extend.
    parameters += [make_param()]
    param_list += [parameters[-1]]
    check()
    parameters.append(make_param())
    param_list.append(parameters[-1])
    check()
    next_params = [make_param(), make_param()]
    parameters.extend(next_params)
    param_list.extend(next_params)
    check()
    # Item assignment with a positive index, a negative index and a
    # zero-dimensional integer tensor used as the index.
    parameters[2] = make_param()
    param_list[2] = parameters[2]
    check()
    parameters[-1] = make_param()
    param_list[-1] = parameters[-1]
    check()
    idx = torch.tensor(2, dtype=torch.int32)
    parameters[2] = make_param()
    param_list[idx] = parameters[2]
    self.assertIs(param_list[idx], parameters[2])
    check()
    # Slicing is compared against a ParameterList built from the same slice
    # of the reference list.
    self.assertEqual(param_list[1:], nn.ParameterList(parameters[1:]))
    self.assertEqual(param_list[3:], nn.ParameterList(parameters[3:]))
    self.assertEqual(param_list[:-1], nn.ParameterList(parameters[:-1]))
    self.assertEqual(param_list[:-3], nn.ParameterList(parameters[:-3]))
    self.assertEqual(param_list[::-1], nn.ParameterList(parameters[::-1]))

    # A single Parameter is rejected where an iterable of entries is expected.
    with self.assertRaises(TypeError):
        param_list += make_param()
    with self.assertRaises(TypeError):
        param_list.extend(make_param())

    l1 = nn.Linear(1, 2)
    l2 = nn.Linear(2, 3)
    l3 = nn.Linear(3, 2)
    l4 = nn.Linear(2, 3)
    subnet = nn.Sequential(l3, l4)
    s = nn.Sequential(
        OrderedDict([
            ("layer1", l1),
            ("layer2", l2),
            ("layer3", l3),
            ("layer4", l4),
            ("subnet_layer", subnet)
        ])
    )
    # extend() is fed the iterator returned by s.parameters() directly.
    parameters = list(s.parameters())
    param_list = nn.ParameterList()
    param_list.extend(s.parameters())
    check()

    # Plain tensors are expected to come back as Parameters, while
    # non-tensor values such as strings are expected to be stored unchanged.
    param_list.append(torch.rand(2, 2))
    self.assertIsInstance(param_list[-1], Parameter)
    parameters.append(param_list[-1])

    param_list.extend([torch.rand(2, 2), "foo"])
    self.assertIsInstance(param_list[-2], Parameter)
    self.assertIsInstance(param_list[-1], str)
    parameters.extend(param_list[-2:])

    param_list += ["bar", torch.rand(2, 2)]
    self.assertIsInstance(param_list[-2], str)
    self.assertIsInstance(param_list[-1], Parameter)
    parameters += param_list[-2:]
    check()
| |
def test_ParameterList_meta(self):
    # Checks the string form of a Parameter created on the 'meta' device,
    # both on its own and when held in a ParameterList.
    p = torch.nn.Parameter(torch.empty(1, device='meta'))
    self.assertExpectedInline(str(p), """\
Parameter containing:
tensor(..., device='meta', size=(1,), requires_grad=True)""")
    pl = torch.nn.ParameterList([p])
    # NOTE(review): the expected literal must match str(pl) as compared by
    # assertExpectedInline; confirm its internal spacing against the real output.
    self.assertExpectedInline(str(pl), """ParameterList( (0): Parameter containing: [torch.float32 of size 1])""")
| |
def test_ParameterList_replication(self):
    # The actual replication code from DP cannot be used on CPU, so the
    # replica is populated by hand here.
    originals = [Parameter(torch.randn(2, 2)), Parameter(torch.randn(2, 2))]
    source = nn.ParameterList(originals)
    replica = source._replicate_for_data_parallel()

    # Install views instead of the parameters themselves so that the base
    # of each replica entry can be checked afterwards.
    for name, param in source.named_parameters():
        setattr(replica, name, param.view_as(param))

    for original, copied in zip(source, replica):
        self.assertEqual(original, copied)
        self.assertIsNotNone(copied.grad_fn)
        self.assertIs(copied._base, original)
| |
def test_ParameterDict(self):
    # Exercises the dict-like API of nn.ParameterDict. Every mutation is
    # mirrored on a plain OrderedDict (``parameters``) and ``check()`` is
    # called afterwards to confirm that both containers stay in sync.
    parameters = OrderedDict([
        ('p1', Parameter(torch.randn(10, 10))),
        ('p2', Parameter(torch.randn(10, 10))),
        ('p3', Parameter(torch.randn(10, 10))),
    ])

    parameter_dict = nn.ParameterDict(parameters)

    def check():
        # Reads the current bindings of ``parameters`` and
        # ``parameter_dict``, which are rebound several times below.
        self.assertEqual(len(parameter_dict), len(parameters))
        for k1, (k2, m2) in zip(parameters, parameter_dict.named_parameters()):
            self.assertEqual(k1, k2)
            self.assertIs(parameters[k1], m2)
        for k1, k2 in zip(parameters, parameter_dict):
            self.assertIs(parameters[k1], parameter_dict[k2])
        for k in parameter_dict:
            self.assertIs(parameter_dict[k], parameters[k])
        for k in parameter_dict.keys():
            self.assertIs(parameter_dict[k], parameters[k])
        for k, v in parameter_dict.items():
            self.assertIs(v, parameters[k])
        for k1, m2 in zip(parameters, parameter_dict.values()):
            self.assertIs(parameters[k1], m2)
        for k in parameters.keys():
            self.assertIn(k, parameter_dict)

    check()

    parameters['p4'] = Parameter(torch.randn(10, 10))
    parameter_dict['p4'] = parameters['p4']
    check()

    # update() from a list of pairs, an OrderedDict, a plain dict and
    # another ParameterDict.
    next_parameters = [
        ('p5', Parameter(torch.randn(10, 10))),
        ('p2', Parameter(torch.randn(10, 10))),
    ]
    parameters.update(next_parameters)
    parameter_dict.update(next_parameters)
    check()

    next_parameters = OrderedDict([
        ('p6', Parameter(torch.randn(10, 10))),
        ('p5', Parameter(torch.randn(10, 10))),
    ])
    parameters.update(next_parameters)
    parameter_dict.update(next_parameters)
    check()

    next_parameters = {
        'p8': Parameter(torch.randn(10, 10)),
        'p7': Parameter(torch.randn(10, 10))
    }
    # The reference is updated in sorted key order; presumably this matches
    # the order in which ParameterDict.update inserts entries of a plain dict.
    parameters.update(sorted(next_parameters.items()))
    parameter_dict.update(next_parameters)
    check()

    next_parameters = nn.ParameterDict([
        ('p10', Parameter(torch.randn(10, 10))),
        ('p9', Parameter(torch.randn(10, 10))),
    ])
    parameters.update(next_parameters)
    parameter_dict.update(next_parameters)
    check()

    del parameter_dict['p3']
    del parameters['p3']
    check()

    # Arguments update() must refuse.
    with self.assertRaises(TypeError):
        parameter_dict.update(1)

    with self.assertRaises(TypeError):
        parameter_dict.update([1])

    with self.assertRaises(ValueError):
        parameter_dict.update(Parameter(torch.randn(10, 10)))

    p_pop = parameter_dict.pop('p4')
    self.assertIs(p_pop, parameters['p4'])
    parameters.pop('p4')
    check()

    # Check reverse works
    forward = list(iter(parameter_dict))
    backward = list(reversed(parameter_dict))
    self.assertEqual(len(forward), len(backward))
    n = len(forward)
    for i in range(n):
        self.assertIs(forward[i], backward[n - i - 1])
    check()

    # Check copy works
    copy = parameter_dict.copy()

    # Check all keys are present and have shallow copied values
    for key in parameter_dict:
        self.assertIn(key, copy)
        self.assertEqual(parameter_dict[key], copy[key])
        self.assertIs(parameter_dict[key], copy[key])
    check()

    # Insertions into the original and into the copy must not leak into
    # each other.
    parameter_dict["p20"] = Parameter(torch.randn(10, 10))
    copy["p21"] = Parameter(torch.randn(9, 10))

    self.assertIn("p20", parameter_dict)
    self.assertNotIn("p20", copy)
    self.assertNotIn("p21", parameter_dict)
    self.assertIn("p21", copy)
    parameter_dict.pop("p20")
    check()

    # popitem() is expected to return the most recently inserted pair.
    p = Parameter(torch.randn(10, 10))
    parameter_dict['p12'] = p
    p_popitem = parameter_dict.popitem()
    self.assertEqual(p_popitem[0], 'p12')
    self.assertIs(p_popitem[1], p)
    check()

    # Unit test for setdefault
    # 1. Ensure parameter is correctly inserted when
    #    the key is not present in `ParameterDict`
    # These preconditions use unittest assertions so that they are not
    # stripped when Python runs with -O.
    self.assertNotIn('p11', parameter_dict)
    self.assertNotIn('p11', parameters)
    parameters['p11'] = Parameter(torch.randn(10, 10))
    p_setdefault = parameter_dict.setdefault('p11', parameters['p11'])
    self.assertIs(p_setdefault, parameters['p11'])
    self.assertIs(p_setdefault, parameter_dict['p11'])
    check()
    # 2. Ensure parameter is NOT inserted when the
    #    key is already present in `ParameterDict`
    p = Parameter(torch.randn(10, 10))
    self.assertIsNot(parameter_dict.setdefault('p11', p), p)
    check()
    # 3. Ensure `None` is inserted when the key is not
    #    present in `ParameterDict` and parameter is not specified
    self.assertIs(parameter_dict.setdefault('p26'), None)
    del parameter_dict['p26']
    check()

    # In-place union (|=) with a ParameterDict that overlaps, is empty,
    # and overlaps again.
    parameters2 = OrderedDict([
        ('p13', Parameter(torch.randn(10, 10))),
        ('p2', Parameter(torch.randn(10, 10))),
        ('p3', Parameter(torch.randn(10, 10))),
    ])
    parameter_dict2 = nn.ParameterDict(parameters2)
    parameters.update(parameters2)
    parameter_dict |= parameter_dict2
    check()

    parameters2 = OrderedDict()
    parameter_dict2 = nn.ParameterDict(parameters2)
    parameters.update(parameters2)
    parameter_dict |= parameter_dict2
    check()

    parameters2 = OrderedDict([
        ('p14', Parameter(torch.randn(10, 10))),
        ('p15', Parameter(torch.randn(10, 10))),
        ('p13', Parameter(torch.randn(10, 10))),
    ])
    parameter_dict2 = nn.ParameterDict(parameters2)
    parameters.update(parameters2)
    parameter_dict |= parameter_dict2
    check()

    # Check __or__ and __ror__ works
    parameters2 = OrderedDict([
        ('p20', Parameter(torch.randn(10, 10))),
        ('p21', Parameter(torch.randn(10, 10))),
        ('p22', Parameter(torch.randn(10, 10))),
    ])
    parameter_dict2 = nn.ParameterDict(parameters2)
    parameters.update(parameters2)
    parameter_dict = parameter_dict | parameter_dict2
    check()

    parameters2 = OrderedDict([
        ('p23', Parameter(torch.randn(10, 10))),
        ('p24', Parameter(torch.randn(10, 10))),
        ('p25', Parameter(torch.randn(10, 10))),
    ])
    parameter_dict2 = nn.ParameterDict(parameters2)
    # With the new dict on the left-hand side its keys come first, so the
    # reference is rebuilt starting from ``parameters2``.
    parameters2.update(parameters)
    parameters = parameters2
    parameter_dict = parameter_dict2 | parameter_dict
    check()

    # get() with and without a default, for a present and a missing key.
    parameters['p17'] = Parameter(torch.randn(10, 10))
    parameter_dict['p17'] = parameters['p17']
    self.assertIs(parameters['p17'], parameter_dict.get('p17'))
    temp_param = Parameter(torch.randn(10, 10))
    self.assertIs(parameters['p17'], parameter_dict.get('p17', temp_param))
    self.assertIs(None, parameter_dict.get('p18'))
    self.assertIs(temp_param, parameter_dict.get('p18', temp_param))
    check()

    parameter_dict.clear()
    self.assertEqual(len(parameter_dict), 0)
    parameters.clear()
    check()

    # fromkeys() builds a separate dict; check() confirms that
    # ``parameter_dict`` itself is left unchanged.
    parameter_dict2 = parameter_dict.fromkeys(['p19', 'p20'])
    self.assertEqual({'p19': None, 'p20': None}, parameter_dict2)
    check()

    parameter_dict2 = parameter_dict.fromkeys(['p19', 'p20'], temp_param)
    self.assertEqual({'p19': temp_param, 'p20': temp_param}, parameter_dict2)
    check()

    # Plain tensors are expected to come back as Parameters, while
    # non-tensor values such as strings are expected to be stored unchanged.
    parameter_dict['p21'] = torch.rand(2, 2)
    self.assertIsInstance(parameter_dict['p21'], Parameter)
    parameters['p21'] = parameter_dict['p21']

    parameter_dict.update({'p22': torch.rand(2, 2), 'foo': 'bar'})
    self.assertIsInstance(parameter_dict['p22'], Parameter)
    self.assertIsInstance(parameter_dict['foo'], str)
    parameters['p22'] = parameter_dict['p22']
    parameters['foo'] = parameter_dict['foo']
| |
def test_ParameterDict_replication(self):
    # The actual replication code from DP cannot be used on CPU, so the
    # replica is populated by hand here.
    initial = {
        "foo": Parameter(torch.randn(2, 2)),
        "bar": Parameter(torch.randn(2, 2)),
    }
    source = nn.ParameterDict(initial)
    replica = source._replicate_for_data_parallel()

    # Install views instead of the parameters themselves so that the base
    # of each replica entry can be checked afterwards.
    for name, param in source.named_parameters():
        setattr(replica, name, param.view_as(param))

    for (key, original), (replica_key, copied) in zip(source.items(), replica.items()):
        self.assertEqual(key, replica_key)
        self.assertEqual(original, copied)
        self.assertIsNotNone(copied.grad_fn)
        self.assertIs(copied._base, original)

    self.assertEqual(source["foo"], replica["foo"])
| |
| def test_add_module(self): |
| methods_to_test = ['add_module', 'register_module'] |
| for fn in methods_to_test: |
| l = nn.Linear(10, 20) |
| net = nn.Module() |
| net.l = l |
| net.l2 = l |
| getattr(net, fn)('empty', None) |
| self.assertEqual(net.l, l) |
| self.assertEqual(net.l2, l) |
| self.assertEqual(net.empty, None) |
| getattr(net, fn)('l3', l) |
| self.assertEqual(net.l3, l) |
| l3 = nn.Linear(20, 10) |
| getattr(net, fn)('l', l3) |
| self.assertEqual(net.l, l3) |
| self.assertRaises(TypeError, lambda: getattr(net, fn)('x', 'non-module')) |
| self.assertRaisesRegex(TypeError, 'module name should be a string. Got int', |
| lambda: getattr(net, fn)(1, l)) |
| self.assertRaisesRegex(TypeError, 'module name should be a string. Got NoneType', |
| lambda: getattr(net, fn)(None, l)) |
| |
| def test_set_submodule(self): |
| net = nn.Module() |
| net.t = nn.Module() |
| l = nn.Linear(1, 2) |
| target = "t.l" |
| net.set_submodule(target, l) |
| self.assertEqual(net.get_submodule(target), l) |
| l2 = nn.Linear(2, 1) |
| net.set_submodule(target, l2) |
| self.assertEqual(net.get_submodule(target), l2) |
| self.assertRaises(ValueError, net.set_submodule, "", l) |
| self.assertRaises(AttributeError, net.set_submodule, "a.l", l) |
| |
| def test_module_to_argparse(self): |
| net = nn.Sequential(nn.Linear(3, 3)) |
| cpu = torch.device('cpu') |
| with self.assertRaises(TypeError): |
| net.to(cpu, True) |
| with self.assertRaises(TypeError): |
| net.to(torch.long) |
| with self.assertRaises(TypeError): |
| net.to(None, True) |
| with self.assertRaises(TypeError): |
| net.to(cpu, torch.long, True) |
| with self.assertRaises(TypeError): |
| net.to(cpu, dtype=torch.long, non_blocking=True) |
| with self.assertRaises(TypeError): |
| net.to([]) |
| with self.assertRaises(TypeError): |
| net.to({}, non_blocking=True) |
| with self.assertRaises(TypeError): |
| net.to(torch.tensor(3, dtype=torch.long), non_blocking=True) |
| with self.assertRaises(TypeError): |
| net.to(cpu, torch.tensor(3, dtype=torch.long), non_blocking=True) |
| |
| def test_RNN_nonlinearity(self): |
| rnn = torch.nn.RNN(1, 10) |
| self.assertEqual(rnn.nonlinearity, 'tanh') |
| |
| rnn = torch.nn.RNN(1, 10, nonlinearity='relu') |
| self.assertEqual(rnn.nonlinearity, 'relu') |
| |
| with self.assertRaisesRegex(ValueError, 'Unknown nonlinearity'): |
| rnn = torch.nn.RNN(1, 10, nonlinearity='garbage') |
| |
| def test_RNN_nonlinearity_passed_as_arg(self): |
| rnn = torch.nn.RNN(2, 3, 1, 'relu') |
| self.assertEqual(rnn.nonlinearity, 'relu') |
| |
| def test_module_apply_inplace_op(self): |
| def add_one_inplace(t): |
| return t.add_(1.0) |
| |
| # Test that applying an in-place operation to a module would bump |
| # the module's parameters' version counter. |
| m = nn.Linear(20, 10) |
| pvm = m.weight.mul(m.weight) |
| m_weight_version_saved = m.weight._version |
| m = m._apply(add_one_inplace) |
| self.assertGreater(m.weight._version, m_weight_version_saved) |
| with self.assertRaisesRegex(RuntimeError, "modified by an inplace operation"): |
| pvm.backward(torch.randn(10, 20)) |
| |
| # Test that applying an in-place operation to a module would bump |
| # the module's parameters' gradients' version counter. |
| m = nn.Linear(20, 10) |
| m.weight.grad = torch.randn(10, 20).requires_grad_() |
| pgm = m.weight.grad.mul(m.weight.grad) |
| m_weight_grad_version_saved = m.weight.grad._version |
| m = m._apply(add_one_inplace) |
| self.assertGreater(m.weight.grad._version, m_weight_grad_version_saved) |
| with self.assertRaisesRegex(RuntimeError, "modified by an inplace operation"): |
| pgm.backward(torch.randn(10, 20)) |
| |
def test_overwrite_module_params_on_conversion(self):
    # Covers ``Module._apply`` / dtype conversion with the
    # ``overwrite_module_params_on_conversion`` future flag both off (the
    # first two sections) and on (everything inside the ``try`` block).

    # Test that if the conversion function passed to `module._apply()`
    # changes the TensorImpl type of `module`'s parameters, the `module`'s
    # parameters are always overwritten, regardless of the value of
    # `torch.__future__.get_overwrite_module_params_on_conversion()`.
    m = nn.Linear(20, 10)
    m.weight.grad = torch.randn(10, 20)
    weight_ref = m.weight
    weight_grad_ref = m.weight.grad
    m = m._apply(lambda t: torch.sparse_coo_tensor(torch.zeros([2, 1]), torch.ones([1]), torch.Size([10, 20])))
    self.assertNotEqual(weight_ref.layout, m.weight.layout)
    self.assertNotEqual(weight_grad_ref.layout, m.weight.grad.layout)

    # Test that under the current default settings
    # (`torch.__future__.get_overwrite_module_params_on_conversion() == False`),
    # a view to a module's parameters is not pointing to the same storage as
    # its base variable after converting the module to a different dtype.
    m = nn.Linear(20, 10).float()
    mw = m.weight[:]
    m.double()
    with torch.no_grad():
        mw[0][0] = 5
    self.assertTrue(mw[0][0].dtype == torch.float)
    self.assertTrue(mw._base[0][0].dtype == torch.double)

    try:
        torch.__future__.set_overwrite_module_params_on_conversion(True)

        # Test that if `torch.__future__.get_overwrite_module_params_on_conversion() == True`,
        # a view to a module's parameters is still pointing to the same storage as
        # its base variable after converting the module to a different dtype.
        m = nn.Linear(20, 10).float()
        mw = m.weight[:]
        m.double()
        with torch.no_grad():
            mw[0][0] = 5
        self.assertTrue(mw[0][0] == mw._base[0][0])

        # Test that if `torch.__future__.get_overwrite_module_params_on_conversion() == True`,
        # `float_module.double()` doesn't preserve previous references to
        # `float_module`'s parameters or gradients.
        m = nn.Linear(20, 10).float()
        m.weight.grad = torch.randn(10, 20).float()
        weight_ref = m.weight
        weight_grad_ref = m.weight.grad
        m.double()
        self.assertNotEqual(weight_ref.dtype, m.weight.dtype)
        self.assertNotEqual(weight_grad_ref.dtype, m.weight.grad.dtype)

        def add_one_inplace(t):
            return t.add_(1.0)

        # Test that if `torch.__future__.get_overwrite_module_params_on_conversion() == True`,
        # applying an in-place operation to a module would bump the module's
        # original parameters' version counter.
        m = nn.Linear(20, 10)
        pvm = m.weight.mul(m.weight)
        weight_ref = m.weight
        m_weight_version_saved = weight_ref._version
        m = m._apply(add_one_inplace)
        # Test that the in-place operation bumps the original parameter's version counter
        self.assertGreater(weight_ref._version, m_weight_version_saved)
        with self.assertRaisesRegex(RuntimeError, "modified by an inplace operation"):
            pvm.backward(torch.randn(10, 20))

        # Test that if `torch.__future__.get_overwrite_module_params_on_conversion() == True`,
        # applying an in-place operation to a module would bump the module's
        # original parameters' gradients' version counter.
        m = nn.Linear(20, 10)
        m.weight.grad = torch.randn(10, 20).requires_grad_()
        pgm = m.weight.grad.mul(m.weight.grad)
        weight_grad_ref = m.weight.grad
        m_weight_grad_version_saved = weight_grad_ref._version
        m = m._apply(add_one_inplace)
        self.assertGreater(weight_grad_ref._version, m_weight_grad_version_saved)
        with self.assertRaisesRegex(RuntimeError, "modified by an inplace operation"):
            pgm.backward(torch.randn(10, 20))

        # Test that if `torch.__future__.get_overwrite_module_params_on_conversion() == True`,
        # applying an out-of-place operation to a module doesn't bump
        # the module's original parameters' version counter.
        m = nn.Linear(20, 10)
        weight_ref = m.weight
        m_weight_version_saved = weight_ref._version
        m = m._apply(lambda t: torch.randn(t.shape))
        self.assertEqual(weight_ref._version, m_weight_version_saved)

        # Test that if `torch.__future__.get_overwrite_module_params_on_conversion() == True`,
        # applying an out-of-place operation to a module doesn't bump
        # the module's original parameters' gradients' version counter.
        m = nn.Linear(20, 10)
        m.weight.grad = torch.randn(10, 20).requires_grad_()
        weight_grad_ref = m.weight.grad
        m_weight_grad_version_saved = weight_grad_ref._version
        m = m._apply(lambda t: torch.randn(t.shape))
        self.assertEqual(weight_grad_ref._version, m_weight_grad_version_saved)
    finally:
        # NOTE: the flag is reset to False unconditionally rather than
        # restored to a saved value; the sections above the ``try`` already
        # assume it is False on entry.
        torch.__future__.set_overwrite_module_params_on_conversion(False)
| |
| def test_swap_module_params_poisons_acc_grad(self): |
| try: |
| torch.__future__.set_swap_module_params_on_conversion(True) |
| # (1) backward cannot be run after _apply |
| # forward will init AccumulateGrad nodes, which bumps use_count of parameters' at::Tensors |
| # additionally, if any Tensors are saved for backward, their use_count will be bumped |
| m = torch.nn.Linear(2, 3) |
| inp = torch.randn(2, 2) |
| out = m(inp) |
| m.half() |
| self.assertTrue(all(p.dtype == torch.float16 for p in m.parameters())) |
| with self.assertRaisesRegex(RuntimeError, "Trying to execute AccumulateGrad node that was poisoned by swap_tensors"): |
| out.sum().backward() |
| # (2) _apply can be run after backward() |
| # After running backward, all the references generated by "save for backward" will be cleared |
| # So the use_count will be 2 (1 from Tensor itself, and 1 from AccumulateGrad node), swap_tensors |
| # should allow this. |
| inp2 = torch.randn(2, 2, dtype=torch.half) |
| out2 = m(inp2) |
| out2.sum().backward() |
| m.float() |
| self.assertTrue(all(p.dtype == torch.float32 for p in m.parameters())) |
| out3 = m(inp) |
| finally: |
| torch.__future__.set_swap_module_params_on_conversion(False) |
| |
def test_type(self):
    # Converts a module between dtypes (and devices when CUDA is available)
    # and checks the resulting tensor types of its parameters and of an
    # integer buffer after each conversion.
    l = nn.Linear(10, 20)
    net = nn.Module()
    net.l = l
    net.l2 = l
    net.add_module('empty', None)
    # Integer buffer: every assertion on it below expects it to stay a
    # LongTensor across the floating-point conversions.
    net.indices = Buffer(torch.LongTensor(1))
    net.float()
    self.assertIsInstance(l.weight.data, torch.FloatTensor)
    self.assertIsInstance(l.bias.data, torch.FloatTensor)
    self.assertIsInstance(net.indices, torch.LongTensor)
    net.double()
    self.assertIsInstance(l.weight.data, torch.DoubleTensor)
    self.assertIsInstance(l.bias.data, torch.DoubleTensor)
    self.assertIsInstance(net.indices, torch.LongTensor)
    net.to(torch.half)
    self.assertIsInstance(l.weight.data, torch.HalfTensor)
    self.assertIsInstance(l.bias.data, torch.HalfTensor)
    self.assertIsInstance(net.indices, torch.LongTensor)
    if TEST_CUDA:
        net.float().cuda()
        self.assertIsInstance(l.weight.data, torch.cuda.FloatTensor)
        self.assertIsInstance(l.bias.data, torch.cuda.FloatTensor)
        self.assertIsInstance(net.indices, torch.cuda.LongTensor)
        net.cpu()
        self.assertIsInstance(l.weight.data, torch.FloatTensor)
        self.assertIsInstance(l.bias.data, torch.FloatTensor)
        self.assertIsInstance(net.indices, torch.LongTensor)
        # Positional form: device, dtype, non_blocking.
        net.to("cuda", torch.double, True)
        self.assertIsInstance(l.weight.data, torch.cuda.DoubleTensor)
        self.assertIsInstance(l.bias.data, torch.cuda.DoubleTensor)
        self.assertIsInstance(net.indices, torch.cuda.LongTensor)
        # A tensor argument supplies both the target device and dtype.
        net.to(torch.empty(1, device="cuda:0", dtype=torch.half))
        self.assertIsInstance(l.weight.data, torch.cuda.HalfTensor)
        self.assertIsInstance(l.bias.data, torch.cuda.HalfTensor)
        self.assertIsInstance(net.indices, torch.cuda.LongTensor)
        # Moving the device only: the half dtype is expected to be kept.
        net.to(torch.device("cpu"), non_blocking=True)
        self.assertIsInstance(l.weight.data, torch.HalfTensor)
        self.assertIsInstance(l.bias.data, torch.HalfTensor)
        self.assertIsInstance(net.indices, torch.LongTensor)
    net.to(torch.float)
    self.assertIsInstance(l.weight.data, torch.FloatTensor)
    self.assertIsInstance(l.bias.data, torch.FloatTensor)
    # A CPU tensor argument supplies the target dtype.
    net.to(torch.DoubleTensor(1))
    self.assertIsInstance(l.weight.data, torch.DoubleTensor)
    self.assertIsInstance(l.bias.data, torch.DoubleTensor)
    if TEST_CUDA:
        # Keyword form: device and dtype together.
        net.to(device='cuda', dtype=torch.float)
        self.assertIsInstance(l.weight.data, torch.cuda.FloatTensor)
        self.assertIsInstance(l.bias.data, torch.cuda.FloatTensor)
| |
| def test_non_leaf_parameters(self): |
| l1 = nn.Linear(10, 10) |
| l2 = nn.Linear(10, 10) |
| |
| def assign_weight(): |
| l2.weight = l1.weight + 2 |
| |
| self.assertRaises(TypeError, assign_weight) |
| # This should work though |
| l2.weight = Parameter(torch.randn(10, 10)) |
| |
| def test_parameters_to_vector(self): |
| conv1 = nn.Conv2d(3, 10, 5) |
| fc1 = nn.Linear(10, 20) |
| model = nn.Sequential(conv1, fc1) |
| |
| vec = parameters_to_vector(model.parameters()) |
| self.assertEqual(vec.size(0), 980) |
| |
| def test_vector_to_parameters(self): |
| conv1 = nn.Conv2d(3, 10, 5) |
| fc1 = nn.Linear(10, 20) |
| model = nn.Sequential(conv1, fc1) |
| |
| vec = torch.arange(0., 980) |
| vector_to_parameters(vec, model.parameters()) |
| |
| sample = next(model.parameters())[0, 0, 0] |
| self.assertTrue(torch.equal(sample.data, vec.data[:5])) |
| |
| def test_rnn_weight_norm(self): |
| def check_weight_norm(l, name, num_params): |
| # This Module has 4 or 5 parameters called: |
| # 'weight_ih_l0', 'weight_hh_l0', 'bias_ih_l0', 'bias_hh_l0', weight_hr_l0 |
| |
| # Applying weight norm on one of them causes it to become a tensor |
| l = torch.nn.utils.weight_norm(l, name=name) |
| self.assertEqual( |
| sum(isinstance(p, torch.nn.Parameter) for p in l._flat_weights), |
| num_params - 1, |
| ) |
| |
| # Removing the weight norm reparametrization restores the Parameter |
| l = torch.nn.utils.remove_weight_norm(l, name=name) |
| self.assertEqual( |
| sum(isinstance(p, torch.nn.Parameter) for p in l._flat_weights), |
| num_params, |
| ) |
| |
| # Make sure that, upon removal of the reparametrization, the |
| # `._parameters` and `.named_parameters` contain the right params. |
| # Specifically, the original weight ('weight_ih_l0') should be placed |
| # back in the parameters, while the reparametrization components |
| # ('weight_ih_l0_v' and 'weight_ih_l0_g') should be removed. |
| self.assertTrue(name in l._parameters) |
| self.assertIsNotNone(l._parameters[name]) |
| self.assertTrue(name + '_v' not in l._parameters) |
| self.assertTrue(name + '_g' not in l._parameters) |
| self.assertTrue(name in dict(l.named_parameters())) |
| self.assertIsNotNone(dict(l.named_parameters())[name]) |
| self.assertTrue(name + '_v' not in dict(l.named_parameters())) |
| self.assertTrue(name + '_g' not in dict(l.named_parameters())) |
| |
| check_weight_norm(torch.nn.LSTM(32, 32), 'weight_ih_l0', 4) |
| check_weight_norm(torch.nn.LSTM(32, 32, proj_size=16), 'weight_hr_l0', 5) |
| |
| |
def test_weight_norm(self):
    apply_weight_norm = torch.nn.utils.weight_norm
    strip_weight_norm = torch.nn.utils.remove_weight_norm

    for dtype in [torch.float, torch.bfloat16]:
        tolerance = dtype2prec_DONTUSE[dtype]
        inp = torch.randn(3, 4, dtype=dtype)
        layer = nn.Linear(4, 5).to(dtype=dtype)
        reference = layer(inp)

        # add weight normalization
        layer = apply_weight_norm(layer)
        self.assertEqual(layer.weight_v.size(), layer.weight.size())
        self.assertEqual(layer.weight_g.size(), (5, 1))
        self.assertEqual(layer(inp), reference, atol=tolerance, rtol=0)

        # remove weight norm
        layer = strip_weight_norm(layer)
        self.assertFalse(hasattr(layer, 'weight_g'))
        self.assertFalse(hasattr(layer, 'weight_v'))
        self.assertEqual(layer(inp), reference, atol=tolerance, rtol=0)

        # test with dim=1
        layer = apply_weight_norm(layer, dim=1)
        self.assertEqual(layer.weight_v.size(), layer.weight.size())
        self.assertEqual(layer.weight_g.size(), (1, 4))
        self.assertEqual(layer(inp), reference, atol=tolerance, rtol=0)

        # test with dim=None
        layer = nn.Linear(4, 5).to(dtype=dtype)
        reference = layer(inp)
        layer = apply_weight_norm(layer, dim=None)
        self.assertEqual(layer(inp), reference)

        # The layer already carries a weight_norm hook at this point, so
        # registering another one is expected to fail.
        with self.assertRaisesRegex(RuntimeError, 'register two weight_norm hooks'):
            layer = apply_weight_norm(layer)
            layer = apply_weight_norm(layer)

    # For float16, the forward of the Module doesn't work but we must still be able
    # to register the weight norm as this is often done before sending the Module to
    # CUDA.
    half_layer = nn.Linear(4, 5, dtype=torch.float16)
    half_layer = apply_weight_norm(half_layer)
| |
| def test_parameterlistdict_setting_attributes(self): |
| with warnings.catch_warnings(record=True) as w: |
| mod = nn.ParameterList(map(nn.Parameter, [torch.rand(2), torch.rand(2)])) |
| self.assertTrue(len(w) == 0) |
| |
| with warnings.catch_warnings(record=True) as w: |
| mod.train() |
| mod.eval() |
| self.assertTrue(len(w) == 0) |
| |
| with warnings.catch_warnings(record=True) as w: |
| mod = nn.ParameterDict({"a": nn.Parameter(torch.rand(2)), "b": nn.Parameter(torch.rand(2))}) |
| self.assertTrue(len(w) == 0) |
| |
| with warnings.catch_warnings(record=True) as w: |
| mod.train() |
| mod.eval() |
| self.assertTrue(len(w) == 0) |
| |
| def test_parameterlistdict_pickle(self): |
| m = nn.ParameterList(map(nn.Parameter, [torch.rand(2), torch.rand(2)])) |
| with warnings.catch_warnings(record=True) as w: |
| m = pickle.loads(pickle.dumps(m)) |
| self.assertTrue(len(w) == 0) |
| |
| # Test whether loading from older checkpoints works without triggering warnings |
| m = nn.ParameterList(map(nn.Parameter, [torch.rand(2), torch.rand(2)])) |
| del m._forward_pre_hooks, m._state_dict_hooks, m._load_state_dict_pre_hooks, m._non_persistent_buffers_set |
| with warnings.catch_warnings(record=True) as w: |
| m = pickle.loads(pickle.dumps(m)) |
| self.assertTrue(len(w) == 0) |
| |
| m = nn.ParameterDict({"a": nn.Parameter(torch.rand(2)), "b": nn.Parameter(torch.rand(2))}) |
| with warnings.catch_warnings(record=True) as w: |
| m = pickle.loads(pickle.dumps(m)) |
| self.assertTrue(len(w) == 0) |
| |
| # Test whether loading from older checkpoints works without triggering warnings |
| m = nn.ParameterDict({"a": nn.Parameter(torch.rand(2)), "b": nn.Parameter(torch.rand(2))}) |
| del m._forward_pre_hooks, m._state_dict_hooks, m._load_state_dict_pre_hooks, m._non_persistent_buffers_set |
| with warnings.catch_warnings(record=True) as w: |
| m = pickle.loads(pickle.dumps(m)) |
| self.assertTrue(len(w) == 0) |
| |
| def test_weight_norm_pickle(self): |
| m = torch.nn.utils.weight_norm(nn.Linear(5, 7)) |
| m = pickle.loads(pickle.dumps(m)) |
| self.assertIsInstance(m, nn.Linear) |
| |
@skipIfTorchDynamo("TorchDynamo fails here for unknown reasons")
@set_default_dtype(torch.double)
def test_spectral_norm(self):
    """Exercise ``torch.nn.utils.spectral_norm`` on ``nn.Linear``.

    Covers attribute/buffer registration, removal, the double-registration
    error, and forward/backward behaviour in train and eval mode, on CPU and
    (when ``TEST_MULTIGPU`` is set) wrapped in ``DataParallel``.
    """
    input = torch.randn(3, 5)
    m = nn.Linear(5, 7)
    m = torch.nn.utils.spectral_norm(m)

    self.assertEqual(m.weight_u.size(), torch.Size([m.weight.size(0)]))
    # weight_orig should be trainable
    self.assertTrue(hasattr(m, 'weight_orig'))
    self.assertTrue('weight_orig' in m._parameters)
    # weight_u should be just a reused buffer
    self.assertTrue(hasattr(m, 'weight_u'))
    self.assertTrue('weight_u' in m._buffers)
    self.assertTrue('weight_v' in m._buffers)
    # weight should be a plain attribute, not counted as a buffer or a param
    self.assertFalse('weight' in m._buffers)
    self.assertFalse('weight' in m._parameters)
    # it should also be sharing storage as `weight_orig`
    self.assertEqual(m.weight_orig.storage(), m.weight.storage())
    self.assertEqual(m.weight_orig.size(), m.weight.size())
    self.assertEqual(m.weight_orig.stride(), m.weight.stride())

    m = torch.nn.utils.remove_spectral_norm(m)
    self.assertFalse(hasattr(m, 'weight_orig'))
    self.assertFalse(hasattr(m, 'weight_u'))
    # weight should be converted back as a parameter
    self.assertTrue(hasattr(m, 'weight'))
    self.assertTrue('weight' in m._parameters)

    # spectral norm was removed above, so the first call re-registers it and
    # the second call is the one expected to raise
    with self.assertRaisesRegex(RuntimeError, 'register two spectral_norm hooks'):
        m = torch.nn.utils.spectral_norm(m)
        m = torch.nn.utils.spectral_norm(m)

    # test correctness in training/eval modes and cpu/multi-gpu settings
    for apply_dp in (True, False):
        if apply_dp:
            if not TEST_MULTIGPU:
                continue
            device = torch.device('cuda:0')

            def maybe_wrap(m):
                return torch.nn.DataParallel(m, [0, 1])
        else:
            device = torch.device('cpu')

            def maybe_wrap(m):
                return m

        for requires_grad in (True, False):
            m = nn.Linear(3, 4).to(device)
            m.weight.requires_grad_(requires_grad)
            m = torch.nn.utils.spectral_norm(m)
            wrapped_m = maybe_wrap(m)
            self.assertTrue(hasattr(m, 'weight_u'))
            u0 = m.weight_u.clone()
            v0 = m.weight_v.clone()

            # ---- training-mode behaviour ----

            # assert that u and v are updated
            input = torch.randn(2, 3, device=device)
            out = wrapped_m(input)
            self.assertNotEqual(u0, m.weight_u)
            self.assertNotEqual(v0, m.weight_v)

            # assert that backprop reaches weight_orig
            # can't use gradcheck because the function changes as we
            # activate through it in training mode
            if requires_grad:
                torch.autograd.grad(out.sum(), m.weight_orig)

            # test backward works with multiple forwards
            # it uses training mode so we need to reset `u` and `v` vectors
            # to same value at beginning for finite difference test to pass
            saved_u = m.weight_u.clone()
            saved_v = m.weight_v.clone()

            def fn(input):
                # restore the saved u/v so every gradcheck evaluation starts
                # from the same buffers
                m.weight_u.data.copy_(saved_u)
                m.weight_v.data.copy_(saved_v)
                out0 = wrapped_m(input)
                out1 = wrapped_m(input)
                return out0 + out1

            gradcheck(fn, (input.clone().requires_grad_(),), check_batched_grad=False)

            # test removing
            pre_remove_out = wrapped_m(input)
            m = torch.nn.utils.remove_spectral_norm(m)
            self.assertEqual(wrapped_m(input), pre_remove_out)

            # same check after several forward passes; only the output of the
            # last pass is compared
            m = torch.nn.utils.spectral_norm(m)
            for _ in range(3):
                pre_remove_out = wrapped_m(input)
            m = torch.nn.utils.remove_spectral_norm(m)
            self.assertEqual(wrapped_m(input), pre_remove_out)

            # ---- eval-mode behaviour ----

            m = torch.nn.utils.spectral_norm(m)
            wrapped_m(input)
            last_train_out = wrapped_m(input)
            last_train_u = m.weight_u.clone()
            last_train_v = m.weight_v.clone()
            wrapped_m.zero_grad()
            wrapped_m.eval()

            eval_out0 = wrapped_m(input)
            # assert eval gives same result as last training iteration
            self.assertEqual(eval_out0, last_train_out)
            # assert that doing more iterations in eval doesn't change things
            self.assertEqual(eval_out0, wrapped_m(input))
            self.assertEqual(last_train_u, m.weight_u)
            self.assertEqual(last_train_v, m.weight_v)

            # FIXME: the code below is flaky when executed with DataParallel
            # see https://github.com/pytorch/pytorch/issues/13818
            if apply_dp:
                continue

            # test backward works with multiple forwards in mixed training
            # and eval modes
            # it uses training mode so we need to reset `u` and `v` vectors
            # to same value at beginning for finite difference test to pass
            saved_u = m.weight_u.clone()
            saved_v = m.weight_v.clone()

            def fn(input):
                m.weight_u.data.copy_(saved_u)
                m.weight_v.data.copy_(saved_v)
                wrapped_m.train()
                out0 = wrapped_m(input)
                wrapped_m.eval()
                out1 = wrapped_m(input)
                wrapped_m.train()
                out2 = wrapped_m(input)
                wrapped_m.eval()
                out3 = wrapped_m(input)
                return out0 + out1 + out2 + out3

            gradcheck(fn, (input.clone().requires_grad_(),))

            # assert that backprop reaches weight_orig in eval
            if requires_grad:
                def fn(weight):
                    # `weight` is unused in the body; gradcheck is handed
                    # m.weight_orig itself as the input below
                    return wrapped_m(input)

                gradcheck(fn, (m.weight_orig,))
| |
@skipIfNoLapack
def test_spectral_norm_load_state_dict(self):
    """Check ``load_state_dict`` on a spectral-normed ``nn.Linear``.

    Covers strict vs. non-strict loading with keys progressively removed,
    a hand-crafted "version None" state dict (no metadata entry, no
    ``weight_v``, ``weight`` stored as an entry), and the latest-format
    state dict with and without its ``spectral_norm`` metadata.
    """
    inp = torch.randn(2, 3)
    for activate_times in (0, 3):
        # Test backward compatibility
        # At version None -> 1: weight becomes not a buffer and v vector becomes a buffer
        m = nn.Linear(3, 5)
        snm = torch.nn.utils.spectral_norm(m)
        snm.train()
        for _ in range(activate_times):
            snm(inp)

        version_latest_ref_state_dict = deepcopy(snm.state_dict())
        self.assertEqual({'weight_orig', 'bias', 'weight_u', 'weight_v'}, set(version_latest_ref_state_dict.keys()))

        # test that non-strict loading works
        non_strict_state_dict = deepcopy(version_latest_ref_state_dict)
        non_strict_state_dict['nonsense'] = 'nonsense'
        with self.assertRaisesRegex(RuntimeError, r'Unexpected key\(s\) in state_dict: "nonsense"'):
            snm.load_state_dict(non_strict_state_dict, strict=True)
        # from here on every non-strict load is only expected not to raise
        snm.load_state_dict(non_strict_state_dict, strict=False)
        del non_strict_state_dict['weight_orig']
        snm.load_state_dict(non_strict_state_dict, strict=False)
        del non_strict_state_dict['weight_u']
        snm.load_state_dict(non_strict_state_dict, strict=False)
        del non_strict_state_dict['weight_v']
        snm.load_state_dict(non_strict_state_dict, strict=False)
        non_strict_state_dict['weight'] = snm.weight.detach().clone()  # set W as a buffer
        snm.load_state_dict(non_strict_state_dict, strict=False)
        del non_strict_state_dict._metadata['']['spectral_norm']       # remove metadata info
        snm.load_state_dict(non_strict_state_dict, strict=False)
        del non_strict_state_dict['weight']                            # remove W buffer
        snm.load_state_dict(non_strict_state_dict, strict=False)
        del non_strict_state_dict['bias']
        snm.load_state_dict(non_strict_state_dict, strict=False)

        # craft a version None state_dict
        version_none_state_dict = deepcopy(version_latest_ref_state_dict)
        self.assertIn('spectral_norm', version_none_state_dict._metadata[''])
        del version_none_state_dict._metadata['']['spectral_norm']       # remove metadata info
        del version_none_state_dict['weight_v']                          # remove v vector
        version_none_state_dict['weight'] = snm.weight.detach().clone()  # set W as a buffer

        # normal state_dict
        for version_latest_with_metadata in [True, False]:
            version_latest_state_dict = deepcopy(version_latest_ref_state_dict)

            if not version_latest_with_metadata:
                # We want to still load a user-crafted state_dict, one without metadata
                del version_latest_state_dict._metadata['']['spectral_norm']

            # test that re-wrapping does not matter
            m = torch.nn.utils.remove_spectral_norm(snm)
            snm = torch.nn.utils.spectral_norm(m)

            # reference outputs, recorded from the untouched latest-format state dict
            snm.load_state_dict(version_latest_ref_state_dict)
            with torch.no_grad():
                snm.eval()
                out0_eval = snm(inp)
                snm.train()
                out1_train = snm(inp)
                out2_train = snm(inp)
                snm.eval()
                out3_eval = snm(inp)

            # test that re-wrapping does not matter
            m = torch.nn.utils.remove_spectral_norm(snm)
            snm = torch.nn.utils.spectral_norm(m)

            snm.load_state_dict(version_none_state_dict)
            if activate_times > 0:
                # since in loading version None state dict, we assume that the
                # values in the state dict have gone through at least one
                # forward, we only test for equivalence when activate_times > 0.
                with torch.no_grad():
                    snm.eval()
                    self.assertEqual(out0_eval, snm(inp))
                    snm.train()
                    self.assertEqual(out1_train, snm(inp))
                    self.assertEqual(out2_train, snm(inp))
                    snm.eval()
                    self.assertEqual(out3_eval, snm(inp))

            # test that re-wrapping does not matter
            m = torch.nn.utils.remove_spectral_norm(snm)
            snm = torch.nn.utils.spectral_norm(m)

            # Test normal loading
            snm.load_state_dict(version_latest_state_dict)
            with torch.no_grad():
                snm.eval()
                self.assertEqual(out0_eval, snm(inp))
                snm.train()
                self.assertEqual(out1_train, snm(inp))
                self.assertEqual(out2_train, snm(inp))
                snm.eval()
                self.assertEqual(out3_eval, snm(inp))
| |
| def test_spectral_norm_dim(self): |
| inp = torch.randn(2, 3, 10, 12) |
| m = nn.ConvTranspose2d(3, 4, (5, 6)) |
| m = torch.nn.utils.spectral_norm(m) |
| # this should not run into incompatible shapes |
| x = m(inp) |
| # check that u refers to the same dimension |
| self.assertEqual(m.weight_u.shape, m.weight_orig[0, :, 0, 0].shape) |
| |
| def test_spectral_norm_forward(self): |
| input = torch.randn(3, 5) |
| m = nn.Linear(5, 7) |
| m = torch.nn.utils.spectral_norm(m) |
| # naive forward |
| _weight, _bias, _u = m.weight_orig, m.bias, m.weight_u |
| _weight_mat = _weight.view(_weight.size(0), -1) |
| _v = torch.mv(_weight_mat.t(), _u) |
| _v = F.normalize(_v, dim=0, eps=1e-12) |
| _u = torch.mv(_weight_mat, _v) |
| _u = F.normalize(_u, dim=0, eps=1e-12) |
| _weight.data /= torch.dot(_u, torch.matmul(_weight_mat, _v)) |
| out_hat = torch.nn.functional.linear(input, _weight, _bias) |
| expect_out = m(input) |
| self.assertEqual(expect_out, out_hat) |
| |
| def test_spectral_norm_pickle(self): |
| m = torch.nn.utils.spectral_norm(nn.Linear(5, 7)) |
| m = pickle.loads(pickle.dumps(m)) |
| self.assertIsInstance(m, nn.Linear) |
| |
| def test_threshold_int(self): |
| x = torch.tensor([-3, -2, -1, 0, 1, 2, 3]) |
| expected = torch.tensor([99, 99, 99, 99, 1, 2, 3]) |
| self.assertEqual(F.threshold(x, 0, 99), expected) |
| |
| def test_threshold_bfloat16_half(self): |
| x = torch.randn(100) |
| for dtype in [torch.bfloat16, torch.half]: |
| for threshold in [0, -0.5, 0.5, float('inf'), float('-inf'), float('nan')]: |
| expected = F.threshold(x, threshold, 0).to(dtype=dtype).float() |
| res_bf16 = F.threshold(x.to(dtype=dtype), threshold, 0).float() |
| self.assertEqual(res_bf16, expected) |
| |
| @unittest.skipUnless('fbgemm' in torch.backends.quantized.supported_engines, |
| 'Linear_FP16_weight requires FBGEMM. FBGEMM is only optimized for CPUs' |
| ' with instruction set support avx2 or newer.') |
| def test_fb_fc_packed(self): |
| X = np.random.rand(16, 16).astype(np.float32) - 0.5 |
| W = np.random.rand(16, 16).astype(np.float32) - 0.5 |
| b = np.random.rand(16).astype(np.float32) - 0.5 |
| |
| def fc_op(X, W, b): |
| return np.dot(X, W.T) + b |
| |
| x_tensor = torch.tensor(X) |
| w_tensor = torch.tensor(W) |
| b_tensor = torch.tensor(b) |
| packed_w_tensor = torch.fbgemm_pack_gemm_matrix_fp16(w_tensor) |
| actual_output = torch.fbgemm_linear_fp16_weight(x_tensor, packed_w_tensor, b_tensor) |
| expected_output = fc_op(X, W, b) |
| torch.testing.assert_close(torch.from_numpy(expected_output), actual_output.cpu(), atol=1e-3, rtol=1e-3) |
| |
| def test_pad_scalar_error(self): |
| inputs = torch.tensor(0., requires_grad=True) |
| self.assertRaises(RuntimeError, lambda: F.pad(inputs, (1, 1))) |
| self.assertRaises(RuntimeError, lambda: F.pad(inputs, (1,))) |
| |
| def test_nested_tensor_from_mask(self): |
| N, L, D = 10, 12, 14 |
| |
| input = torch.rand(N, L, D) |
| mask = torch.ones(N, L, dtype=torch.bool) |
| # Leave first row be all True to maintain the nt's size unchanged |
| for i in range(1, N): |
| end = torch.randint(1, L, size=()).item() |
| mask[i, end:] = False |
| |
| nt = torch._nested_tensor_from_mask(input, mask) |
| input_convert = nt.to_padded_tensor(0.) |
| input.masked_fill_(mask.reshape(N, L, 1).logical_not(), 0.) |
| |
| self.assertEqual(input, input_convert) |
| |
| def test_nested_tensor_from_mask_error(self): |
| N, L, D = 10, 12, 14 |
| |
| input = torch.rand(N, L, D) |
| # Mask is not bool |
| mask = torch.zeros(N, L, dtype=torch.float) |
| self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) |
| |
| # Mask size is not 2 |
| mask = torch.zeros(N, L, D, dtype=torch.bool) |
| self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) |
| |
| # Input size is not 3 |
| mask = torch.zeros(N, L, dtype=torch.bool) |
| input = torch.rand(N, L) |
| self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) |
| |
| # Mask size does not match input |
| mask = torch.zeros(N + 1, L + 1, dtype=torch.bool) |
| input = torch.rand(N, L, D) |
| self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) |
| |
| # Mask is not padding format |
| mask = torch.ones(N, L, dtype=torch.bool) |
| mask[0, 0] = False |
| mask[0, 2] = False |
| self.assertRaises(RuntimeError, lambda: torch._nested_tensor_from_mask(input, mask)) |
| |
| def test_normalize(self): |
| inputs = torch.randn(1, 3, 4, 4, requires_grad=True, dtype=torch.double) |
| self.assertTrue(gradcheck(lambda x: F.normalize(x, p=1, dim=-1), (inputs,))) |
| self.assertTrue(gradcheck(lambda x: F.normalize(x, p=2, dim=-2), (inputs,))) |
| |
| inputs = torch.randn((), requires_grad=True) |
| self.assertTrue(gradcheck(lambda x: F.normalize(x, p=1, dim=-1), (inputs,))) |
| |
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
# Skip the test for ROCm as per https://github.com/pytorch/pytorch/issues/53190
@skipIfRocm
def test_broadcast_double_backwards_gpu(self):
    """Grad and grad-grad checks for ``Broadcast`` to devices 0 and 1."""
    tensors = tuple(
        torch.randn(4, 4, device='cuda', requires_grad=True, dtype=torch.double)
        for _ in range(3)
    )

    def broadcast_to_both(*inputs):
        return Broadcast.apply((0, 1), *inputs)

    # TODO(#50743): the following segfaults with check_batched_grad=True
    _assertGradAndGradgradChecks(self, broadcast_to_both, tensors,
                                 check_batched_grad=False)
| |
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
def test_broadcast_not_requiring_grad(self):
    """Each broadcast output must carry the ``requires_grad`` flag of its source input."""
    grad_flags = (True, False, False, True, True)
    sources = [torch.randn(1, 2, device='cuda', requires_grad=flag) for flag in grad_flags]
    outputs = Broadcast.apply((0, 1), *sources)
    n_sources = len(sources)
    # Outputs are matched to inputs by position modulo the number of inputs.
    for position, copy in enumerate(outputs):
        source = sources[position % n_sources]
        self.assertEqual(source.requires_grad, copy.requires_grad)
| |
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
def test_broadcast_no_grad(self):
    """Under ``no_grad`` the broadcast copies must not require grad; the input still does."""
    source = torch.randn(1, 2, dtype=torch.float32, requires_grad=True, device='cuda')
    with torch.no_grad():
        copies = Broadcast.apply((0, 1), source)
    self.assertTrue(source.requires_grad)
    for copy in copies:
        self.assertFalse(copy.requires_grad)
| |
| def test_state_dict(self): |
| l = nn.Linear(5, 5) |
| block = nn.Module() |
| block.conv = nn.Conv2d(3, 3, 3, bias=False) |
| net = nn.Module() |
| net.linear1 = l |
| net.linear2 = l |
| net.bn = nn.BatchNorm2d(2) |
| net.block = block |
| net.add_module('empty', None) |
| |
| state_dict = net.state_dict() |
| self.assertEqual(len(state_dict), 10) |
| self.assertEqual(len(state_dict._metadata), 6) |
| self.assertIn('', state_dict._metadata) |
| self.assertIn('linear1', state_dict._metadata) |
| self.assertIn('linear1.weight', state_dict) |
| self.assertIn('linear1.bias', state_dict) |
| self.assertIn('linear2', state_dict._metadata) |
| self.assertIn('linear2.weight', state_dict) |
| self.assertIn('linear2.bias', state_dict) |
| self.assertIn('block', state_dict._metadata) |
| self.assertIn('block.conv', state_dict._metadata) |
| self.assertIn('block.conv.weight', state_dict) |
| self.assertIn('block.conv.weight', state_dict) |
| self.assertNotIn('block.conv.bias', state_dict) |
| self.assertIn('bn', state_dict._metadata) |
| self.assertIn('bn.weight', state_dict) |
| self.assertIn('bn.bias', state_dict) |
| self.assertIn('bn.running_var', state_dict) |
| self.assertIn('bn.running_mean', state_dict) |
| self.assertIn('bn.num_batches_tracked', state_dict) |
| self.assertFalse(any(k.startswith('empty') for k in state_dict.keys())) |
| for k, v in state_dict.items(): |
| param = net |
| for component in k.split('.'): |
| param = getattr(param, component) |
| if isinstance(param, Parameter): |
| param = param.data |
| self.assertEqual(v.data_ptr(), param.data_ptr()) |
| |
| l = nn.Linear(5, 5) |
| state_dict = l.state_dict() |
| self.assertEqual(len(state_dict), 2) |
| self.assertEqual(len(state_dict._metadata), 1) |
| self.assertIn('', state_dict._metadata) |
| self.assertTrue(state_dict._metadata['']['version'] >= 0) |
| self.assertEqual(state_dict['weight'].data_ptr(), l.weight.data_ptr()) |
| self.assertEqual(state_dict['bias'].data_ptr(), l.bias.data_ptr()) |
| |
| # Reference https://github.com/pytorch/pytorch/pull/75507#issuecomment-1110291545 |
| self.assertNotWarn(lambda: l.state_dict(destination={}), "Should not warn kwarg destination w/o _metadata") |
| |
| def test_extra_state(self): |
| |
| class SubModule(torch.nn.Module): |
| def __init__(self, foo): |
| super().__init__() |
| self.foo = foo |
| |
| def get_extra_state(self): |
| return { |
| 'foo': self.foo |
| } |
| |
| def set_extra_state(self, state): |
| self.foo = state['foo'] |
| |
| class MyModule(torch.nn.Module): |
| def __init__(self, foo, bar): |
| super().__init__() |
| self.sub = SubModule(foo) |
| self.bar = bar |
| |
| def get_extra_state(self): |
| return { |
| 'bar': self.bar |
| } |
| |
| def set_extra_state(self, state): |
| self.bar = state['bar'] |
| |
| # Ensure state_dict contains the extra state by loading it into another module. |
| m = MyModule(3, 'something') |
| m2 = MyModule(5, 'something else') |
| m2.load_state_dict(m.state_dict()) |
| self.assertEqual(m.state_dict(), m2.state_dict()) |
| self.assertEqual(m2.bar, m.bar) |
| self.assertEqual(m2.sub.foo, m.sub.foo) |
| |
| def test_extra_state_non_dict(self): |
| |
| class MyModule(torch.nn.Module): |
| def __init__(self, foo): |
| super().__init__() |
| self.foo = foo |
| |
| def get_extra_state(self): |
| return self.foo |
| |
| def set_extra_state(self, state): |
| self.foo = state |
| |
| # Test various types of extra state. |
| for state in ('something', 5, MyModule(3)): |
| m = MyModule(state) |
| m2 = MyModule('something else') |
| m2.load_state_dict(m.state_dict()) |
| self.assertEqual(m.state_dict(), m2.state_dict()) |
| self.assertEqual(m.foo, m2.foo) |
| |
| def test_extra_state_missing_set_extra_state(self): |
| |
| class MyModule(torch.nn.Module): |
| def get_extra_state(self): |
| return { |
| 'foo': 5 |
| } |
| |
| m = MyModule() |
| with self.assertRaisesRegex(RuntimeError, 'Unexpected key'): |
| m.load_state_dict(m.state_dict()) |
| |
| def test_extra_state_missing_get_extra_state(self): |
| |
| class MyModule(torch.nn.Module): |
| def set_extra_state(self): |
| pass |
| |
| m = MyModule() |
| with self.assertRaisesRegex(RuntimeError, 'Missing key'): |
| m.load_state_dict(m.state_dict()) |
| |
@skipIfTorchDynamo("TorchDynamo fails here for unknown reasons")
def test_parameter_assignment(self):
    """Only ``Parameter`` attributes count as parameters; plain tensors never do."""
    layer = nn.Linear(5, 5)

    def param_count():
        return sum(1 for _ in layer.parameters())

    self.assertEqual(param_count(), 2)

    # Assigning a Parameter registers it.
    extra_param = Parameter(torch.randn(5, 5))
    layer.param_name = extra_param
    self.assertEqual(param_count(), 3)
    self.assertObjectIn(extra_param, layer.parameters())

    # Assigning a plain tensor does not.
    plain = torch.randn(5, 5)
    layer.var_name = plain
    self.assertEqual(param_count(), 3)
    self.assertNotIn(id(plain), map(id, layer.parameters()))

    # Make sure Variables are not saved as parameters
    layer.variable_attr = torch.empty(5, 5)
    self.assertEqual(param_count(), 3)
    layer.param_attr = Parameter(torch.empty(5, 5))
    self.assertEqual(param_count(), 4)

    # It shouldn't be possible to replace a parameter with a Variable
    with self.assertRaises(TypeError):
        layer.param_attr = torch.empty(5, 5)

    # But replacing it with None should be fine
    layer.param_attr = None
    self.assertEqual(param_count(), 3)
| |
| def test_assignment(self): |
| l = nn.Module() |
| a = nn.Parameter(torch.randn(2)) |
| b = nn.Parameter(torch.randn(3)) |
| c = nn.Parameter(torch.randn(4)) |
| q = nn.Linear(4, 4) |
| r = nn.Linear(5, 5) |
| w = nn.Linear(6, 6) |
| |
| def test_assignments(get_list, a, b, c): |
| # Check that None can be shadowed |
| l.a = None |
| self.assertIsNone(l.a) |
| self.assertIn('a', l.__dict__) |
| l.a = a |
| self.assertIs(l.a, a) |
| self.assertEqual(get_list(), [a]) |
| self.assertNotIn('a', l.__dict__) |
| |
| # Assign second object |
| l.b = None |
| self.assertIsNone(l.b) |
| self.assertIn('b', l.__dict__) |
| l.b = b |
| self.assertIs(l.b, b) |
| self.assertEqual(get_list(), [a, b]) |
| self.assertNotIn('b', l.__dict__) |
| |
| # Remove and add the object back. Order should be unchanged. |
| l.a = None |
| self.assertIsNone(l.a) |
| self.assertEqual(get_list(), [b]) |
| l.a = a |
| self.assertIs(l.a, a) |
| self.assertEqual(get_list(), [a, b]) |
| |
| # Replace object with another one. Order should be unchanged. |
| l.a = c |
| self.assertIs(l.a, c) |
| self.assertEqual(get_list(), [c, b]) |
| |
| # Remove and reassign an attribute. It should appear at the end of the list now. |
| del l.a |
| self.assertFalse(hasattr(l, 'a')) |
| l.a = a |
| self.assertIs(l.a, a) |
| self.assertEqual(get_list(), [b, a]) |
| |
| test_assignments(lambda: list(l.parameters()), a, b, c) |
| del l.a, l.b |
| self.assertEqual(list(l.parameters()), []) |
| |
| test_assignments(lambda: list(l.children()), q, r, w) |
| del l.a, l.b |
| self.assertEqual(list(l.children()), []) |
| |
| buf = Buffer(torch.randn(10)) |
| l.buf = buf |
| self.assertIs(l.buf, buf) |
| l.buf = None |
| self.assertIs(l.buf, None) |
| self.assertNotIn('buf', l.__dict__) # should be stored in l._buffers |
| l.buf = buf |
| self.assertIn('buf', l.state_dict()) |
| self.assertEqual(l.state_dict()['buf'], buf) |
| |
| def test_container_copy(self): |
| class Model(nn.Module): |
| def __init__(self) -> None: |
| super().__init__() |
| self.linear = nn.Linear(4, 5) |
| |
| def forward(self, input): |
| return self.linear(input) |
| |
| input = torch.randn(2, 4) |
| |
| model = Model() |
| model_cp = deepcopy(model) |
| self.assertEqual(model(input).data, model_cp(input).data) |
| |
| model_cp.linear.weight.data[:] = 2 |
| self.assertNotEqual(model(input).data, model_cp(input).data) |
| |
| def test_RNN_cell(self): |
| # this is just a smoke test; these modules are implemented through |
| # autograd so no Jacobian test is needed |
| for module in (nn.RNNCell, nn.GRUCell): |
| for bias in (True, False): |
| input = torch.randn(3, 10) |
| hx = torch.randn(3, 20) |
| cell = module(10, 20, bias=bias) |
| for _ in range(6): |
| hx = cell(input, hx) |
| |
| hx.sum().backward() |
| |
| def test_RNN_cell_forward_zero_hidden_size(self): |
| input = torch.randn(3, 10) |
| hx = torch.randn(3, 0) |
| cell_shared_param = (10, 0) |
| for cell in (nn.RNNCell(*cell_shared_param, nonlinearity="relu"), |
| nn.RNNCell(*cell_shared_param, nonlinearity="tanh"), |
| nn.GRUCell(*cell_shared_param)): |
| self.assertEqual(cell(input, hx).shape, torch.Size([3, 0])) |
| |
def _test_loss_equal_input_target_shape(self, cast):
    """Check that each listed loss raises when input is (3, 5) and target is (5, 3).

    ``cast`` is applied to both tensors before they are passed to the losses.
    """
    # Tests losses whose inputs should have the same size.
    loss_fns = (
        F.mse_loss,
        F.l1_loss,
        F.smooth_l1_loss,
        F.huber_loss,
        F.kl_div,
        F.poisson_nll_loss,
    )

    prediction = cast(torch.randn(3, 5))
    mismatched_target = cast(torch.randn(5, 3))
    for loss_fn in loss_fns:
        with self.assertRaises(Exception):
            loss_fn(prediction, mismatched_target)
| |
| def test_loss_equal_input_target_shape(self): |
| self._test_loss_equal_input_target_shape(lambda x: x) |
| |
| def test_mse_loss_size_warning(self): |
| i = torch.randn((10, 1), requires_grad=True) |
| t = torch.randn((10,)) |
| with warnings.catch_warnings(record=True) as w: |
| # Ensure warnings are being shown |
| warnings.simplefilter("always") |
| # Trigger Warning |
| F.mse_loss(i, t) |
| # Check warning occurs |
| self.assertEqual(len(w), 1) |
| self.assertIn('Please ensure they have the same size.', str(w[0])) |
| |
| def test_gaussian_nll_loss_broadcasting(self): |
| input = torch.tensor([[0.5, 1.5, 2.5], [2., 4., 6.]]) |
| target_full = torch.tensor([[1., 2., 3.], [1., 2., 3.]]) |
| target_part = torch.tensor([[1., 2., 3.]]) |
| var_full = torch.tensor([[0.5, 0.5, 0.5], [1.5, 1.5, 1.5]]) |
| var_part1 = torch.tensor([[0.5], [1.5]]) |
| var_part2 = torch.tensor([0.5, 1.5]) |
| component_wise_loss = 0.5 * (torch.log(var_full) + (input - target_full)**2 / var_full) |
| self.assertEqual(component_wise_loss, |
| F.gaussian_nll_loss(input, target_part, var_full, reduction='none')) |
| self.assertEqual(component_wise_loss, |
| F.gaussian_nll_loss(input, target_full, var_part1, reduction='none')) |
| self.assertEqual(component_wise_loss, |
| F.gaussian_nll_loss(input, target_full, var_part2, reduction='none')) |
| self.assertEqual(component_wise_loss, |
| F.gaussian_nll_loss(input, target_part, var_part1, reduction='none')) |
| self.assertEqual(component_wise_loss, |
| F.gaussian_nll_loss(input, target_part, var_part2, reduction='none')) |
| |
| def test_gaussian_nll_loss_args(self): |
| input = torch.randn(3, 5) |
| with self.assertRaisesRegex(ValueError, 'var is of incorrect size'): |
| target = torch.randn(3, 5) |
| var = torch.ones(3, 3) |
| torch.nn.functional.gaussian_nll_loss(input, target, var) |
| with self.assertRaisesRegex(ValueError, 'var has negative entry/entries'): |
| var = -1 * torch.ones(3, 5) |
| torch.nn.functional.gaussian_nll_loss(input, target, var) |
| |
| def test_KLDivLoss_batch_mean(self): |
| input_shape = (2, 5) |
| log_prob1 = F.log_softmax(torch.randn(input_shape), 1) |
| prob2 = F.softmax(torch.randn(input_shape), 1) |
| |
| loss = nn.KLDivLoss(reduction='batchmean') |
| l = loss(log_prob1, prob2) |
| |
| loss_none_reduce = nn.KLDivLoss(reduction='sum')(log_prob1, prob2) |
| expected = loss_none_reduce / input_shape[0] |
| |
| self.assertEqual(l, expected) |
| |
| def test_KLDivLoss_batch_mean_log_target(self): |
| input_shape = (2, 5) |
| log_prob1 = F.log_softmax(torch.randn(input_shape), 1) |
| log_prob2 = F.log_softmax(torch.randn(input_shape), 1) |
| |
| loss = nn.KLDivLoss(reduction='batchmean', log_target=True) |
| l = loss(log_prob1, log_prob2) |
| |
| loss_none_reduce = nn.KLDivLoss(reduction='sum', log_target=True)(log_prob1, log_prob2) |
| expected = loss_none_reduce / input_shape[0] |
| |
| self.assertEqual(l, expected) |
| |
| def test_CTCLoss_typechecks(self): |
| target_lengths = torch.tensor([30, 25, 20]) |
| input_lengths = torch.tensor([50, 50, 50]) |
| targets = torch.randint(1, 15, (sum(target_lengths),), dtype=torch.int) |
| log_probs = torch.randn(50, 3, 15, dtype=torch.float).log_softmax(2) |
| with self.assertRaises(RuntimeError): |
| _input_lengths = input_lengths.to(dtype=torch.float) |
| torch.nn.functional.ctc_loss(log_probs, targets, _input_lengths, target_lengths) |
| with self.assertRaises(RuntimeError): |
| target_lengths = target_lengths.to(dtype=torch.float) |
| torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths) |
| |
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_CTCLoss_lengthchecks_cuda(self):
    """On CUDA, ``ctc_loss`` must raise for every tested length combination."""
    target_length_options = ([30, 25, 20], [-1, -1, -1])
    input_length_options = ([50, 50, 50], [-1, -1, -1])
    for target_lengths, input_lengths in itertools.product(target_length_options, input_length_options):
        targets = torch.randint(1, 15, (3, 29), dtype=torch.long, device='cuda')
        log_probs = torch.randn(50, 3, 15, dtype=torch.float, device='cuda').log_softmax(2)
        with self.assertRaises(RuntimeError):
            torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths)
| |
| def test_CTCLoss_lengthchecks_cpu(self): |
| for target_lengths in [[30, 25, 20], [-1, -1, -1]]: |
| for input_lengths in [[50, 50, 50], [-1, -1, -1]]: |
| targets = torch.randint(1, 15, (3, 29), dtype=torch.int) |
| log_probs = torch.randn(50, 3, 15, dtype=torch.float).log_softmax(2) |
| with self.assertRaises(RuntimeError): |
| torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths) |
| |
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_CTCLoss_long_targets(self):
    """CPU and CUDA (cuDNN disabled) ``ctc_loss`` must agree on long targets, loss and grad."""
    input_length = 4000
    vocab_size = 3
    batch_size = 4
    target_length = 1200

    log_probs = torch.randn(input_length, batch_size, vocab_size, dtype=torch.double).log_softmax(2).requires_grad_()
    targets = torch.randint(low=1, high=vocab_size - 1, size=(batch_size, target_length), dtype=torch.long)
    input_lengths = [input_length] * batch_size
    target_lengths = [target_length] * batch_size
    loss_kwargs = dict(reduction='sum', zero_infinity=True)

    # CPU reference loss and gradient.
    loss_cpu = F.ctc_loss(log_probs, targets, input_lengths, target_lengths, **loss_kwargs)
    upstream_grad = torch.randn_like(loss_cpu)
    (grad_cpu,) = torch.autograd.grad(loss_cpu, log_probs, upstream_grad)

    # Same computation on CUDA with cuDNN switched off.
    with cudnn.flags(enabled=False):
        loss_gpu = F.ctc_loss(log_probs.cuda(), targets.cuda(), input_lengths, target_lengths, **loss_kwargs)
        (grad_gpu,) = torch.autograd.grad(loss_gpu, log_probs, upstream_grad.cuda())

    self.assertEqual(loss_cpu, loss_gpu, atol=1e-4, rtol=0)
    self.assertEqual(grad_cpu, grad_gpu, atol=1e-4, rtol=0)
| |
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_CTCLoss_critical_target_len(self):
    # cudnn has an unexpected problem with target length 256, see issue #53505
    # The loss computed on the GPU (cuDNN enabled) must agree with the CPU loss.
    ctc_loss = torch.nn.functional.ctc_loss
    batch = 1
    target_len = 256
    num_classes = 10
    time_steps = 500
    target = torch.randint(low=1, high=num_classes, size=(target_len,), dtype=torch.int)
    input_lengths = torch.full(size=(batch,), fill_value=time_steps, dtype=torch.int)
    target_lengths = torch.tensor(target_len, dtype=torch.int)
    inp = torch.randn(time_steps, batch, num_classes, dtype=torch.float, device='cuda').log_softmax(2).requires_grad_()
    with cudnn.flags(enabled=True):
        res_gpu = ctc_loss(inp, target, input_lengths, target_lengths, reduction='none')
    res_cpu = ctc_loss(inp.cpu(), target, input_lengths, target_lengths, reduction='none')
    self.assertEqual(res_cpu, res_gpu, atol=1e-3, rtol=0)
| |
def test_CTCLoss_zero_lengths(self):
    # With all input lengths set to zero, the test expects:
    #   * target length 0 -> per-sample loss exactly 0
    #   * target length 1 -> per-sample loss exactly +inf
    # and, in both situations, an all-zero gradient w.r.t. the input.
    ctc_loss = torch.nn.functional.ctc_loss
    devices = ['cpu'] + (['cuda'] if TEST_CUDA else [])
    batch, max_target_len, num_classes, time_steps = 3, 2, 200, 1
    target = torch.randint(low=1, high=num_classes, size=(batch, max_target_len), dtype=torch.int)
    input_lengths = torch.full(size=(batch,), fill_value=0, dtype=torch.int)

    for target_len, expected_loss in ((0, 0), (1, torch.inf)):
        target_lengths = torch.full(size=(batch,), fill_value=target_len, dtype=torch.int)
        for device in devices:
            inp = torch.randn(time_steps, batch, num_classes, dtype=torch.float, device=device)
            inp = inp.log_softmax(2).requires_grad_()
            res = ctc_loss(inp, target, input_lengths, target_lengths, reduction='none')
            self.assertTrue((res == expected_loss).all().item())
            res.sum().backward()
            self.assertTrue((inp.grad == 0).all().item())
| |
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_CTCLoss_zero_infinity(self):
    # The first target (60 labels) is longer than the 50 input steps; with
    # zero_infinity=True the three implementations exercised here (default CUDA
    # dispatch, CUDA with cuDNN disabled, CPU) must agree on loss and gradient,
    # and the gradient must be NaN-free.
    ctc_loss = torch.nn.functional.ctc_loss
    loss_options = dict(reduction='sum', zero_infinity=True)
    target_lengths = [60, 25, 20]
    input_lengths = [50, 50, 50]
    targets = torch.randint(1, 15, (sum(target_lengths),), dtype=torch.int, device='cuda')
    log_probs = torch.randn(50, 3, 15, dtype=torch.float, device='cuda').log_softmax(2).requires_grad_()

    res = ctc_loss(log_probs, targets, input_lengths, target_lengths, **loss_options)
    with torch.backends.cudnn.flags(enabled=False):
        res2 = ctc_loss(log_probs, targets.cuda().long(), input_lengths, target_lengths, **loss_options)
    res_cpu = ctc_loss(log_probs.cpu(), targets.cpu(), input_lengths, target_lengths, **loss_options)

    self.assertEqual(res2, res, atol=1e-4, rtol=0)
    self.assertEqual(res_cpu, res.cpu(), atol=1e-4, rtol=0)
    (g1,) = torch.autograd.grad(res, log_probs)
    (g2,) = torch.autograd.grad(res2, log_probs)
    (g3,) = torch.autograd.grad(res_cpu, log_probs)
    self.assertEqual(g2, g3, atol=1e-4, rtol=0)
    self.assertEqual(g1, g2, atol=1e-4, rtol=0)
    self.assertFalse(torch.isnan(g1).any().item())  # check that we don't have NaN
| |
| def test_RNN_cell_no_broadcasting(self): |
| def test(cell_module, input, hx, input_size, hidden_size): |
| cell = cell_module(input_size, hidden_size) |
| self.assertRaises(RuntimeError, lambda: cell(input, hx)) |
| |
| def test_all(hidden_size, bad_hx, good_hx, input_size, input): |
| test(nn.RNNCell, input, bad_hx, input_size, hidden_size) |
| test(nn.GRUCell, input, bad_hx, input_size, hidden_size) |
| test(nn.LSTMCell, input, (bad_hx, good_hx), input_size, hidden_size) |
| test(nn.LSTMCell, input, (good_hx, bad_hx), input_size, hidden_size) |
| |
| hidden_size = 20 |
| input_size = 10 |
| input = torch.randn(3, input_size) |
| bad_hx = torch.randn(1, hidden_size) |
| good_hx = torch.randn(3, hidden_size) |
| |
| # Test hidden/input batch size broadcasting |
| test_all(hidden_size, bad_hx, good_hx, input_size, input) |
| |
| # Test hx's hidden_size vs module's hidden_size broadcasting |
| bad_hx = torch.randn(3, 1) |
| test_all(hidden_size, bad_hx, good_hx, input_size, input) |
| |
| # Test input's input_size vs module's input_size broadcasting |
| bad_input = torch.randn(3, 1) |
| test_all(hidden_size, good_hx, good_hx, input_size, bad_input) |
| |
| def test_LSTM_cell(self): |
| # this is just a smoke test; these modules are implemented through |
| # autograd so no Jacobian test is needed |
| for bias in (True, False): |
| input = torch.randn(3, 10) |
| hx = torch.randn(3, 20) |
| cx = torch.randn(3, 20) |
| lstm = nn.LSTMCell(10, 20, bias=bias) |
| for _ in range(6): |
| hx, cx = lstm(input, (hx, cx)) |
| |
| (hx + cx).sum().backward() |
| |
| def test_LSTM_cell_forward_input_size(self): |
| input = torch.randn(3, 11) |
| hx = torch.randn(3, 20) |
| cx = torch.randn(3, 20) |
| lstm = nn.LSTMCell(10, 20) |
| self.assertRaises(Exception, lambda: lstm(input, (hx, cx))) |
| |
| def test_LSTM_cell_forward_hidden_size(self): |
| input = torch.randn(3, 10) |
| hx = torch.randn(3, 21) |
| cx = torch.randn(3, 20) |
| lstm = nn.LSTMCell(10, 20) |
| self.assertRaises(Exception, lambda: lstm(input, (hx, cx))) |
| self.assertRaises(Exception, lambda: lstm(input, (cx, hx))) |
| |
| |
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_pack_sequence_batch_sizes_throw(self):
    # PackedSequence must reject a batch_sizes tensor that lives on the GPU.
    # The operands are prepared outside the assertion context so that only the
    # constructor call is checked for the ValueError; no RNN module is needed
    # to trigger the validation, so none is allocated.
    data = torch.rand(5, 3, device='cuda')
    batch_sizes = torch.tensor([1, 1, 1, 1, 1], device='cuda')
    with self.assertRaisesRegex(ValueError, r"batch_sizes should always be on CPU"):
        nn.utils.rnn.PackedSequence(data, batch_sizes)
| |
| def test_Transformer_cell(self): |
| # this is just a smoke test; these modules are implemented through |
| # autograd so no Jacobian test is needed |
| d_model = 512 |
| nhead = 16 |
| num_encoder_layers = 4 |
| num_decoder_layers = 3 |
| dim_feedforward = 256 |
| dropout = 0.3 |
| bsz = 8 |
| seq_length = 35 |
| tgt_length = 15 |
| for batch_first, src_size, tgt_size in zip((True, False), |
| [(bsz, seq_length, d_model), |
| (seq_length, bsz, d_model)], |
| [(bsz, tgt_length, d_model), |
| (tgt_length, bsz, d_model)]): |
| transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, |
| dim_feedforward, dropout, batch_first=batch_first, |
| dtype=torch.double) |
| src = torch.randn(src_size, dtype=torch.double) |
| src_mask = transformer.generate_square_subsequent_mask(seq_length).double() |
| tgt = torch.randn(tgt_size, dtype=torch.double) |
| tgt_mask = transformer.generate_square_subsequent_mask(tgt_length).double() |
| memory_mask = torch.randn(tgt_length, seq_length).double() |
| src_key_padding_mask = torch.rand(bsz, seq_length) >= 0.5 |
| tgt_key_padding_mask = torch.rand(bsz, tgt_length) >= 0.5 |
| memory_key_padding_mask = torch.rand(bsz, seq_length) >= 0.5 |
| |
| output = transformer(src, tgt, |
| src_mask=src_mask, |
| tgt_mask=tgt_mask, |
| memory_mask=memory_mask, |
| src_key_padding_mask=src_key_padding_mask, |
| tgt_key_padding_mask=tgt_key_padding_mask, |
| memory_key_padding_mask=memory_key_padding_mask) |
| output.sum().backward() |
| |
| def test_transformerdecoderlayer(self): |
| # this is a deterministic test for TransformerDecoderLayer |
| d_model = 4 |
| nhead = 2 |
| dim_feedforward = 16 |
| dropout = 0.0 |
| bsz = 2 |
| seq_length = 5 |
| tgt_length = 3 |
| |
| for batch_first in (False, True): |
| def perm_fn(x): |
| return x.transpose(1, 0) if batch_first else x |
| |
| model = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout, |
| batch_first=batch_first) |
| |
| # set constant weights of the model |
| for idx, p in enumerate(model.parameters()): |
| x = p.data |
| sz = x.view(-1).size(0) |
| shape = x.shape |
| x = torch.cos(torch.arange(0, sz).float().view(shape)) |
| p.data.copy_(x) |
| |
| # deterministic input |
| decoder_input = torch.tensor([[[20., 30., 40., 50.]]]) |
| memory_input = torch.tensor([[[60., 70., 80., 90.]]]) |
| result = model(decoder_input, memory_input) |
| ref_output = torch.tensor([[[2.314351, 0.094805, -0.671322, 0.101977]]]) |
| result = result.detach().numpy() |
| ref_output = ref_output.detach().numpy() |
| self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) |
| np.testing.assert_allclose(result, ref_output, atol=1e-5) |
| |
| # deterministic input |
| decoder_input = perm_fn(torch.tensor([[[9., 10., 11., 12.]], |
| [[11., 12., 13., 14.]]])) |
| memory_input = torch.tensor([[[1., 2., 3., 4.]]]) |
| result = model(decoder_input, memory_input) |
| result = result.detach().numpy() |
| ref_output = perm_fn(torch.tensor([[[2.422245, 0.051716, -0.606338, -0.024756]], |
| [[2.422245, 0.051716, -0.606338, -0.024756]]])) |
| ref_output = ref_output.detach().numpy() |
| self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) |
| np.testing.assert_allclose(result, ref_output, atol=1e-5) |
| |
| # deterministic input |
| decoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]], |
| [[5., 6., 7., 8.]]])) |
| memory_input = perm_fn(torch.tensor([[[9., 10., 11., 12.]], |
| [[11., 12., 13., 14.]]])) |
| result = model(decoder_input, memory_input) |
| ref_output = perm_fn(torch.tensor([[[2.343536, 0.085561, -0.654954, 0.074991]], |
| [[2.343536, 0.085561, -0.654954, 0.074991]]])) |
| result = result.detach().numpy() |
| ref_output = ref_output.detach().numpy() |
| self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) |
| np.testing.assert_allclose(result, ref_output, atol=1e-5) |
| |
| # deterministic input |
| decoder_input = perm_fn(torch.tensor([[[0.4517, 0.6793, 0.5313, 0.0034], |
| [0.2678, 0.3677, 0.4459, 0.7166]], |
| [[0.8100, 0.3716, 0.4096, 0.1976], |
| [0.6958, 0.8844, 0.6081, 0.8315]], |
| [[0.0494, 0.9343, 0.5955, 0.3830], |
| [0.5404, 0.3464, 0.9378, 0.6200]]])) |
| memory_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891], |
| [0.5387, 0.1655, 0.3565, 0.0471]], |
| [[0.8335, 0.2799, 0.5031, 0.2947], |
| [0.1402, 0.0318, 0.7636, 0.1346]], |
| [[0.6333, 0.9344, 0.1376, 0.9938], |
| [0.8924, 0.2872, 0.6692, 0.2944]], |
| [[0.9897, 0.6915, 0.3154, 0.1733], |
| [0.8645, 0.3513, 0.3064, 0.0767]], |
| [[0.8117, 0.2366, 0.4838, 0.7881], |
| [0.3718, 0.4945, 0.9511, 0.0864]]])) |
| result = model(decoder_input, memory_input) |
| ref_output = perm_fn(torch.tensor([[[2.430065, 0.027862, -0.601136, -0.073096], |
| [2.431935, 0.028907, -0.599809, -0.072488]], |
| [[2.428457, 0.027053, -0.602275, -0.073462], |
| [2.431970, 0.029387, -0.599789, -0.071621]], |
| [[2.431934, 0.028196, -0.599802, -0.073809], |
| [2.432306, 0.028858, -0.599542, -0.072846]]])) |
| result = result.detach().numpy() |
| ref_output = ref_output.detach().numpy() |
| self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) |
| np.testing.assert_allclose(result, ref_output, atol=1e-5) |
| |
| # key_padding_mask |
| key_padding_mask = torch.zeros(2, 3) == 1 |
| result = model(decoder_input, memory_input, tgt_key_padding_mask=key_padding_mask) |
| ref_output = perm_fn(torch.tensor([[[2.430065, 0.027862, -0.601136, -0.073096], |
| [2.431935, 0.028907, -0.599809, -0.072488]], |
| [[2.428457, 0.027053, -0.602275, -0.073462], |
| [2.431970, 0.029387, -0.599789, -0.071621]], |
| [[2.431934, 0.028196, -0.599802, -0.073809], |
| [2.432306, 0.028858, -0.599542, -0.072846]]])) |
| result = result.detach().numpy() |
| ref_output = ref_output.detach().numpy() |
| self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) |
| np.testing.assert_allclose(result, ref_output, atol=1e-5) |
| |
| # key_padding_mask |
| key_padding_mask[0, 2] = 1 |
| key_padding_mask[1, 1] = 1 |
| key_padding_mask[1, 2] = 1 |
| result = model(decoder_input, memory_input, tgt_key_padding_mask=key_padding_mask) |
| ref_output = perm_fn(torch.tensor([[[2.430025, 0.027643, -0.601164, -0.073476], |
| [2.4323, 0.029375, -0.599553, -0.071881]], |
| [[2.428523, 0.026838, -0.602226, -0.07391], |
| [2.432634, 0.029842, -0.599318, -0.071253]], |
| [[2.432278, 0.028152, -0.599555, -0.074139], |
| [2.432659, 0.029244, -0.599294, -0.072382]]])) |
| result = result.detach().numpy() |
| ref_output = ref_output.detach().numpy() |
| self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) |
| np.testing.assert_allclose(result, ref_output, atol=1e-5) |
| |
| # memory_key_padding_mask |
| key_padding_mask = torch.zeros(2, 5) == 1 |
| result = model(decoder_input, memory_input, memory_key_padding_mask=key_padding_mask) |
| ref_output = perm_fn(torch.tensor([[[2.430065, 0.027862, -0.601136, -0.073096], |
| [2.431935, 0.028907, -0.599809, -0.072488]], |
| [[2.428457, 0.027053, -0.602275, -0.073462], |
| [2.431970, 0.029387, -0.599789, -0.071621]], |
| [[2.431934, 0.028196, -0.599802, -0.073809], |
| [2.432306, 0.028858, -0.599542, -0.072846]]])) |
| result = result.detach().numpy() |
| ref_output = ref_output.detach().numpy() |
| self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) |
| np.testing.assert_allclose(result, ref_output, atol=1e-5) |
| |
| # memory_key_padding_mask |
| key_padding_mask[0, 4] = 1 |
| key_padding_mask[1, 3] = 1 |
| key_padding_mask[1, 4] = 1 |
| result = model(decoder_input, memory_input, memory_key_padding_mask=key_padding_mask) |
| ref_output = perm_fn(torch.tensor([[[2.429757, 0.027358, -0.601351, -0.073816], |
| [2.432692, 0.028583, -0.599263, -0.073634]], |
| [[2.428247, 0.02662, -0.602419, -0.074123], |
| [2.432657, 0.029055, -0.599293, -0.072732]], |
| [[2.431515, 0.027687, -0.600096, -0.074459], |
| [2.433075, 0.028543, -0.598987, -0.073985]]])) |
| result = result.detach().numpy() |
| ref_output = ref_output.detach().numpy() |
| self.assertEqual(tuple(result.shape), tuple(ref_output.shape)) |
| np.testing.assert_allclose(result, ref_output, atol=1e-5) |
| |
@set_default_dtype(torch.double)
def test_transformerdecoderlayer_gelu(self):
    # this is a deterministic test for TransformerDecoderLayer with gelu activation
    # The activation is supplied as a string, a function and a module; every
    # variant must reproduce the same hard-coded reference outputs for both
    # batch_first settings.
    d_model = 4
    nhead = 2
    dim_feedforward = 16
    dropout = 0.0

    # Each case: (decoder input, memory input, expected output, apply perm_fn?).
    # Data is written in the batch_first=False layout.
    cases = (
        ([[[20., 30., 40., 50.]]],
         [[[60., 70., 80., 90.]]],
         [[[2.306435, 0.095946, -0.675796, 0.10687]]],
         False),
        ([[[9., 10., 11., 12.]],
          [[11., 12., 13., 14.]]],
         [[[1., 2., 3., 4.]]],
         [[[2.415448, 0.054389, -0.610932, -0.0156613]],
          [[2.415448, 0.054389, -0.610932, -0.0156613]]],
         True),
        ([[[1., 2., 3., 4.]],
          [[5., 6., 7., 8.]]],
         [[[9., 10., 11., 12.]],
          [[11., 12., 13., 14.]]],
         [[[2.338531, 0.087709, -0.65776, 0.080646]],
          [[2.338531, 0.087709, -0.65776, 0.080646]]],
         True),
        ([[[0.4517, 0.6793, 0.5313, 0.0034],
           [0.2678, 0.3677, 0.4459, 0.7166]],
          [[0.8100, 0.3716, 0.4096, 0.1976],
           [0.6958, 0.8844, 0.6081, 0.8315]],
          [[0.0494, 0.9343, 0.5955, 0.3830],
           [0.5404, 0.3464, 0.9378, 0.6200]]],
         [[[0.7462, 0.6653, 0.5679, 0.4891],
           [0.5387, 0.1655, 0.3565, 0.0471]],
          [[0.8335, 0.2799, 0.5031, 0.2947],
           [0.1402, 0.0318, 0.7636, 0.1346]],
          [[0.6333, 0.9344, 0.1376, 0.9938],
           [0.8924, 0.2872, 0.6692, 0.2944]],
          [[0.9897, 0.6915, 0.3154, 0.1733],
           [0.8645, 0.3513, 0.3064, 0.0767]],
          [[0.8117, 0.2366, 0.4838, 0.7881],
           [0.3718, 0.4945, 0.9511, 0.0864]]],
         [[[2.42049104, 0.03443088, -0.60793706, -0.05436271],
           [2.42210631, 0.03546578, -0.60679895, -0.05357488]],
          [[2.41907674, 0.0336104, -0.60892977, -0.05490462],
           [2.42216881, 0.03586554, -0.6067524, -0.05289126]],
          [[2.42205716, 0.03488046, -0.60683681, -0.05460596],
           [2.42240309, 0.0354595, -0.60659063, -0.05378816]]],
         True),
    )

    for activation, batch_first in product(('gelu', F.gelu, nn.GELU()), (True, False)):
        def perm_fn(x):
            return x.transpose(1, 0) if batch_first else x

        model = nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward, dropout,
                                           activation, batch_first=batch_first)

        # set constant weights of the model
        for param in model.parameters():
            count = param.data.view(-1).size(0)
            constant = torch.cos(torch.arange(0, count).float().view(param.data.shape))
            param.data.copy_(constant)

        # deterministic inputs
        for decoder_data, memory_data, ref_data, permute in cases:
            decoder_input = torch.tensor(decoder_data)
            memory_input = torch.tensor(memory_data)
            ref_output = torch.tensor(ref_data)
            if permute:
                decoder_input = perm_fn(decoder_input)
                memory_input = perm_fn(memory_input)
                ref_output = perm_fn(ref_output)
            result = model(decoder_input, memory_input)
            torch.testing.assert_close(result, ref_output, rtol=1e-5, atol=0)
| |
@skipIfRocm(msg='Large numerical errors')
def test_transformerdecoder(self):
    # this is a deterministic test for TransformerDecoder
    # A decoder layer with constant cos(0..n-1) weights is stacked 1, 2 and 6
    # times (with and without a final LayerNorm) and the outputs are compared
    # against hard-coded references, for relu and gelu activations and both
    # batch_first settings. Runs on CUDA when available, otherwise on CPU.
    def get_a_test_layer(use_cuda, activation, batch_first=False):
        d_model = 4
        nhead = 2
        dim_feedforward = 16
        dropout = 0.0
        layer_device = torch.device("cuda" if use_cuda else "cpu")

        layer = nn.TransformerDecoderLayer(
            d_model,
            nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
            batch_first=batch_first).to(layer_device)

        with torch.no_grad():
            # set constant weights of the model
            for param in layer.parameters():
                count = param.data.view(-1).size(0)
                constant = torch.cos(torch.arange(0, count).float().view(param.data.shape))
                param.data.copy_(constant)

        return layer

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Fixtures, written in the batch_first=False layout.
    single_decoder = [[[20., 30., 40., 50.]]]
    single_memory = [[[60., 70., 80., 90.]]]
    pair_a = [[[9., 10., 11., 12.]],
              [[11., 12., 13., 14.]]]
    pair_b = [[[1., 2., 3., 4.]],
              [[5., 6., 7., 8.]]]
    big_decoder = [[[0.4517, 0.6793, 0.5313, 0.0034],
                    [0.2678, 0.3677, 0.4459, 0.7166]],
                   [[0.8100, 0.3716, 0.4096, 0.1976],
                    [0.6958, 0.8844, 0.6081, 0.8315]],
                   [[0.0494, 0.9343, 0.5955, 0.3830],
                    [0.5404, 0.3464, 0.9378, 0.6200]]]
    big_memory = [[[0.7462, 0.6653, 0.5679, 0.4891],
                   [0.5387, 0.1655, 0.3565, 0.0471]],
                  [[0.8335, 0.2799, 0.5031, 0.2947],
                   [0.1402, 0.0318, 0.7636, 0.1346]],
                  [[0.6333, 0.9344, 0.1376, 0.9938],
                   [0.8924, 0.2872, 0.6692, 0.2944]],
                  [[0.9897, 0.6915, 0.3154, 0.1733],
                   [0.8645, 0.3513, 0.3064, 0.0767]],
                  [[0.8117, 0.2366, 0.4838, 0.7881],
                   [0.3718, 0.4945, 0.9511, 0.0864]]]

    # Reference outputs for the big fixture.
    relu_ref_unmasked = [[[2.430065, 0.027862, -0.601136, -0.073096],
                          [2.431935, 0.028907, -0.599809, -0.072488]],
                         [[2.428457, 0.027053, -0.602275, -0.073462],
                          [2.431970, 0.029387, -0.599789, -0.071621]],
                         [[2.431934, 0.028196, -0.599802, -0.073809],
                          [2.432306, 0.028858, -0.599542, -0.072846]]]
    relu_ref_tgt_masked = [[[2.430025, 0.027643, -0.601164, -0.073476],
                            [2.4323, 0.029375, -0.599553, -0.071881]],
                           [[2.428523, 0.026838, -0.602226, -0.07391],
                            [2.432634, 0.029842, -0.599318, -0.071253]],
                           [[2.432278, 0.028152, -0.599555, -0.074139],
                            [2.432659, 0.029244, -0.599294, -0.072382]]]
    relu_ref_memory_masked = [[[2.429757, 0.027358, -0.601351, -0.073816],
                               [2.432692, 0.028583, -0.599263, -0.073634]],
                              [[2.428247, 0.02662, -0.602419, -0.074123],
                               [2.432657, 0.029055, -0.599293, -0.072732]],
                              [[2.431515, 0.027687, -0.600096, -0.074459],
                               [2.433075, 0.028543, -0.598987, -0.073985]]]
    six_layer_ref = [[[2.42794, 0.026164, -0.60263, -0.0747591],
                      [2.43113, 0.0279516, -0.600376, -0.0736896]],
                     [[2.42794, 0.026164, -0.60263, -0.0747591],
                      [2.43113, 0.0279516, -0.600376, -0.0736896]],
                     [[2.42794, 0.026164, -0.60263, -0.0747591],
                      [2.43113, 0.0279516, -0.600376, -0.0736896]]]
    six_layer_norm_ref = [[[1.69559, -0.357291, -0.894741, -0.443553],
                           [1.69571, -0.357363, -0.894154, -0.444196]],
                          [[1.69559, -0.357291, -0.894741, -0.443553],
                           [1.69571, -0.357363, -0.894154, -0.444196]],
                          [[1.69559, -0.357291, -0.894741, -0.443553],
                           [1.69571, -0.357363, -0.894154, -0.444196]]]
    gelu_ref_unmasked = [[[2.42049104, 0.03443088, -0.60793706, -0.05436271],
                          [2.42210631, 0.03546578, -0.60679895, -0.05357488]],
                         [[2.41907674, 0.0336104, -0.60892977, -0.05490462],
                          [2.42216881, 0.03586554, -0.6067524, -0.05289126]],
                         [[2.42205716, 0.03488046, -0.60683681, -0.05460596],
                          [2.42240309, 0.0354595, -0.60659063, -0.05378816]]]

    def check(result, ref_output, atol):
        # Shapes must match exactly; values are compared with rtol=1e-7.
        self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
        torch.testing.assert_close(result, ref_output, rtol=1e-7, atol=atol)

    for batch_first in (False, True):
        def perm_fn(x):
            return x.transpose(1, 0) if batch_first else x

        def on_device(data, permute=True):
            # Build a tensor from nested lists, optionally swap the first two
            # dims for the current layout, and move it to the test device.
            tensor = torch.tensor(data)
            if permute:
                tensor = perm_fn(tensor)
            return tensor.to(device)

        # ---- relu activation ----
        activation = F.relu
        decoder_layer = get_a_test_layer(use_cuda=use_cuda, activation=activation,
                                         batch_first=batch_first)

        model = nn.TransformerDecoder(decoder_layer, 1).to(device)

        # deterministic input
        result = model(on_device(single_decoder, permute=False),
                       on_device(single_memory, permute=False))
        check(result,
              on_device([[[2.314351, 0.094805, -0.671322, 0.101977]]], permute=False),
              atol=1e-3)

        # deterministic input
        result = model(on_device(pair_a), on_device([[[1., 2., 3., 4.]]]))
        check(result,
              on_device([[[2.422245, 0.051716, -0.606338, -0.024756]],
                         [[2.422245, 0.051716, -0.606338, -0.024756]]]),
              atol=1e-4)

        # deterministic input
        result = model(on_device(pair_b), on_device(pair_a))
        check(result,
              on_device([[[2.343536, 0.085561, -0.654954, 0.074991]],
                         [[2.343536, 0.085561, -0.654954, 0.074991]]]),
              atol=1e-4)

        # deterministic input
        decoder_input = on_device(big_decoder)
        memory_input = on_device(big_memory)
        check(model(decoder_input, memory_input), on_device(relu_ref_unmasked), atol=1e-5)

        # key_padding_mask
        key_padding_mask = torch.zeros(2, 3).to(device) == 1
        result = model(decoder_input, memory_input,
                       tgt_key_padding_mask=key_padding_mask)
        check(result, on_device(relu_ref_unmasked), atol=1e-5)

        # key_padding_mask
        key_padding_mask[0, 2] = 1
        key_padding_mask[1, 1] = 1
        key_padding_mask[1, 2] = 1
        result = model(decoder_input, memory_input,
                       tgt_key_padding_mask=key_padding_mask)
        check(result, on_device(relu_ref_tgt_masked), atol=1e-5)

        # memory_key_padding_mask
        key_padding_mask = torch.zeros(2, 5).to(device) == 1
        result = model(decoder_input, memory_input,
                       memory_key_padding_mask=key_padding_mask)
        check(result, on_device(relu_ref_unmasked), atol=1e-5)

        # memory_key_padding_mask
        key_padding_mask[0, 4] = 1
        key_padding_mask[1, 3] = 1
        key_padding_mask[1, 4] = 1
        result = model(decoder_input, memory_input,
                       memory_key_padding_mask=key_padding_mask)
        check(result, on_device(relu_ref_memory_masked), atol=1e-5)

        # multiple layers no norm
        model = nn.TransformerDecoder(decoder_layer, 2).to(device)

        # deterministic input
        result = model(on_device(single_decoder, permute=False),
                       on_device(single_memory, permute=False))
        check(result,
              on_device([[[2.31316, 0.0950293, -0.671995, 0.102802]]], permute=False),
              atol=1e-3)

        # multiple layers no norm
        model = nn.TransformerDecoder(decoder_layer, 6).to(device)

        # deterministic input
        result = model(on_device(big_decoder), on_device(big_memory))
        check(result, on_device(six_layer_ref), atol=1e-5)

        # multiple layers with norm
        # d_model = 4
        norm = nn.LayerNorm(4)
        model = nn.TransformerDecoder(decoder_layer, 2, norm=norm).to(device)

        # deterministic input
        result = model(on_device(single_decoder, permute=False),
                       on_device(single_memory, permute=False))
        check(result,
              on_device([[[1.66166, -0.326986, -1.01466, -0.320017]]], permute=False),
              atol=1e-3)

        # multiple layers with norm
        model = nn.TransformerDecoder(decoder_layer, 6, norm=norm).to(device)

        # deterministic input
        result = model(on_device(big_decoder), on_device(big_memory))
        check(result, on_device(six_layer_norm_ref), atol=1e-5)

        # ---- gelu activation test cases ----
        activation = "gelu"
        decoder_layer = get_a_test_layer(use_cuda=use_cuda, activation=activation,
                                         batch_first=batch_first)

        model = nn.TransformerDecoder(decoder_layer, 1).to(device)

        # deterministic input
        result = model(on_device(single_decoder, permute=False),
                       on_device(single_memory, permute=False))
        check(result,
              on_device([[[2.306435, 0.095946, -0.675796, 0.10687]]], permute=False),
              atol=1e-3)

        # deterministic input
        result = model(on_device(pair_a), on_device([[[1., 2., 3., 4.]]]))
        check(result,
              on_device([[[2.415448, 0.054389, -0.610932, -0.0156613]],
                         [[2.415448, 0.054389, -0.610932, -0.0156613]]]),
              atol=1e-4)

        # deterministic input
        result = model(on_device(pair_b), on_device(pair_a))
        check(result,
              on_device([[[2.338531, 0.087709, -0.65776, 0.080646]],
                         [[2.338531, 0.087709, -0.65776, 0.080646]]]),
              atol=1e-4)

        # deterministic input
        result = model(on_device(big_decoder), on_device(big_memory))
        check(result, on_device(gelu_ref_unmasked), atol=1e-5)
| |
@unittest.skipIf(not (TEST_CUDNN and TEST_MULTIGPU), 'CUDNN or multi-gpu not available')
def test_cudnn_rnn_dropout_states_device(self):
    # Smoke test: a two-layer RNN with dropout is moved to GPU index 1 and run
    # forward once on inputs living on that same device; it only has to finish
    # without raising.
    gpu_index = 1
    net = nn.RNN(10, 20, num_layers=2, dropout=.5)
    inputs = torch.randn(5, 4, 10).cuda(gpu_index)
    net.cuda(gpu_index)
    initial_hidden = torch.randn(2, 4, 20).cuda(gpu_index)
    net(inputs, initial_hidden)
| |
| def test_cudnn_forward_exception(self): |
| rnns = [ |
| (nn.LSTM(10, 20, batch_first=True), (torch.zeros(1, 2, 19), torch.zeros(1, 2, 19))), |
| (nn.LSTM(10, 20, batch_first=True, proj_size=10), (torch.zeros(1, 2, 19), torch.zeros(1, 2, 19))), |
| (nn.GRU(10, 20, batch_first=True), torch.zeros(1, 2, 19)), |
| (nn.RNN(10, 20, batch_first=True), torch.zeros(1, 2, 19)), |
| ] |
| x_wrong = torch.randn(2, 3, 3) |
| x_right = torch.randn(2, 3, 10) |
| for rnn, hidden in rnns: |
| self.assertRaisesRegex(RuntimeError, "Expected hidden.*size.*got", rnn, x_right, hidden) |
| self.assertRaisesRegex(RuntimeError, re.escape("input.size(-1) must be equal to input_size"), rnn, x_wrong) |
| |
@unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
@skipIfRocm
def test_cudnn_weight_format(self):
    """Compare results before and after one tracked tensor is re-pointed at a clone.

    For each module the forward output and all gradients are recorded, the
    tensor at index 4 of the tracked list is given a cloned storage via
    ``set_``, and two further forward/backward passes must reproduce the
    recorded values. Only the very first such pass (across all modules) is
    required to emit the non-contiguous-weights warning.
    """
    def snapshot_and_clear_grads(tensors):
        # Copy every gradient first, then zero them all for the next pass.
        copies = [t.grad.data.clone() for t in tensors]
        for t in tensors:
            t.grad.data.zero_()
        return copies

    modules = [
        nn.LSTM(10, 20, batch_first=True),
        nn.LSTM(10, 20, batch_first=True, proj_size=10),
        nn.GRU(10, 20, batch_first=True),
        nn.RNN(10, 20, batch_first=True)
    ]
    expect_warning = True
    for net in modules:
        net.cuda()
        inp = torch.randn(5, 4, 10, requires_grad=True, device="cuda")
        state = torch.randn(1, 5, 20, requires_grad=True, device="cuda")
        tracked = [inp, state] + list(net.parameters())
        if isinstance(net, nn.LSTM):
            # LSTM with projections has different hx size
            if net.proj_size > 0:
                state = torch.randn(1, 5, 10, requires_grad=True, device="cuda")
                tracked[1] = state
            cell = torch.randn(1, 5, 20, requires_grad=True, device="cuda")
            tracked[2:2] = [cell]
            state = (state, cell)

        reference = net(inp, state)
        reference[0].sum().backward()
        reference_grads = snapshot_and_clear_grads(tracked)

        # Weights will no longer view onto the same chunk of memory
        detached = tracked[4]
        cloned_data = detached.data.clone()
        with torch.no_grad():
            detached.set_(cloned_data)

        for _ in range(2):
            with warnings.catch_warnings(record=True) as caught:
                rerun = net(inp, state)
            if expect_warning:
                self.assertEqual(len(caught), 1)
                self.assertIn('weights are not part of single contiguous chunk of memory', caught[0].message.args[0])
                expect_warning = False
                warnings.resetwarnings()
            rerun[0].sum().backward()
            rerun_grads = snapshot_and_clear_grads(tracked)
            self.assertEqual(reference, rerun)
            self.assertEqual(rerun_grads, reference_grads)

        # Make sure these still share storage
        cloned_data[:] = 4
        self.assertEqual(cloned_data, tracked[4].data)
| |
@unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
def test_cudnn_weight_tying(self):
    """Check CUDA/CPU agreement for bidirectional modules with a shared bias.

    ``bias_ih_l0_reverse`` is made the same object as ``bias_ih_l0``; after one
    SGD step the CUDA forward output must equal the output of the same module
    moved back to the CPU.
    """
    modules = [
        nn.LSTM(10, 20, batch_first=True, bidirectional=True),
        nn.LSTM(10, 20, batch_first=True, bidirectional=True, proj_size=10),
        nn.GRU(10, 20, batch_first=True, bidirectional=True),
        nn.RNN(10, 20, batch_first=True, bidirectional=True)
    ]
    for net in modules:
        is_lstm = isinstance(net, nn.LSTM)
        net.bias_ih_l0_reverse = net.bias_ih_l0
        net.cuda()
        inp = torch.randn(5, 4, 10, requires_grad=True, device="cuda")
        state = torch.randn(2, 5, 20, requires_grad=True, device="cuda")
        tracked = [inp, state] + list(net.parameters())
        optimizer = torch.optim.SGD(net.parameters(), lr=0.1)
        optimizer.zero_grad()
        if is_lstm:
            # LSTM with projections has different hx size
            if net.proj_size > 0:
                state = torch.randn(2, 5, 10, requires_grad=True, device="cuda")
                tracked[1] = state
            cell = torch.randn(2, 5, 20, requires_grad=True, device="cuda")
            tracked[2:2] = [cell]
            state = (state, cell)

        # Warnings raised by the forward passes are captured and discarded.
        with warnings.catch_warnings(record=True):
            first_pass = net(inp, state)
        first_pass[0].sum().backward()

        optimizer.step()
        with warnings.catch_warnings(record=True):
            output_cuda = net(inp, state)

        net.cpu()
        if is_lstm:
            state = (state[0].cpu(), state[1].cpu())
        else:
            state = state.cpu()
        output_cpu = net(inp.cpu(), state)
        self.assertEqual(output_cuda, output_cpu)
| |
| |
| def test_transformer_args_check(self): |
| model_name = 'Transformer' |
| d_model = 128 |
| nhead = 4 |
| num_encoder_layers = 2 |
| num_decoder_layers = 3 |
| dim_feedforward = 65 |
| dropout = 0.3 |
| bsz = 3 |
| seq_len = 35 |
| tgt_len = 15 |
| activations = [F.relu, F.gelu] |
| |
| wrong_bsz = 7 |
| wrong_d_model = 63 |
| wrong_nhead = 5 |
| wrong_activation = "abc" |
| |
| def test(encoder_input_shape, decoder_input_shape, |
| src_mask_len=None, tgt_mask_len=None, memory_mask_size=None, |
| src_key_padding_mask_size=None, tgt_key_padding_mask_size=None, |
| memory_key_padding_mask_size=None, |
| src_is_causal=False, tgt_is_causal=False, |
| memory_is_causal=False): |
| |
| encoder_input = torch.randn(encoder_input_shape) |
| decoder_input = torch.randn(decoder_input_shape) |
| model = getattr(nn, model_name)(d_model, nhead, num_encoder_layers, |
| num_decoder_layers, dim_feedforward, dropout) |
| |
| if src_mask_len is not None: |
| src_mask = model.generate_square_subsequent_mask(src_mask_len) |
| else: |
| src_mask = None |
| |
| if tgt_mask_len is not None: |
| tgt_mask = model.generate_square_subsequent_mask(tgt_mask_len) |
| else: |
| tgt_mask = None |
| |
| if memory_mask_size is not None: |
| memory_task = torch.rand(memory_mask_size) |
| else: |
| memory_task = None |
| |
| if src_key_padding_mask_size is not None: |
| src_key_padding_mask = torch.rand(src_key_padding_mask_size) >= 0.5 |
| else: |
| src_key_padding_mask = None |
| |
| if tgt_key_padding_mask_size is not None: |
| tgt_key_padding_mask = torch.rand(tgt_key_padding_mask_size) >= 0.5 |
| else: |
| tgt_key_padding_mask = None |
| |
| if memory_key_padding_mask_size is not None: |
| memory_key_padding_mask = torch.rand(memory_key_padding_mask_size) >= 0.5 |
| else: |
| memory_key_padding_mask = None |
| |
| with self.assertRaises(RuntimeError): |
| model(encoder_input, decoder_input, |
| src_mask=src_mask, |
| tgt_mask=tgt_mask, |
| memory_mask=memory_task, |
| src_key_padding_mask=src_key_padding_mask, |
| tgt_key_padding_mask=tgt_key_padding_mask, |
| memory_key_padding_mask=memory_key_padding_mask, |
| src_is_causal=src_is_causal, |
| tgt_is_causal=tgt_is_causal, |
| memory_is_causal=memory_is_causal) |
| |
| |
| correct_encoder_input_shape = (seq_len, bsz, d_model) |
| correct_decoder_input_shape = (tgt_len, bsz, d_model) |
| |
| def update_shape(shape, dim, new_dim_size): |
| new_shape = list(shape) |
| new_shape[dim] = new_dim_size |
| return tuple(new_shape) |
| |
| # Incorrect encoder_input batch size |
| encoder_input_shape = update_shape(correct_encoder_input_shape, 1, wrong_bsz) |
| decoder_input_shape = correct_decoder_input_shape |
| test(encoder_input_shape, decoder_input_shape) |
| |
| # Incorrect decoder_input batch size |
| encoder_input_shape = correct_encoder_input_shape |
| decoder_input_shape = update_shape(correct_decoder_input_shape, 1, wrong_bsz) |
| test(encoder_input_shape, decoder_input_shape) |
| |
| # Incorrect encoder_input input size |
| encoder_input_shape = update_shape(correct_encoder_input_shape, 2, wrong_d_model) |
| decoder_input_shape = correct_decoder_input_shape |
| test(encoder_input_shape, decoder_input_shape) |
| |
| # Incorrect decoder_input input size |
| encoder_input_shape = correct_encoder_input_shape |
| decoder_input_shape = update_shape(correct_decoder_input_shape, 2, wrong_d_model) |
| test(encoder_input_shape, decoder_input_shape) |
| |
| # Incorrect nhead |
| encoder_input_shape = correct_encoder_input_shape |
| decoder_input_shape = correct_decoder_input_shape |
| with self.assertRaises(AssertionError): |
| model = getattr(nn, model_name)(d_model, wrong_nhead, num_encoder_layers, |
| num_decoder_layers, dim_feedforward, dropout) |
| |
| # Incorrect src_mask |
| encoder_input_shape = correct_encoder_input_shape |
| decoder_input_shape = correct_decoder_input_shape |
| wrong_src_mask_size = seq_len + 1 |
| test(encoder_input_shape, decoder_input_shape, src_mask_len=wrong_src_mask_size) |
| |
| # Incorrect tgt_mask |
| encoder_input_shape = correct_encoder_input_shape |
| decoder_input_shape = correct_decoder_input_shape |
| wrong_tgt_mask_size = tgt_len + 1 |
| test(encoder_input_shape, decoder_input_shape, tgt_mask_len=wrong_tgt_mask_size) |
| |
| # Incorrect memory_mask |
| encoder_input_shape = correct_encoder_input_shape |
| decoder_input_shape = correct_decoder_input_shape |
| wrong_tgt_mask_size = tgt_len + 1 |
| test(encoder_input_shape, decoder_input_shape, |
| memory_mask_size=(wrong_tgt_mask_size, wrong_src_mask_size)) |
| |
| # Incorrect src_key_padding_mask |
| encoder_input_shape = correct_encoder_input_shape |
| decoder_input_shape = correct_decoder_input_shape |
| with self.assertRaises(AssertionError): |
| test(encoder_input_shape, decoder_input_shape, |
| src_key_padding_mask_size=(wrong_bsz, wrong_src_mask_size)) |
| |
| # Incorrect tgt_key_padding_mask |
| encoder_input_shape = correct_encoder_input_shape |
| decoder_input_shape = correct_decoder_input_shape |
| with self.assertRaises(AssertionError): |
| test(encoder_input_shape, decoder_input_shape, |
| tgt_key_padding_mask_size=(wrong_bsz, wrong_tgt_mask_size)) |
| |
| # Incorrect memory_key_padding_mask |
| encoder_input_shape = correct_encoder_input_shape |
| decoder_input_shape = correct_decoder_input_shape |
| with self.assertRaises(AssertionError): |
| test(encoder_input_shape, decoder_input_shape, |
| memory_key_padding_mask_size=(wrong_bsz, wrong_src_mask_size)) |
| |
| # Correct activations |
| for activation in activations: |
| model = getattr(nn, model_name)(d_model, nhead, num_encoder_layers, num_decoder_layers, |
| dim_feedforward, dropout, activation) |
| # Incorrect activation |
| with self.assertRaises(RuntimeError): |
| model = getattr(nn, model_name)(d_model, nhead, num_encoder_layers, num_decoder_layers, |
| dim_feedforward, dropout, wrong_activation) |
| |
| |
| def test_transformer_layer_args_check(self): |
| model_names = ['TransformerEncoderLayer', 'TransformerDecoderLayer'] |
| d_model = 128 |
| nhead = 4 |
| dim_feedforward = 65 |
| dropout = 0.3 |
| bsz = 3 |
| seq_len = 35 |
| tgt_len = 15 |
| activations = [F.relu, F.gelu] |
| |
| wrong_activation = "abc" |
| |
| encoder_input_shape = (seq_len, bsz, d_model) |
| decoder_input_shape = (tgt_len, bsz, d_model) |
| |
| encoder_input = torch.randn(encoder_input_shape) |
| decoder_input = torch.randn(decoder_input_shape) |
| |
| for model_name in model_names: |
| for activation in activations: |
| model = getattr(nn, model_name)(d_model, nhead, dim_feedforward, |
| dropout, activation) |
| # Incorrect activation |
| for model_name in model_names: |
| with self.assertRaises(RuntimeError): |
| model = getattr(nn, model_name)(d_model, nhead, dim_feedforward, |
| dropout, wrong_activation) |
| |
| def test_rnn_args_check(self): |
| input_size = 3 |
| hidden_size = 5 |
| num_layers = 2 |
| batch_size = 4 |
| seq_len = 6 |
| num_directions = 1 |
| bad_size = 7 # prime number so that no size can divide it. |
| |
| def test(input_shape, hidden_shape, mode): |
| for input, hidden in get_inputs(input_shape, hidden_shape, mode): |
| model = getattr(nn, mode)(input_size, hidden_size, num_layers) |
| self.assertRaises(RuntimeError, lambda: model(input, hidden)) |
| |
| correct_input_shape = (seq_len, batch_size, input_size) |
| correct_hidden_shape = (num_layers * num_directions, batch_size, hidden_size) |
| |
| def update_shape(shape, dim, new_dim_size): |
| new_shape = list(shape) |
| new_shape[dim] = new_dim_size |
| return tuple(new_shape) |
| |
| def get_inputs(input_shape, hidden_shape, mode): |
| '''returns list( tuple(input, hidden) ) |
| where input, hidden are inputs to a model''' |
| input = torch.randn(input_shape) |
| hidden = torch.randn(hidden_shape) |
| if mode != 'LSTM': |
| return [(input, hidden)] |
| if hidden_shape == correct_hidden_shape: |
| return [(input, (hidden, hidden))] |
| good_hidden = torch.randn(correct_hidden_shape) |
| return [ |
| (input, (hidden, good_hidden)), |
| (input, (good_hidden, hidden)), |
| ] |
| |
| rnn_modes = ['RNN', 'GRU', 'LSTM'] |
| for mode in rnn_modes: |
| # Incorrect input batch size |
| input_shape = update_shape(correct_input_shape, 1, bad_size) |
| hidden_shape = correct_hidden_shape |
| test(input_shape, hidden_shape, mode) |
| |
| # Incorrect hidden batch size |
| input_shape = correct_input_shape |
| hidden_shape = update_shape(correct_hidden_shape, 1, bad_size) |
| test(input_shape, hidden_shape, mode) |
| |
| # Incorrect input size |
| input_shape = update_shape(correct_input_shape, 2, bad_size) |
| hidden_shape = correct_hidden_shape |
| test(input_shape, hidden_shape, mode) |
| |
| # Incorrect hidden size |
| input_shape = correct_input_shape |
| hidden_shape = update_shape(correct_hidden_shape, 2, bad_size) |
| test(input_shape, hidden_shape, mode) |
| |
| # Incorrect hidden[0] |
| input_shape = correct_input_shape |
| hidden_shape = update_shape(correct_hidden_shape, 0, bad_size) |
| test(input_shape, hidden_shape, mode) |
| |
| def test_projections_lstm_args_check(self): |
| input_size = 3 |
| hidden_size = 5 |
| proj_size = 2 |
| num_layers = 2 |
| batch_size = 4 |
| seq_len = 6 |
| num_directions = 1 |
| bad_size = 7 # prime number so that no size can divide it. |
| |
| def test(input_shape, hidden_h_shape, hidden_c_shape): |
| for input, hidden in get_inputs(input_shape, hidden_h_shape, hidden_c_shape): |
| model = nn.LSTM(input_size, hidden_size, num_layers, proj_size=proj_size) |
| self.assertRaises(RuntimeError, lambda: model(input, hidden)) |
| |
| correct_input_shape = (seq_len, batch_size, input_size) |
| correct_hidden_h_shape = (num_layers * num_directions, batch_size, proj_size) |
| correct_hidden_c_shape = (num_layers * num_directions, batch_size, hidden_size) |
| |
| def update_shape(shape, dim, new_dim_size): |
| new_shape = list(shape) |
| new_shape[dim] = new_dim_size |
| return tuple(new_shape) |
| |
| def get_inputs(input_shape, hidden_h_shape, hidden_c_shape): |
| '''returns list( tuple(input, hidden) ) |
| where input, hidden are inputs to a model''' |
| input = torch.randn(input_shape) |
| hidden_h = torch.randn(hidden_h_shape) |
| hidden_c = torch.randn(hidden_c_shape) |
| return [(input, (hidden_h, hidden_c))] |
| |
| # Incorrect input batch size |
| input_shape = update_shape(correct_input_shape, 1, bad_size) |
| test(input_shape, correct_hidden_h_shape, correct_hidden_c_shape) |
| |
| # Incorrect hidden batch size |
| input_shape = correct_input_shape |
| hidden_h_shape = update_shape(correct_hidden_h_shape, 1, bad_size) |
| hidden_c_shape = update_shape(correct_hidden_c_shape, 1, bad_size) |
| test(input_shape, hidden_h_shape, hidden_c_shape) |
| |
| # Incorrect input size |
| input_shape = update_shape(correct_input_shape, 2, bad_size) |
| test(input_shape, correct_hidden_h_shape, correct_hidden_c_shape) |
| |
| # Incorrect hidden size |
| input_shape = correct_input_shape |
| hidden_h_shape = update_shape(correct_hidden_h_shape, 2, bad_size) |
| hidden_c_shape = update_shape(correct_hidden_c_shape, 2, bad_size) |
| test(input_shape, hidden_h_shape, hidden_c_shape) |
| |
| # Incorrect hidden[0] |
| input_shape = correct_input_shape |
| hidden_h_shape = update_shape(correct_hidden_h_shape, 0, bad_size) |
| hidden_c_shape = update_shape(correct_hidden_c_shape, 0, bad_size) |
| test(input_shape, hidden_h_shape, hidden_c_shape) |
| |
| # Incorrect proj size = hidden size |
| input_shape = correct_input_shape |
| hidden_h_shape = update_shape(correct_hidden_h_shape, 0, hidden_size) |
| hidden_c_shape = correct_hidden_c_shape |
| test(input_shape, hidden_h_shape, hidden_c_shape) |
| |
| # Incorrect proj size != hidden size |
| input_shape = correct_input_shape |
| hidden_h_shape = update_shape(correct_hidden_h_shape, 0, bad_size) |
| hidden_c_shape = correct_hidden_c_shape |
| test(input_shape, hidden_h_shape, hidden_c_shape) |
| |
| # Incorrect cell size != hidden size |
| input_shape = correct_input_shape |
| hidden_h_shape = correct_hidden_h_shape |
| hidden_c_shape = update_shape(correct_hidden_c_shape, 0, bad_size) |
| test(input_shape, hidden_h_shape, hidden_c_shape) |
| |
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
def test_rnn_check_device(self):
    """Check the errors raised when input, parameters and hidden state sit on different devices."""
    import copy
    input_size = 3
    hidden_size = 5
    num_layers = 2
    batch_size = 4
    seq_len = 6
    num_directions = 1

    input_shape = (seq_len, batch_size, input_size)
    hidden_shape = (num_layers * num_directions, batch_size, hidden_size)
    param_mismatch = "Input and parameter tensors are not at the same device"
    hidden_mismatch = "Input and hidden tensors are not at the same device"

    for mode in ('RNN', 'GRU', 'LSTM'):
        is_lstm = mode == 'LSTM'
        cpu_model = getattr(nn, mode)(input_size, hidden_size, num_layers)
        gpu_model = copy.deepcopy(cpu_model).to('cuda:0')
        sample = torch.randn(input_shape)
        state = torch.randn(hidden_shape)

        # input and weights are not at the same device
        with self.assertRaisesRegex(RuntimeError, param_mismatch):
            cpu_model(sample.to('cuda:0'))
        with self.assertRaisesRegex(RuntimeError, param_mismatch):
            gpu_model(sample)

        # input and hiddens are not at the same device
        with self.assertRaisesRegex(RuntimeError, hidden_mismatch):
            if is_lstm:
                cpu_model(sample, (state.to('cuda:0'), state.to('cuda:0')))
            else:
                cpu_model(sample, state.to('cuda:0'))
        with self.assertRaisesRegex(RuntimeError, hidden_mismatch):
            if is_lstm:
                gpu_model(sample.to('cuda:0'), (state, state))
            else:
                gpu_model(sample.to('cuda:0'), state)

        # hidden tensors are not at the same CUDA device
        if is_lstm:
            with self.assertRaisesRegex(RuntimeError, hidden_mismatch):
                cpu_model(sample.to('cuda:0'), (state.to('cuda:0'), state.to('cuda:1')))
| |
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
def test_projections_lstm_check_device(self):
    """Check device-mismatch errors for a CPU LSTM built with ``proj_size``."""
    input_size = 3
    hidden_size = 5
    proj_size = 2
    num_layers = 2
    batch_size = 4
    seq_len = 6
    num_directions = 1

    layer_dim = num_layers * num_directions
    param_mismatch = "Input and parameter tensors are not at the same device"
    hidden_mismatch = "Input and hidden tensors are not at the same device"

    lstm = nn.LSTM(input_size, hidden_size, num_layers, proj_size=proj_size)
    sample = torch.randn((seq_len, batch_size, input_size))
    state_h = torch.randn((layer_dim, batch_size, proj_size))
    state_c = torch.randn((layer_dim, batch_size, hidden_size))

    # input and weights are not at the same device
    with self.assertRaisesRegex(RuntimeError, param_mismatch):
        lstm(sample.to('cuda:0'))

    # input and hiddens are not at the same device
    with self.assertRaisesRegex(RuntimeError, hidden_mismatch):
        lstm(sample, (state_h.to('cuda:0'), state_c.to('cuda:0')))

    # hidden tensors are not at the same CUDA device
    with self.assertRaisesRegex(RuntimeError, hidden_mismatch):
        lstm(sample.to('cuda:0'), (state_h.to('cuda:0'), state_c.to('cuda:1')))
| |
def test_rnn_initial_hidden_state(self):
    """Check that omitting the hidden state gives the same result as passing zeros."""
    for mode in ('RNN', 'GRU', 'LSTM'):
        net = getattr(nn, mode)(30, 20, 2)
        sample = torch.randn(10, 32, 30)
        zeros = torch.zeros(2, 32, 20)
        # LSTM takes a tuple of two state tensors; the other modes take one tensor.
        explicit_state = (zeros, zeros) if mode == 'LSTM' else zeros

        out_explicit, final_explicit = net(sample, explicit_state)
        out_default, final_default = net(sample)
        self.assertEqual(out_explicit, out_default)
        self.assertEqual(final_explicit, final_default)
| |
def test_projections_lstm_initial_hidden_state(self):
    """Check that a projected LSTM treats a missing state like an all-zero state."""
    for bidir in (False, True):
        lstm = nn.LSTM(30, 20, 2, bidirectional=bidir, proj_size=10)
        layer_dim = 2 * (2 if bidir else 1)
        sample = torch.randn(10, 32, 30)
        zero_state = (
            torch.zeros(layer_dim, 32, 10),  # h has proj_size features
            torch.zeros(layer_dim, 32, 20),  # c has hidden_size features
        )
        out_explicit, final_explicit = lstm(sample, zero_state)
        out_default, final_default = lstm(sample)
        self.assertEqual(out_explicit, out_default)
        self.assertEqual(final_explicit, final_default)
| |
| def test_projections_errors_on_gru_and_rnn(self): |
| error_msg = "proj_size argument is only supported for LSTM, not RNN or GRU" |
| for mode in ['RNN', 'GRU']: |
| with self.assertRaisesRegex(ValueError, error_msg): |
| rnn = getattr(nn, mode)(30, 20, 2, proj_size=10) |
| |
def _test_RNN_cpu_vs_cudnn(self, dropout, dtype=torch.double):
    """Compare CPU and CUDA results for RNN, LSTM and GRU modules.

    Each configuration is run forward and backward on the CPU and again on
    CUDA using a second module that receives a copy of the CPU module's
    weights. Outputs, final states and all gradients must then agree within
    ``atol=5e-5``.

    Args:
        dropout: dropout value handed to the modules built in the two
            flag-product loops (the ``nonlinearity`` loop does not pass it).
        dtype: dtype used for every module and every generated tensor.
    """

    def forward_backward(cuda, rnn, input_val, grad_output, weights_val, hx_val, grad_hy,
                         cx_val=None, grad_cy=None):
        """Run one forward/backward pass and return outputs, states and gradients.

        ``weights_val`` is copied into ``rnn`` first. When ``cuda`` is true the
        module, the inputs and the incoming gradients are moved to CUDA.
        """
        is_lstm = isinstance(rnn, nn.LSTM)

        for x_layer, y_layer in zip(rnn.all_weights, weights_val):
            for x, y in zip(x_layer, y_layer):
                x.data.copy_(y.data)

        # ``input_var`` is the tensor whose ``.grad`` is read back at the end.
        if isinstance(input_val, rnn_utils.PackedSequence):
            input = rnn_utils.PackedSequence(
                input_val.data.data.requires_grad_(True), input_val.batch_sizes)
            input_var = input.data
        else:
            input = input_val.clone().requires_grad_(True)
            input_var = input
        if is_lstm:
            # Without an explicit cell state, hx_val + 1 is used for it.
            if cx_val is None:
                hx = (hx_val.clone().requires_grad_(True),
                      hx_val.add(1).requires_grad_(True))
            else:
                hx = (hx_val.clone().requires_grad_(True),
                      cx_val.add(1).requires_grad_(True))
        else:
            hx = hx_val.clone().requires_grad_(True)

        if cuda:
            rnn.cuda()
            input_var.data = input_var.data.cuda()
            if is_lstm:
                hx[0].data = hx[0].data.cuda()
                hx[1].data = hx[1].data.cuda()
            else:
                hx.data = hx.data.cuda()
            grad_hy = grad_hy.cuda()
            if grad_cy is not None:
                grad_cy = grad_cy.cuda()
            grad_output = grad_output.cuda()

        output, hy = rnn(input, hx)

        if isinstance(output, rnn_utils.PackedSequence):
            output = output.data

        if is_lstm:
            if grad_cy is None:
                torch.autograd.backward([output, hy[0], hy[1]], [grad_output, grad_hy, grad_hy + 1])
            else:
                torch.autograd.backward([output, hy[0], hy[1]], [grad_output, grad_hy, grad_cy + 1])
        else:
            torch.autograd.backward([output, hy], [grad_output, grad_hy])

        return {'output': output.data,
                'hy': hy[0].data if is_lstm else hy.data,
                'weights': rnn.all_weights,
                'grad_input': input_var.grad.data,
                'grad_hx': hx[0].grad.data if is_lstm else hx.grad.data,
                'cy': hy[1].data if is_lstm else None,
                'grad_cx': hx[1].grad.data if is_lstm else None}

    input_size = 10
    hidden_size = 6
    proj_size = 3
    num_layers = 2
    seq_length = 7
    batch = 6

    def make_noncontig(tensor):
        # Interleave with a zero copy along a new last axis and select the
        # original values back out of it.
        ndim = tensor.dim()
        return torch.stack([tensor.clone().zero_(), tensor], ndim).select(ndim, 1)

    def compare_cpu_gpu(outputs_cpu, outputs_gpu):
        self.assertEqual(list(outputs_cpu.keys()), list(outputs_gpu.keys()))
        for key in outputs_cpu.keys():
            if key != 'weights':
                self.assertEqual(outputs_cpu[key], outputs_gpu[key], atol=5e-5, rtol=0, msg=key)

        # check grad weights separately, as nested dict
        for cpu_layer_weight, gpu_layer_weight in zip(outputs_cpu['weights'], outputs_gpu['weights']):
            for (cpu_weight, gpu_weight) in zip(cpu_layer_weight, gpu_layer_weight):
                self.assertEqual(cpu_weight.grad.data, gpu_weight.grad.data, atol=5e-5, rtol=0)

    for module in (nn.RNN, nn.LSTM, nn.GRU):
        for bias, bidirectional, batch_first, contig, variable_len, lens_as_tensor \
                in product((True, False), repeat=6):

            num_directions = 2 if bidirectional else 1
            if batch_first:
                input_val = torch.randn(batch, seq_length, input_size, dtype=dtype)
                grad_output = torch.randn(batch, seq_length, hidden_size * num_directions, dtype=dtype)
            else:
                input_val = torch.randn(seq_length, batch, input_size, dtype=dtype)
                grad_output = torch.randn(seq_length, batch, hidden_size * num_directions, dtype=dtype)

            hx_val = torch.randn(num_layers * num_directions, batch, hidden_size, dtype=dtype)
            grad_hy = torch.randn(num_layers * num_directions, batch, hidden_size, dtype=dtype)

            if not contig:
                grad_output = make_noncontig(grad_output)
                grad_hy = make_noncontig(grad_hy)
                # Rebind input_val itself so the non-contiguous tensor is the
                # one that is actually used below.
                input_val = make_noncontig(input_val)
                hx_val = make_noncontig(hx_val)

            if variable_len:
                lengths = [7, 5, 5, 2, 1, 1]
                if lens_as_tensor:
                    lengths = torch.tensor(lengths, dtype=torch.long)
                input_val = rnn_utils.pack_padded_sequence(input_val, lengths, batch_first=batch_first)
                grad_output = rnn_utils.pack_padded_sequence(grad_output, lengths, batch_first=batch_first).data

            rnn = module(input_size,
                         hidden_size,
                         num_layers,
                         bias=bias,
                         dropout=dropout,
                         bidirectional=bidirectional,
                         batch_first=batch_first).to(dtype)

            outputs_cpu = forward_backward(
                False, rnn, input_val, grad_output, rnn.all_weights, hx_val, grad_hy)

            rnn_gpu = module(input_size,
                             hidden_size,
                             num_layers,
                             bias=bias,
                             dropout=dropout,
                             bidirectional=bidirectional,
                             batch_first=batch_first).to(dtype)

            outputs_gpu = forward_backward(
                True, rnn_gpu, input_val, grad_output, rnn.all_weights, hx_val, grad_hy)

            compare_cpu_gpu(outputs_cpu, outputs_gpu)

    # NOTE: ``bias`` and ``num_directions`` are not assigned in this loop. They
    # keep the values left by the final iteration of the loop above, where
    # every flag is False (so bias=False and num_directions=1).
    for nonlinearity in ('tanh', 'relu'):
        hx_val = torch.randn(num_layers, batch, hidden_size, dtype=dtype)
        input_val = torch.randn(seq_length, batch, input_size, dtype=dtype)
        grad_output = torch.randn(
            seq_length, batch, hidden_size * num_directions, dtype=dtype)
        grad_hy = torch.randn(
            num_layers * num_directions, batch, hidden_size, dtype=dtype)

        rnn = nn.RNN(input_size, hidden_size, num_layers, bias=bias, nonlinearity=nonlinearity).to(dtype)
        outputs_cpu = forward_backward(False, rnn, input_val, grad_output, rnn.all_weights, hx_val, grad_hy)

        rnn_gpu = nn.RNN(input_size, hidden_size, num_layers, bias=bias, nonlinearity=nonlinearity).to(dtype)
        outputs_gpu = forward_backward(True, rnn_gpu, input_val, grad_output, rnn.all_weights, hx_val, grad_hy)

        compare_cpu_gpu(outputs_cpu, outputs_gpu)

    # checking LSTM with projections
    for bias, bidirectional, batch_first, contig, variable_len, lens_as_tensor \
            in product((True, False), repeat=6):
        num_directions = 2 if bidirectional else 1
        if batch_first:
            input_val = torch.randn(batch, seq_length, input_size, dtype=dtype)
            grad_output = torch.randn(batch, seq_length, proj_size * num_directions, dtype=dtype)
        else:
            input_val = torch.randn(seq_length, batch, input_size, dtype=dtype)
            grad_output = torch.randn(seq_length, batch, proj_size * num_directions, dtype=dtype)

        hx_val = torch.randn(num_layers * num_directions, batch, proj_size, dtype=dtype)
        cx_val = torch.randn(num_layers * num_directions, batch, hidden_size, dtype=dtype)
        grad_hy = torch.randn(num_layers * num_directions, batch, proj_size, dtype=dtype)
        grad_cy = torch.randn(num_layers * num_directions, batch, hidden_size, dtype=dtype)

        if not contig:
            grad_output = make_noncontig(grad_output)
            grad_hy = make_noncontig(grad_hy)
            grad_cy = make_noncontig(grad_cy)
            # Same rebinding as in the first loop: update input_val itself.
            input_val = make_noncontig(input_val)
            hx_val = make_noncontig(hx_val)
            cx_val = make_noncontig(cx_val)

        if variable_len:
            lengths = [7, 5, 5, 2, 1, 1]
            if lens_as_tensor:
                lengths = torch.tensor(lengths, dtype=torch.long)
            input_val = rnn_utils.pack_padded_sequence(input_val, lengths, batch_first=batch_first)
            grad_output = rnn_utils.pack_padded_sequence(grad_output, lengths, batch_first=batch_first).data

        rnn = nn.LSTM(input_size,
                      hidden_size,
                      num_layers,
                      bias=bias,
                      dropout=dropout,
                      bidirectional=bidirectional,
                      batch_first=batch_first,
                      proj_size=proj_size).to(dtype)

        outputs_cpu = forward_backward(
            False, rnn, input_val, grad_output, rnn.all_weights,
            hx_val, grad_hy, cx_val, grad_cy)

        rnn_gpu = nn.LSTM(input_size,
                          hidden_size,
                          num_layers,
                          bias=bias,
                          dropout=dropout,
                          bidirectional=bidirectional,
                          batch_first=batch_first,
                          proj_size=proj_size).to(dtype)

        outputs_gpu = forward_backward(
            True, rnn_gpu, input_val, grad_output, rnn.all_weights,
            hx_val, grad_hy, cx_val, grad_cy)
        compare_cpu_gpu(outputs_cpu, outputs_gpu)
| |
@unittest.skipIf(not TEST_CUDNN, "needs cudnn")
def test_RNN_cpu_vs_cudnn_no_dropout(self):
    """Run the CPU-vs-cuDNN comparison with dropout 0 in double precision."""
    self._test_RNN_cpu_vs_cudnn(0, torch.double)
| |
@unittest.skipIf(not TEST_CUDNN, "needs cudnn")
def test_RNN_cpu_vs_cudnn_with_dropout(self):
    """Run the CPU-vs-cuDNN comparison with dropout 1 and the helper's default dtype."""
    # Because of dropout randomness, can only compare dropout=0 and dropout=1
    full_dropout = 1
    self._test_RNN_cpu_vs_cudnn(full_dropout)
| |
@unittest.skipIf(not TEST_CUDNN, "needs cudnn")
def test_RNN_cudnn_weight_norm(self):
    """Check that adding and removing weight norm leaves LSTM CUDA outputs unchanged.

    The expected output is computed on the CPU before weight norm is applied.
    """
    input_size = 10
    hidden_size = 6
    num_layers = 2
    seq_length = 7
    batch = 6

    def check_weight_norm(module, name):
        # runs on CPU to acquire expected output
        sample = torch.randn(seq_length, batch, input_size)
        expected_output = module(sample)

        # adds weight normalization, then moves everything to CUDA
        normed = torch.nn.utils.weight_norm(module, name=name).cuda()
        sample = sample.cuda()

        # otherwise, subsequent warnings will be hidden, and further tests rely on them
        warnings.simplefilter("always")
        self.assertEqual(normed(sample), expected_output)

        # remove weight norm
        restored = torch.nn.utils.remove_weight_norm(normed, name=name)
        self.assertEqual(restored(sample), expected_output)

    # Each LSTM is constructed immediately before its own check.
    cases = (
        ({}, 'weight_hh_l0'),
        ({'proj_size': 3}, 'weight_hr_l0'),
    )
    for extra_kwargs, weight_name in cases:
        check_weight_norm(nn.LSTM(input_size, hidden_size, num_layers, **extra_kwargs), weight_name)
| |
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
def test_partial_flat_weights(self):
    """Check that an LSTM with one weight attribute deleted can still move to CUDA.

    After the move the weight is restored and the CUDA output must match the
    output computed on the CPU beforehand.
    """
    input_size = 10
    hidden_size = 6
    num_layers = 2

    lstm = nn.LSTM(input_size, hidden_size, num_layers)
    sample = torch.randn(3, 2, 10)
    out_expected = lstm(sample)

    # deletes an attribute of original LSTM
    saved_weight = lstm.weight_hh_l0
    del lstm.weight_hh_l0
    self.assertFalse(hasattr(lstm, "weight_hh_l0"))

    # verifies that moving to CUDA with only some attributes defined
    # does not throw an error
    lstm.cuda()

    # recompute the weight and make sure that module can be used
    lstm.weight_hh_l0 = saved_weight.cuda()
    sample = sample.cuda()

    # otherwise, subsequent warnings will be hidden, and further tests rely on them
    warnings.simplefilter("always")
    cuda_output = lstm(sample)[0]
    self.assertEqual(cuda_output.cpu(), out_expected[0])
| |
@unittest.skipIf(not TEST_CUDNN, "needs cudnn")
@set_default_dtype(torch.double)
def test_RNN_dropout(self):
    """Check output values of a two-layer all-ones ReLU RNN under dropout.

    All weights are filled with 1 and the input is all ones, so every output
    element must be identical; the expected magnitude depends on the dropout
    probability and on whether the module is in training mode.
    """
    # checking the assumption that cuDNN sticks dropout in between
    # RNN layers
    weight_names = ('weight_ih_l0', 'weight_hh_l0', 'weight_ih_l1', 'weight_hh_l1')
    for p, train, cuda in product((0, 0.276, 0.731, 1), (True, False), (True, False)):
        rnn = nn.RNN(10, 1000, 2, bias=False, dropout=p, nonlinearity='relu')
        if cuda:
            rnn.cuda()

        rnn.train(train)
        for weight_name in weight_names:
            getattr(rnn, weight_name).data.fill_(1)

        sample = torch.ones(1, 1, 10)
        state = torch.zeros(2, 1, 1000)
        if cuda:
            sample = sample.cuda()
            state = state.cuda()

        output, hy = rnn(sample, state)
        self.assertEqual(output.data.min(), output.data.max())
        output_val = output.data[0][0][0]
        if p == 0 or not train:
            self.assertEqual(output_val, 10000)
        elif p == 1:
            self.assertEqual(output_val, 0)
        else:
            self.assertGreater(output_val, 8000)
            self.assertLess(output_val, 12000)
            # After undoing the 1 / (1 - p) scaling the value must be within
            # 1e-2 of a multiple of 10.
            denorm_mod = (output_val * (1 - p)) % 10
            self.assertLess(min(denorm_mod, 10 - denorm_mod), 1e-2)

        for layer in (0, 1):
            self.assertEqual(hy[layer].data.min(), hy[layer].data.max())
        self.assertEqual(hy.data[0][0][0], 10)
        self.assertEqual(hy.data[1][0][0], output_val)
| |
@unittest.skipIf(not TEST_CUDNN, "needs cudnn")
@set_default_dtype(torch.double)
def test_error_RNN_seq_len_zero(self):
    """RNN, LSTM and GRU must raise a clear error when the first input dimension is 0."""
    expected_msg = "Expected sequence length to be larger than 0 in RNN"
    for module, bidirectional, device in product(
        (nn.RNN, nn.LSTM, nn.GRU), (True, False), get_all_device_types()
    ):
        empty_seq = torch.ones(0, 10, 5)
        rnn = module(5, 6, bidirectional=bidirectional)
        if device == 'cuda':
            rnn.cuda()
            empty_seq = empty_seq.cuda()

        with self.assertRaisesRegex(RuntimeError, expected_msg):
            rnn(empty_seq)
| |
def test_RNN_input_size_zero(self):
    """Run RNN, LSTM and GRU forward and backward on an input whose second dimension is 0."""
    for module, device in product((nn.RNN, nn.LSTM, nn.GRU), get_all_device_types()):
        empty_batch = torch.zeros((5, 0, 3))
        rnn = module(input_size=3, hidden_size=4)
        if device == 'cuda':
            rnn.cuda()
            empty_batch = empty_batch.cuda()
        result = rnn(empty_batch)
        self.assertEqual(result[0].shape, torch.Size([5, 0, 4]))
        # Check that backward does not cause a hard error
        result[0].sum().backward()
| |
@unittest.skipIf(not TEST_CUDNN, "needs cudnn")
def test_RNN_dropout_state(self):
    """Check dropout behaviour across repeated calls and a save/load round trip.

    When dropout is inactive (p == 0 or eval mode), two forward passes and a
    forward pass through a reloaded copy of the module must all agree. When
    dropout is active they must all differ.
    """
    for p in (0, 0.1234):
        for train in (True, False):
            for cuda in (True, False):
                rnn = nn.RNN(100, 100, 2, bias=False, dropout=p, nonlinearity='relu')
                if cuda:
                    rnn.cuda()

                if train:
                    rnn.train()
                else:
                    rnn.eval()
                input = torch.rand(1, 1, 100)
                hx = torch.rand(2, 1, 100)
                if cuda:
                    input = input.cuda()
                    hx = hx.cuda()

                output1, hy1 = rnn(input, hx)
                output2, hy2 = rnn(input, hx)

                buf = io.BytesIO()
                # Only the bytes written into buf are used; nothing is bound to the call's result.
                torch.save(rnn, buf)
                buf.seek(0)
                # weights_only=False as this is legacy code that saves the model
                rnn2 = torch.load(buf, weights_only=False)
                rnn2.flatten_parameters()
                output3, hy3 = rnn2(input, hx)

                if p == 0 or not train:
                    self.assertEqual(output1, output2)
                    self.assertEqual(output1, output3)
                    self.assertEqual(hy1, hy2)
                    self.assertEqual(hy1, hy3)
                else:
                    self.assertNotEqual(output1, output2)
                    self.assertNotEqual(output1, output3)
                    self.assertNotEqual(hy1, hy2)
                    self.assertNotEqual(hy1, hy3)
| |
@unittest.skipIf(not TEST_CUDNN, "needs cudnn")
@set_default_dtype(torch.double)
def test_RNN_change_dropout(self):
    """Forward results must follow the current value of ``rnn.dropout`` as it is reassigned."""
    for train in (True, False):
        for cuda in (True, False):
            rnn = nn.RNN(100, 100, 2, dropout=0, nonlinearity='relu')
            input = torch.rand(3, 2, 100)
            if cuda:
                input.data = input.data.cuda()
                rnn.cuda()

            (rnn.train if train else rnn.eval)()

            last_output = None
            for p in (0, 0.5, 0, 0.7, 0.2, 1, 0.2, 0):
                rnn.dropout = p
                first, first_hy = rnn(input)
                second, second_hy = rnn(input)

                # Two calls agree unless dropout is active with 0 < p < 1.
                repeatable = not train or p == 0 or p == 1
                compare_calls = self.assertEqual if repeatable else self.assertNotEqual
                compare_calls(first, second)
                compare_calls(first_hy, second_hy)

                if last_output is not None:
                    # In eval mode the dropout value is irrelevant; in train mode every change must show.
                    compare_prev = self.assertNotEqual if train else self.assertEqual
                    compare_prev(first.data, last_output)
                    compare_prev(second.data, last_output)
                last_output = first.data
| |
def test_inplace_thnn(self):
    """Backward through in-place activations must leave the supplied gradient tensor unchanged."""
    for activation_cls in (nn.ReLU, nn.ELU, nn.SELU, nn.CELU, nn.RReLU):
        activation = activation_cls(inplace=True)
        x = torch.randn(5, 5, requires_grad=True)
        # Feed `x + 0` so the in-place op works on a fresh tensor rather than on x itself.
        out = activation(x + 0)
        grad_out = torch.randn(5, 5)
        grad_snapshot = grad_out.clone()
        out.backward(grad_out)
        self.assertEqual(grad_out, grad_snapshot)
| |
| |
def test_pixel_shuffle_unshuffle(self):
    """Exercise PixelShuffle / PixelUnshuffle on 1D-5D inputs, covering success and error cases."""

    def _make_input(num_input_dims, channels, height, width):
        # 1D and 2D inputs lack the (C, H, W) trailing dims and only reach the error paths.
        if num_input_dims == 1:
            return torch.rand(channels, requires_grad=True)
        if num_input_dims == 2:
            return torch.rand(height, width, requires_grad=True)
        batch_sizes = [random.randint(1, 3) for _ in range(num_input_dims - 3)]
        return torch.rand(*batch_sizes, channels, height, width, requires_grad=True)

    def _verify_pixel_shuffle(input, output, upscale_factor):
        # Imperatively confirm each output pixel was taken from the expected input location.
        # Used to validate the batch operations in pixel_shuffle.
        for c, h, w in product(range(output.size(-3)), range(output.size(-2)), range(output.size(-1))):
            height_idx = h // upscale_factor
            width_idx = w // upscale_factor
            channel_idx = (c * upscale_factor ** 2) + (upscale_factor * (h % upscale_factor)) + \
                (w % upscale_factor)
            self.assertEqual(output[..., c, h, w], input[..., channel_idx, height_idx, width_idx])

    def _check_shuffle_unshuffle(num_input_dims, valid_channels_dim=True, upscale_factor=None):
        if upscale_factor is None:
            upscale_factor = random.randint(2, 5)
        channels = random.randint(1, 4) * upscale_factor ** 2
        if not valid_channels_dim:
            # One extra channel makes the channels dim indivisible by upscale_factor ** 2.
            channels += 1
        height = random.randint(5, 10)
        width = random.randint(5, 10)

        input = _make_input(num_input_dims, channels, height, width)
        ps = nn.PixelShuffle(upscale_factor)
        pus = nn.PixelUnshuffle(downscale_factor=upscale_factor)

        if num_input_dims < 3 or not valid_channels_dim or upscale_factor <= 0:
            self.assertRaises(RuntimeError, lambda: ps(input))
            return

        output = ps(input)
        _verify_pixel_shuffle(input, output, upscale_factor)
        output.backward(output.data)
        self.assertEqual(input.data, input.grad.data)

        # Ensure unshuffle properly inverts shuffle.
        unshuffle_output = pus(output)
        self.assertEqual(input, unshuffle_output)

    def _check_unshuffle_error(num_input_dims, valid_height_dim=True, valid_width_dim=True,
                               downscale_factor=None):
        if downscale_factor is None:
            downscale_factor = random.randint(2, 5)
        channels = random.randint(1, 4)
        # Adding 1 makes the corresponding dim indivisible by downscale_factor.
        height = random.randint(3, 5) * abs(downscale_factor)
        if not valid_height_dim:
            height += 1
        width = random.randint(3, 5) * abs(downscale_factor)
        if not valid_width_dim:
            width += 1

        input = _make_input(num_input_dims, channels, height, width)
        pus = nn.PixelUnshuffle(downscale_factor)
        self.assertRaises(RuntimeError, lambda: pus(input))

    for num_input_dims in range(1, 6):
        # For 1D - 2D, this is an error case.
        # For 3D - 5D, this is a success case for pixel_shuffle + pixel_unshuffle.
        _check_shuffle_unshuffle(num_input_dims=num_input_dims)

        # Error cases for pixel_shuffle.
        _check_shuffle_unshuffle(num_input_dims=num_input_dims, valid_channels_dim=False)
        _check_shuffle_unshuffle(num_input_dims=num_input_dims, upscale_factor=0)
        _check_shuffle_unshuffle(num_input_dims=num_input_dims, upscale_factor=-2)

        # Error cases for pixel_unshuffle.
        _check_unshuffle_error(num_input_dims=num_input_dims, valid_height_dim=False)
        _check_unshuffle_error(num_input_dims=num_input_dims, valid_width_dim=False)
        _check_unshuffle_error(num_input_dims=num_input_dims, downscale_factor=0)
        _check_unshuffle_error(num_input_dims=num_input_dims, downscale_factor=-2)
| |
@set_default_dtype(torch.double)
def test_pixel_shuffle_nhwc_cpu(self):
    """Shuffle followed by unshuffle on a channels_last CPU input must match the contiguous reference."""
    shape = (3, 18, 4, 4)
    factor = 3

    x = torch.randn(*shape, device='cpu')
    x = x.contiguous(memory_format=torch.channels_last).requires_grad_()
    grad_out = torch.randn(*shape, device='cpu')
    shuffle = torch.nn.PixelShuffle(factor)
    unshuffle = torch.nn.PixelUnshuffle(factor)

    # Reference path: same values, default (contiguous) memory format.
    ref_x = x.detach().clone().contiguous().requires_grad_(True)
    ref_grad_out = grad_out.detach().clone().contiguous()
    ref_shuffle = torch.nn.PixelShuffle(factor)
    ref_unshuffle = torch.nn.PixelUnshuffle(factor)

    result = unshuffle(shuffle(x))
    result.backward(grad_out)
    ref_result = ref_unshuffle(ref_shuffle(ref_x))
    ref_result.backward(ref_grad_out)

    self.assertTrue(result.is_contiguous(memory_format=torch.channels_last))
    self.assertTrue(ref_result.is_contiguous())
    self.assertEqual(result, ref_result)
    self.assertEqual(x.grad, ref_x.grad)
| |
| # These tests should be OpInfo'd |
| def test_elu_inplace_on_view(self): |
| v = torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True, dtype=torch.double) |
| |
| def func(root): |
| x = root.clone() |
| view = x.narrow(0, 1, 2) |
| res = F.elu(view, inplace=True) |
| self.assertIs(res, view) |
| return x |
| |
| gradcheck(func, [v]) |
| gradgradcheck(func, [v]) |
| |
| def test_elu_inplace_gradgrad(self): |
| v = torch.randn(8, requires_grad=True, dtype=torch.double) |
| |
| def func(root): |
| x = root.clone() |
| return F.elu(x, inplace=True) |
| |
| gradcheck(func, [v]) |
| gradgradcheck(func, [v]) |
| |
| def test_relu_inplace_on_view(self): |
| v = torch.tensor([1.0, -1.0, 1.0, -1.0], requires_grad=True, dtype=torch.double) |
| |
| def func(root): |
| x = root.clone() |
| view = x.narrow(0, 1, 2) |
| res = F.relu(view, inplace=True) |
| self.assertIs(res, view) |
| return x |
| |
| gradcheck(func, [v]) |
| gradgradcheck(func, [v]) |
| |
def test_PReLU_backward_requires_grad_false(self):
    """Backward through PReLU must leave ``grad`` unset on an input that does not require grad."""
    devices = ['cpu'] + (['cuda'] if TEST_CUDA else [])
    for device in devices:
        prelu = nn.PReLU().to(device)
        x = torch.randn(2, 3, 4, 5, device=device, requires_grad=False)
        prelu(x).mean().backward()
        self.assertEqual(x.grad, None)
| |
| def test_bce_loss_always_nonnegative(self): |
| target = torch.ones(5) |
| input = torch.ones(5) |
| self.assertEqual((nn.BCELoss()(input, target) < 0).sum(), 0) |
| |
| target = torch.zeros(5) |
| input = torch.zeros(5) |
| self.assertEqual((nn.BCELoss()(input, target) < 0).sum(), 0) |
| |
| def test_bce_with_logits_raises_if_target_and_input_are_different_size(self): |
| target = torch.rand(5) |
| input = torch.rand(5, 1) |
| with self.assertRaises(ValueError): |
| nn.BCEWithLogitsLoss()(input, target) |
| |
| target = torch.rand(5, 1) |
| input = torch.rand(5) |
| with self.assertRaises(ValueError): |
| nn.BCEWithLogitsLoss()(input, target) |
| |
def test_bce_with_logits_gives_same_result_as_sigmoid_and_bce_loss(self):
    """BCEWithLogitsLoss(x) must equal BCELoss(sigmoid(x)) for several weight/reduction settings."""
    sigmoid = nn.Sigmoid()

    def assert_losses_agree(logits, target, *loss_args, **loss_kwargs):
        # Same constructor arguments go to both criteria; only the input differs.
        fused = nn.BCEWithLogitsLoss(*loss_args, **loss_kwargs)(logits, target)
        composed = nn.BCELoss(*loss_args, **loss_kwargs)(sigmoid(logits), target)
        self.assertEqual(fused, composed)

    target = torch.rand(64, 4)
    output = torch.rand(64, 4) - 0.5
    assert_losses_agree(output, target)
    assert_losses_agree(output, target, torch.rand(4))

    # Strongly negative logits with an all-zero target.
    target = torch.zeros(4, 1, dtype=torch.float)
    output = torch.full((4, 1), -100, dtype=torch.float)
    assert_losses_agree(output, target)
    assert_losses_agree(output, target, reduction='none')
    assert_losses_agree(output, target, torch.rand(1, dtype=torch.float))
| |
| def test_bce_loss_input_range(self): |
| bceloss = nn.BCELoss() |
| |
| target = torch.rand(25, 25) |
| output_valid = torch.rand(25, 25) |
| output_too_negative = output_valid - 1.0 |
| output_too_positive = output_valid + 1.0 |
| |
| loss_valid = bceloss(output_valid, target) |
| with self.assertRaisesRegex(RuntimeError, 'between 0 and 1'): |
| loss_too_negative = bceloss(output_too_negative, target) |
| with self.assertRaisesRegex(RuntimeError, 'between 0 and 1'): |
| loss_too_positive = bceloss(output_too_positive, target) |
| |
| def test_bce_loss_size_mismatch(self): |
| bceloss = nn.BCELoss() |
| a = torch.rand(25) |
| b = torch.rand(25, 1) |
| with self.assertRaisesRegex(ValueError, r'Using a target size \('): |
| bceloss(a, b) |
| |
def test_bce_with_logits_gives_same_result_as_sigmoid_and_bce_loss_large_tensors_with_grad(self):
    """On 1024x256 inputs, BCEWithLogitsLoss and BCELoss(sigmoid) must agree in value and input grad."""
    shape = (1024, 256)
    target = torch.rand(*shape)

    for reduction in ('none', 'mean', 'sum'):
        output_sig = torch.rand(*shape) - 0.5
        output_logits = output_sig.clone().detach()
        output_sig.requires_grad_()
        output_logits.requires_grad_()
        weight = torch.rand(shape[1])

        loss_sig = nn.BCELoss(weight, reduction=reduction)(torch.sigmoid(output_sig), target)
        loss_logits = nn.BCEWithLogitsLoss(weight, reduction=reduction)(output_logits, target)
        self.assertEqual(loss_logits, loss_sig)

        # An unreduced loss is not a scalar, so backward gets an explicit upstream gradient.
        backward_args = (torch.rand(*shape),) if reduction == 'none' else ()
        loss_sig.backward(*backward_args)
        loss_logits.backward(*backward_args)

        self.assertEqual(output_sig.grad, output_logits.grad)
| |
| def test_bce_with_logits_has_correct_forward_grad(self): |
| output = torch.randn(3, 5, requires_grad=True, dtype=torch.double) |
| target = torch.randn(3, 5, dtype=torch.double) |
| for reduction in ('sum', 'mean', 'none'): |
| gradcheck(lambda self, target: nn.BCEWithLogitsLoss(reduction=reduction)(self, target), |
| (output, target), check_forward_ad=True) |
| |
def test_bce_with_logits_has_correct_grad_at_zero(self):
    """With zero logits and zero targets, the summed loss must have gradient 0.5 per element."""
    logits = torch.zeros(3, 1, requires_grad=True)
    labels = torch.zeros(3, 1)
    loss = nn.BCEWithLogitsLoss(reduction='sum')(logits, labels)
    loss.backward()
    self.assertEqual(logits.grad, torch.full((3, 1), 0.5))
| |
def test_bce_with_logits_broadcasts_weights(self):
    """A (4,) or (16, 1) weight must give the same loss as its explicit (16, 4) expansion."""
    target = torch.rand(16, 4)
    output = torch.rand(16, 4) - 0.5

    for weight_shape in ((4,), (16, 1)):
        weight = torch.rand(*weight_shape)
        broadcast_loss = nn.BCEWithLogitsLoss(weight)(output, target)

        expanded_weight = weight.expand(16, 4).contiguous()
        explicit_loss = nn.BCEWithLogitsLoss(expanded_weight)(output, target)

        self.assertEqual(broadcast_loss, explicit_loss)
| |
def test_bce_with_logits_ones_in_pos_weights_are_the_same_as_none(self):
    """An all-ones pos_weight must give the same loss as passing no pos_weight."""
    labels = torch.rand(64, 4)
    logits = torch.rand(64, 4) - 0.5
    unit_pos_weight = torch.ones(64, 4)

    unweighted = nn.BCEWithLogitsLoss()(logits, labels)
    unit_weighted = nn.BCEWithLogitsLoss(pos_weight=unit_pos_weight)(logits, labels)
    self.assertEqual(unweighted, unit_weighted)
| |
def test_bce_with_logits_broadcasts_pos_weights(self):
    """A (4,) pos_weight must give the same loss as its (1, 4) and (64, 4) expansions."""
    labels = torch.rand(64, 4)
    logits = torch.rand(64, 4) - 0.5
    pos_weight = torch.rand(4)
    reference = nn.BCEWithLogitsLoss(pos_weight=pos_weight)(logits, labels)

    expanded_losses = [
        nn.BCEWithLogitsLoss(pos_weight=pos_weight.expand(*shape))(logits, labels)
        for shape in ((1, 4), (64, 4))
    ]

    for expanded_loss in expanded_losses:
        self.assertEqual(reference, expanded_loss)
| |
def test_bce_with_logits_with_pos_weight_has_correct_grad_at_zero(self):
    """With unit pos_weight, zero logits and zero targets, the gradient must be 0.5 per element."""
    logits = torch.zeros(3, 1, requires_grad=True)
    labels = torch.zeros(3, 1)
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.ones(3, 1), reduction='sum')
    criterion(logits, labels).backward()
    self.assertEqual(logits.grad, torch.full((3, 1), 0.5))
| |
| def test_bce_with_logits_stability(self): |
| output = torch.tensor([0., -120.]) |
| target = torch.tensor([0., 1.]) |
| pos_weight = torch.tensor([1., 1.]) |
| |
| out1 = nn.BCEWithLogitsLoss()(output, target) |
| self.assertTrue(torch.isfinite(out1).all().item()) |
| |
| out2 = nn.BCEWithLogitsLoss(pos_weight=pos_weight)(output, target) |
| self.assertTrue(torch.isfinite(out2).all().item()) |
| |
def test_bce_loss_broadcasts_weights(self):
    """A (4,) or (16, 1) BCELoss weight must give the same loss as its explicit (16, 4) expansion."""
    sigmoid = nn.Sigmoid()
    target = torch.rand(16, 4)
    output = torch.rand(16, 4) - 0.5

    for weight_shape in ((4,), (16, 1)):
        weight = torch.rand(*weight_shape)
        broadcast_loss = nn.BCELoss(weight)(sigmoid(output), target)

        expanded_weight = weight.expand(16, 4).contiguous()
        explicit_loss = nn.BCELoss(expanded_weight)(sigmoid(output), target)

        self.assertEqual(broadcast_loss, explicit_loss)
| |
| def test_hardtanh_inplace_gradgrad(self): |
| v = torch.randn(8, requires_grad=True, dtype=torch.double) |
| |
| def func(root): |
| x = root.clone() |
| return F.hardtanh(x, inplace=True) |
| |
| gradcheck(func, [v]) |
| gradgradcheck(func, [v]) |
| |
# test hardtanh backward for large tensor
def test_hardtanh_backward(self):
    """Compare hardtanh's backward on a 128x10000 tensor against a masked reference gradient."""
    shape = (128, 10000)
    x = torch.randn(*shape, requires_grad=True)
    upstream = torch.randn(*shape)
    zeros = torch.zeros(*shape)

    F.hardtanh(x).backward(upstream)

    # ref backward path for hardtanh: gradient passes only where -1 < x < 1
    inside_linear_region = (x > -1) & (x < 1)
    expected_grad = torch.where(inside_linear_region, upstream, zeros)
    self.assertEqual(x.grad, expected_grad)
| |
def test_batchnorm_nhwc_cpu(self):
    """Compare channels_last BatchNorm2d/3d on CPU with a contiguous reference, forward and backward."""

    def check(mod, size, dtype, mixed_dtype=False, format=torch.channels_last, precision=None):
        x = torch.randn(size, dtype=dtype, device='cpu', requires_grad=True)
        x = x.contiguous(memory_format=format).to(dtype)
        x.retain_grad()
        grad_out = torch.randn(size, dtype=dtype, device='cpu').contiguous(memory_format=format)
        num_channels = size[1]
        bn = mod(num_channels).cpu().to(dtype)
        bn.weight.data.uniform_()
        bn.bias.data.uniform_()

        # Reference: same values and parameters, default memory format.
        ref_x = x.detach().clone().contiguous().requires_grad_(True)
        ref_grad_out = grad_out.detach().clone().contiguous()
        ref_bn = mod(num_channels).cpu().to(dtype)
        ref_bn.load_state_dict(bn.state_dict())

        if mixed_dtype:
            # Float parameters with a lower-precision input.
            bn.float()
            ref_bn.float()

        result = bn(x)
        result.backward(grad_out)
        ref_result = ref_bn(ref_x)
        ref_result.backward(ref_grad_out)

        self.assertTrue(result.is_contiguous(memory_format=format))
        self.assertTrue(ref_result.is_contiguous())
        self.assertEqual(result, ref_result)
        self.assertEqual(bn.weight.grad, ref_bn.weight.grad, atol=precision, rtol=precision)
        self.assertEqual(bn.bias.grad, ref_bn.bias.grad)
        self.assertEqual(x.grad, ref_x.grad)

    tested_dtypes = (torch.float, torch.bfloat16, torch.float16)

    # test NC11 and N1HW; test mixed dtype
    for shape in [(4, 8, 10, 10), (4, 1, 9, 9), (4, 9, 1, 1)]:
        for dtype in tested_dtypes:
            for mixed in (False, True):
                # torch.float is always run unmixed, on both passes.
                use_mixed = mixed and dtype != torch.float
                check(nn.BatchNorm2d, shape, dtype, use_mixed, torch.channels_last)

    precisions = {torch.float: 1e-4, torch.bfloat16: 1e-4, torch.float16: None}
    for shape in [(4, 8, 2, 10, 10), (4, 1, 2, 9, 9), (4, 9, 1, 1, 1)]:
        for dtype in tested_dtypes:
            for mixed in (False, True):
                use_mixed = mixed and dtype != torch.float
                check(nn.BatchNorm3d, shape, dtype, use_mixed, torch.channels_last_3d, precisions[dtype])
| |
@parametrize_test(
    'bn_module',
    [
        subtest(torch.nn.BatchNorm2d, name="BatchNorm2d"),
        subtest(torch.nn.SyncBatchNorm, name="SyncBatchNorm"),
    ],
)
def test_batchnorm_non_contig_cpu(self, bn_module):
    """Run eval-mode batch norm on permuted (non-contiguous) CPU inputs and compare with references."""

    def check(low_precision_dtype):
        # Float input, permuted so that it is no longer contiguous.
        x = torch.arange(6, dtype=torch.float).reshape(1, 3, 2, 1).cpu()
        x = x.permute(0, 2, 1, 3)

        bn = bn_module(2).cpu().float().eval()
        bn.weight.data.uniform_()
        bn.bias.data.uniform_()

        ref_x = x.detach().clone().contiguous()
        ref_bn = nn.BatchNorm2d(2).cpu().float().eval()
        ref_bn.load_state_dict(bn.state_dict())

        result = bn(x)
        ref_result = ref_bn(ref_x)

        self.assertTrue(result.is_contiguous(memory_format=torch.channels_last))
        self.assertTrue(ref_result.is_contiguous())
        self.assertEqual(result, ref_result)

        # Low-precision input through a float module, compared with the all-float result.
        x_low = torch.arange(24, dtype=low_precision_dtype).reshape(1, 3, 2, 4)
        x_low = x_low.permute(0, 2, 1, 3)
        x_float = x_low.float()
        bn_mixed = bn_module(2).float().eval()
        bn_float = deepcopy(bn_mixed)
        out_low = bn_mixed(x_low)
        out_float = bn_float(x_float)
        self.assertEqual(out_float, out_low.float(), atol=0.05, rtol=0.05)

    for low_precision_dtype in (torch.bfloat16, torch.float16):
        check(low_precision_dtype)
| |
@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
@unittest.skipIf(not TEST_CUDNN, "needs cudnn")
def test_batchnorm_cudnn_nhwc(self):
    """Compare BatchNorm2d on channels_last CUDA inputs with a contiguous reference.

    Forward output, parameter gradients and input gradient must all match.
    """

    def run_test(input, grad_output):
        c = input.size(1)
        mod = nn.BatchNorm2d(c).cuda().float()
        mod.weight.data.uniform_()
        mod.bias.data.uniform_()
        ref_input = input.detach().clone().contiguous().requires_grad_(True)
        # Derive the reference gradient from this helper's own argument,
        # not from a variable of the enclosing test.
        ref_grad = grad_output.detach().clone().contiguous()
        ref_mod = nn.BatchNorm2d(c).cuda().float()
        ref_mod.load_state_dict(mod.state_dict())
        out = mod(input)
        out.backward(grad_output)
        ref_out = ref_mod(ref_input)
        ref_out.backward(ref_grad)
        self.assertTrue(out.is_contiguous(memory_format=torch.channels_last))
        self.assertTrue(ref_out.is_contiguous())
        self.assertEqual(out, ref_out)
        self.assertEqual(mod.weight.grad, ref_mod.weight.grad)
        self.assertEqual(mod.bias.grad, ref_mod.bias.grad)
        self.assertEqual(input.grad, ref_input.grad)

    input = torch.randint(1, 10, (4, 8, 2, 2), dtype=torch.float32, device="cuda")
    input = input.contiguous(memory_format=torch.channels_last).detach().requires_grad_()

    grad = torch.randint(1, 10, (4, 8, 2, 2), dtype=torch.float32, device="cuda")
    grad = grad.contiguous(memory_format=torch.channels_last)
    run_test(input, grad)
    # see #42588, grad is channels_last contiguous, but grad.suggest_memory_format (rightly) return "contiguous"
    # not channels_last
    input = torch.randint(1, 10, (2, 8, 8, 1), dtype=torch.float32, device="cuda")
    input = input.contiguous(memory_format=torch.channels_last).detach().requires_grad_()
    grad = torch.randint(1, 10, (2, 8, 8, 1), dtype=torch.float32, device="cuda")
    grad = grad.permute(0, 2, 1, 3)
    run_test(input, grad)
| |
@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
def test_batchnorm_cudnn_half(self):
    """Run half-precision CUDA input through a half module, then (with cuDNN) through a float module."""
    x = torch.randint(1, 10, (2, 3, 2, 2), dtype=torch.half, device="cuda", requires_grad=True)
    bn = nn.BatchNorm2d(3).half().cuda()

    # THNN
    thnn_output = bn(x)
    thnn_output.sum().backward()
    thnn_input_grad = x.grad.data.clone()
    self.assertEqualTypeString(thnn_output, x)

    if not TEST_CUDNN:
        return

    # cuDNN
    x.grad = None
    bn = bn.float()
    cudnn_output = bn(x)
    cudnn_output.sum().backward()
    cudnn_input_grad = x.grad.data.clone()
    self.assertEqualTypeString(cudnn_output, x)
    self.assertEqual(cudnn_output, thnn_output)
    self.assertEqual(cudnn_input_grad, thnn_input_grad, atol=1e-3, rtol=0)
| |
@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
def test_batchnorm_nonaffine_cuda_half_input(self):
    """A float, non-affine BatchNorm2d fed half input must return the input's type in train and eval."""
    half_input = torch.randn(16, 3, 24, 24, dtype=torch.half, device="cuda")
    bn = nn.BatchNorm2d(3, affine=False).cuda().float()  # keep running stats in FP32

    train_output = bn(half_input)
    self.assertEqualTypeString(train_output, half_input)

    bn.eval()
    eval_output = bn(half_input)
    self.assertEqualTypeString(eval_output, half_input)
| |
| def test_batchnorm_raises_error_if_less_than_one_value_per_channel(self): |
| x = torch.rand(10)[None, :, None] |
| with self.assertRaises(ValueError): |
| torch.nn.BatchNorm1d(10)(x) |
| |
| def test_batchnorm_raises_error_if_running_mean_is_not_same_size_as_input(self): |
| input = torch.rand(2, 10) |
| running_var = torch.rand(10) |
| wrong_sizes = [9, 11] |
| for size in wrong_sizes: |
| with self.assertRaises(RuntimeError): |
| F.batch_norm(input, torch.rand(size), running_var) |
| |
| def test_batchnorm_raises_error_if_running_var_is_not_same_size_as_input(self): |
| input = torch.rand(2, 10) |
| running_mean = torch.rand(10) |
| wrong_sizes = [9, 11] |
| for size in wrong_sizes: |
| with self.assertRaises(RuntimeError): |
| F.batch_norm(input, running_mean, torch.rand(size)) |
| |
| def test_batchnorm_raises_error_if_weight_is_not_same_size_as_input(self): |
| input = torch.rand(2, 10) |
| running_mean = torch.rand(10) |
| running_var = torch.rand(10) |
| wrong_sizes = [9, 11] |
| for size in wrong_sizes: |
| with self.assertRaises(RuntimeError): |
| F.batch_norm(input, running_mean, running_var, weight=Parameter(torch.rand(size))) |
| |
| def test_batchnorm_raises_error_if_bias_is_not_same_size_as_input(self): |
| input = torch.rand(2, 10) |
| running_mean = torch.rand(10) |
| running_var = torch.rand(10) |
| wrong_sizes = [9, 11] |
| for size in wrong_sizes: |
| with self.assertRaises(RuntimeError): |
| F.batch_norm(input, running_mean, running_var, bias=Parameter(torch.rand(size))) |
| |
| def test_batchnorm_raises_error_if_running_var_or_running_mean_have_forward_grad(self): |
| args = ( |
| torch.randn(3, 2, 5), # input |
| torch.randn(2), # running_mean |
| torch.randn(2), # running_var |
| ) |
| kwargs = {'training': False, 'momentum': -1.2} |
| fn = partial(F.batch_norm, **kwargs) |
| |
| for dual_indices in ((0,), (1,), (1, 2), (0, 1), (0, 1, 2),): |
| tangents = tuple(torch.rand_like(x) for x in args) |
| |
| with fwAD.dual_level(): |
| duals = [fwAD.make_dual(primal, tangent) if i in dual_indices else primal |
| for i, (primal, tangent) in enumerate(zip(args, tangents))] |
| msg = "batch_norm is not differentiable wrt running_mean and running_var" |
| # 0 needs to have forward grad because otherwise we won't even run batch_norm_jvp |
| if (1 in dual_indices or 2 in dual_indices) and 0 in dual_indices: |
| with self.assertRaisesRegex(RuntimeError, msg): |
| fn(*duals) |
| else: |
| fn(*duals) |
| |
| def test_batchnorm_buffer_update_when_stats_are_not_tracked(self): |
| input_size = (32, 4) |
| # Instantiate BN with buffers that are not None |
| bn = nn.BatchNorm1d(input_size[1], track_running_stats=True) |
| # Use buffers for normalization but don't update them |
| bn.track_running_stats = False |
| # Store initial values |
| num_batches = bn.num_batches_tracked.clone() |
| running_mean = bn.running_mean.clone() |
| running_var = bn.running_var.clone() |
| # Forward random tensor |
| _ = bn(torch.rand(input_size)) |
| # Ensure none of the buffers has been updated |
| self.assertTrue(torch.equal(num_batches, bn.num_batches_tracked)) |
| self.assertTrue(torch.equal(running_mean, bn.running_mean)) |
| self.assertTrue(torch.equal(running_var, bn.running_var)) |
| |
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
def test_batchnorm_nhwc_cuda(self):
    """Eval-mode BatchNorm2d on CUDA must give equal results for contiguous and channels_last input."""
    N, C, H, W = 2, 64, 50, 50
    for dtype in (torch.half, torch.float):
        bn = torch.nn.BatchNorm2d(C, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        bn = bn.eval().cuda().to(dtype)
        contiguous_input = torch.randn(N, C, H, W, device=torch.device('cuda'), dtype=dtype)
        nhwc_input = contiguous_input.contiguous(memory_format=torch.channels_last)
        contiguous_out = bn(contiguous_input)
        nhwc_out = bn(nhwc_input)
        self.assertTrue(torch.equal(contiguous_out, nhwc_out))
| |
| def test_batchnorm_load_state_dict(self): |
| bn = torch.nn.BatchNorm2d(3) |
| self.assertEqual(bn.state_dict()["num_batches_tracked"], torch.tensor(0)) |
| |
| bn.num_batches_tracked = torch.tensor(10) |
| self.assertEqual(bn.state_dict()["num_batches_tracked"], torch.tensor(10)) |
| |
| empty_dict = OrderedDict() |
| bn.load_state_dict(empty_dict, strict=False) |
| self.assertEqual(bn.state_dict()["num_batches_tracked"], torch.tensor(10)) |
| |
| # test that when `num_batches_tracked` is not in loaded state_dict, |
| # meta num_batches_tracked is still replaced with singleton 0 tensor |
| with torch.device('meta'): |
| meta_bn = torch.nn.BatchNorm2d(3) |
| self.assertTrue(meta_bn.num_batches_tracked.device == torch.device('meta')) |
| meta_bn.load_state_dict(empty_dict, assign=True, strict=False) |
| self.assertEqual(meta_bn.state_dict()["num_batches_tracked"], torch.tensor(0)) |
| |
| def test_batch_norm_update_stats(self): |
| input = torch.rand(0, 1) |
| running_mean = torch.rand(1) |
| running_var = torch.rand(1) |
| with self.assertRaisesRegex(RuntimeError, |
| re.escape("input tensor must have at least one element, but got input_sizes = [0, 1]")): |
| torch.batch_norm_update_stats(input=input, momentum=0.0, running_mean=running_mean, running_var=running_var) |
| |
| def test_pairwise_distance(self): |
| input1 = torch.randn(4, 4, requires_grad=True, dtype=torch.double) |
| input2 = torch.randn(4, 4, requires_grad=True, dtype=torch.double) |
| self.assertTrue(gradcheck(lambda x, y: F.pairwise_distance(x, y), (input1, input2))) |
| |
# TODO: Create an OpInfo for pdist
def test_pdist(self):
    """gradcheck F.pdist for several p values, on plain and transposed inputs, per device."""
    p_values = [0, 1, 2, 0.5, 1.5, 2.5, float('inf')]
    for device in device_():
        for trans in (False, True):
            inp = torch.randn(4, 5, dtype=torch.double, device=device, requires_grad=True)
            if trans:
                inp = inp.transpose(0, 1)
            for p in p_values:
                self.assertTrue(gradcheck(lambda x, p=p: F.pdist(x, p), (inp,)))
| |
def test_pdist_zeros(self):
    """Test that grad is still valid when dist is 0"""
    p_values = [0, 1, 2, 0.5, 1.5, 2.5, float('inf')]
    for device in device_():
        single_row = torch.randn(1, 3, dtype=torch.double, device=device, requires_grad=True)
        # Two identical rows, so their pairwise distance is zero.
        inp = single_row.repeat([2, 1])
        for p in p_values:
            self.assertTrue(gradcheck(lambda x, p=p: F.pdist(x, p), (inp,)))
| |
def test_pdist_empty_row(self):
    """gradcheck F.pdist on a single-row (1, 3) input on every device."""
    for device in device_():
        single_row = torch.randn(1, 3, dtype=torch.double, device=device, requires_grad=True)
        passed = gradcheck(F.pdist, (single_row,))
        self.assertTrue(passed)
| |
def test_pdist_empty_col(self):
    """gradcheck F.pdist on a zero-column (4, 0) input on every device."""
    for device in device_():
        no_columns = torch.randn(4, 0, dtype=torch.double, device=device, requires_grad=True)
        passed = gradcheck(F.pdist, (no_columns,))
        self.assertTrue(passed)
| |
| @unittest.expectedFailure |
| def test_pdist_cpu_gradgrad_unimplemented(self): |
| inp = torch.randn(4, 5, requires_grad=True) |
| gradgradcheck(F.pdist, (inp,)) |
| |
@unittest.expectedFailure
def test_pdist_cuda_gradgrad_unimplemented(self):
    """Marked as an expected failure: gradgradcheck of F.pdist on a CUDA input."""
    cuda_points = torch.randn(4, 5, device='cuda', requires_grad=True)
    gradgradcheck(F.pdist, (cuda_points,))
| |
# Merge into OpInfo?
# test for backward in https://github.com/pytorch/pytorch/issues/15511
def test_pdist_large(self):
    """Run one forward and backward pass of torch.pdist on a (1000, 1) input per device."""
    for device in device_():
        # shape[0] should be able to be (roughly) arbitrarily large, but the kernel
        # is currently limited to smaller sizes (see issue above); this is just testing
        # a floor.
        shape = (1000, 1)
        x = torch.randn(shape, device=device).requires_grad_()
        output = torch.pdist(x, p=2)
        # just run a single backward, as gradcheck/gradgradcheck is expensive here
        output.sum().backward()
| |
def test_cosine_embedding_loss_with_diff_type(self):
    """cosine_embedding_loss must give about the same value for every mix of real input dtypes."""
    loss_fn = torch.nn.functional.cosine_embedding_loss
    for device in device_():
        input1 = torch.tensor([[2, 3, 4], [6, 2, 4]], dtype=torch.double, device=device)
        input2 = torch.tensor([[2, 3, 5], [3, 2, 1]], dtype=torch.double, device=device)
        target = torch.tensor([1, -1], dtype=torch.int, device=device)
        expected = loss_fn(input1, input2, target)

        math_dtypes = get_all_math_dtypes(device)
        for dt1, dt2, dt3 in product(math_dtypes, repeat=3):
            # dt3 is used as dtype for target = [1, -1], so let's skip unsigned type
            if dt3 == torch.uint8 or dt1.is_complex or dt2.is_complex or dt3.is_complex:
                continue
            input1 = input1.to(dt1)
            input2 = input2.to(dt2)
            target = target.to(dt3)
            result = loss_fn(input1, input2, target)
            self.assertEqual(result.item(), expected.item(), atol=0.001, rtol=0)
| |
def test_cosine_embedding_loss_error_on_diff_shapes(self):
    """A 2-D input paired with a 1-D input must raise an "expects 2D" RuntimeError."""
    for dev in device_():
        two_dim = torch.empty((0, 0), dtype=torch.double, device=dev)
        one_dim = torch.empty((0,), dtype=torch.double, device=dev)
        labels = torch.empty((0,), dtype=torch.int, device=dev)
        with self.assertRaisesRegex(RuntimeError, ".*expects 2D.*"):
            F.cosine_embedding_loss(two_dim, one_dim, labels)
| |
def test_cosine_embedding_loss_error_on_nonexpandable_shapes(self):
    """Inputs of shape (1, 5) and (1, 6) must raise a size-mismatch RuntimeError."""
    for dev in device_():
        five_wide = torch.empty((1, 5), dtype=torch.double, device=dev)
        six_wide = torch.empty((1, 6), dtype=torch.double, device=dev)
        labels = torch.ones((1,), dtype=torch.int, device=dev)
        with self.assertRaisesRegex(RuntimeError, ".*must match the size.*"):
            F.cosine_embedding_loss(five_wide, six_wide, labels)
| |
def test_kl_div_with_diff_type(self):
    """kl_div should agree with the double result for mixed real input/target dtypes.

    Every dtype pair is cast from the double reference tensors, so no
    iteration sees data already converted by a previous one. A float16
    target is skipped when the device type is 'cpu'.
    """
    for device in device_():
        ref_input = torch.tensor([[2, 3, 5], [3, 2, 1]], dtype=torch.double, device=device)
        ref_target = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.double, device=device)
        expected = torch.nn.functional.kl_div(ref_input, ref_target)
        real_dtypes = (torch.float32, torch.float64, torch.float16)
        on_cpu = torch.device(device).type == 'cpu'
        for input_dtype, target_dtype in product(real_dtypes, repeat=2):
            if on_cpu and target_dtype == torch.float16:
                continue
            result = torch.nn.functional.kl_div(ref_input.to(input_dtype),
                                                ref_target.to(target_dtype))
            self.assertEqual(result.item(), expected.item(), atol=0.001, rtol=0)
| |
def test_kl_div_with_diff_type_log_target(self):
    """kl_div(log_target=True) should agree with the double result for mixed real dtypes.

    Every dtype pair is cast directly from the double reference tensors.
    Converting cumulatively would leave the log-target at float16 precision
    for all later float32/float64 cases once it has been cast to float16.
    A float16 target is skipped when the device type is 'cpu'.
    """
    for device in device_():
        ref_input = torch.tensor([[2, 3, 5], [3, 2, 1]], dtype=torch.double, device=device)
        ref_target = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.double, device=device).log()
        expected = torch.nn.functional.kl_div(ref_input, ref_target, log_target=True)
        real_dtypes = (torch.float32, torch.float64, torch.float16)
        on_cpu = torch.device(device).type == 'cpu'
        for input_dtype, target_dtype in product(real_dtypes, repeat=2):
            if on_cpu and target_dtype == torch.float16:
                continue
            result = torch.nn.functional.kl_div(ref_input.to(input_dtype),
                                                ref_target.to(target_dtype),
                                                log_target=True)
            self.assertEqual(result.item(), expected.item(), atol=0.001, rtol=0)
| |
def test_kl_div_log_softmax_target(self):
    """Unreduced kl_div between two identical log-softmax outputs must be all zeros."""
    rows = [[1.0, 2, 3], [5.0, 5, 5]]
    for dev in device_():
        a = torch.tensor(rows, device=dev)
        b = torch.tensor(rows, device=dev)
        log_p = F.log_softmax(a, 1)
        log_q = F.log_softmax(b, 1)
        pointwise = F.kl_div(log_p, log_q, reduction='none', log_target=True)
        self.assertEqual(pointwise, torch.zeros_like(a))
| |
def test_cosine_embedding_loss_no_reduce(self):
    """Gradcheck unreduced cosine_embedding_loss and compare it to the reference fn."""
    input1 = torch.randn(15, 10, requires_grad=True, dtype=torch.double)
    input2 = torch.randn(15, 10, requires_grad=True, dtype=torch.double)
    target = torch.randn(15, dtype=torch.double).sign()

    def unreduced(x, y, z):
        return F.cosine_embedding_loss(x, y, z, reduction='none')

    self.assertTrue(gradcheck(unreduced, (input1, input2, target)))

    actual = F.cosine_embedding_loss(input1, input2, target, reduction='none')
    reference = loss_reference_fns['CosineEmbeddingLoss'](input1, input2, target, reduction='none')
    self.assertEqual(actual, reference)
| |
def test_cosine_embedding_loss_margin_no_reduce(self):
    """Same as the no-reduce check, but with margin=0.5."""
    input1 = torch.randn(15, 10, requires_grad=True, dtype=torch.double)
    input2 = torch.randn(15, 10, requires_grad=True, dtype=torch.double)
    target = torch.randn(15, dtype=torch.double).sign()

    def unreduced_with_margin(x, y, z):
        return F.cosine_embedding_loss(x, y, z, margin=0.5, reduction='none')

    self.assertTrue(gradcheck(unreduced_with_margin, (input1, input2, target)))

    actual = F.cosine_embedding_loss(input1, input2, target, margin=0.5, reduction='none')
    reference = loss_reference_fns['CosineEmbeddingLoss'](
        input1, input2, target, margin=0.5, reduction='none')
    self.assertEqual(actual, reference)
| |
| def test_cosine_embedding_loss_invalid_shape(self): |
| input1 = torch.randn(15, 10) |
| input2 = torch.randn(15, 10) |
| target = torch.randn(15, 1).sign() |
| |
| with self.assertRaisesRegex(RuntimeError, "1D target tensor expected"): |
| F.cosine_embedding_loss(input1, input2, target) |
| |
| with self.assertRaisesRegex(RuntimeError, "1D target tensor expects 2D input tensors"): |
| F.cosine_embedding_loss(torch.randn(10), torch.randn(10), torch.randn(10)) |
| |
| with self.assertRaisesRegex(RuntimeError, "0D target tensor expects 1D input tensors"): |
| F.cosine_embedding_loss(torch.randn(2, 5), torch.randn(2, 5), torch.randn(())) |
| |
def test_margin_ranking_loss_no_reduce(self):
    """Gradcheck unreduced margin_ranking_loss and compare it to the reference fn."""
    input1 = torch.randn(15, dtype=torch.double).mul_(10).requires_grad_()
    input2 = torch.randn(15, dtype=torch.double).mul_(10).requires_grad_()
    target = torch.randn(15, dtype=torch.double).sign()

    def unreduced(x, y, z):
        return F.margin_ranking_loss(x, y, z, reduction='none')

    self.assertTrue(gradcheck(unreduced, (input1, input2, target)))

    actual = F.margin_ranking_loss(input1, input2, target, reduction='none')
    reference = loss_reference_fns['MarginRankingLoss'](input1, input2, target, reduction='none')
    self.assertEqual(actual, reference)
| |
def test_margin_ranking_loss_margin_no_reduce(self):
    """Same as the no-reduce check, but with margin=0.5."""
    input1 = torch.randn(15, dtype=torch.double).mul_(10).requires_grad_()
    input2 = torch.randn(15, dtype=torch.double).mul_(10).requires_grad_()
    target = torch.randn(15, dtype=torch.double).sign()

    def unreduced_with_margin(x, y, z):
        return F.margin_ranking_loss(x, y, z, margin=0.5, reduction='none')

    self.assertTrue(gradcheck(unreduced_with_margin, (input1, input2, target)))

    actual = F.margin_ranking_loss(input1, input2, target, margin=0.5, reduction='none')
    reference = loss_reference_fns['MarginRankingLoss'](
        input1, input2, target, margin=0.5, reduction='none')
    self.assertEqual(actual, reference)
| |
def test_triplet_margin_loss(self):
    """Gradcheck triplet_margin_loss with defaults and compare it to the reference fn."""
    # Three independent (5, 10) double inputs that all require grad.
    input1 = torch.randn(5, 10, requires_grad=True, dtype=torch.double)
    input2 = torch.randn(5, 10, requires_grad=True, dtype=torch.double)
    input3 = torch.randn(5, 10, requires_grad=True, dtype=torch.double)
    triplet = (input1, input2, input3)

    def loss_fn(x1, x2, x3):
        return F.triplet_margin_loss(x1, x2, x3)

    self.assertTrue(gradcheck(loss_fn, triplet))

    actual = F.triplet_margin_loss(input1, input2, input3)
    reference = loss_reference_fns['TripletMarginLoss'](input1, input2, input3)
    self.assertEqual(actual, reference)
| |
def test_triplet_margin_loss_swap(self):
    """Gradcheck triplet_margin_loss with swap=True and compare it to the reference fn."""
    input1 = torch.randn(5, 10, requires_grad=True, dtype=torch.double)
    input2 = torch.randn(5, 10, requires_grad=True, dtype=torch.double)
    input3 = torch.randn(5, 10, requires_grad=True, dtype=torch.double)
    triplet = (input1, input2, input3)

    def loss_fn(x1, x2, x3):
        return F.triplet_margin_loss(x1, x2, x3, swap=True)

    self.assertTrue(gradcheck(loss_fn, triplet))

    actual = F.triplet_margin_loss(input1, input2, input3, swap=True)
    reference = loss_reference_fns['TripletMarginLoss'](input1, input2, input3, swap=True)
    self.assertEqual(actual, reference)
| |
def test_triplet_margin_loss_no_reduce(self):
    """Gradcheck unreduced triplet_margin_loss and compare it to the reference fn."""
    input1 = torch.randn(5, 10, requires_grad=True, dtype=torch.double)
    input2 = torch.randn(5, 10, requires_grad=True, dtype=torch.double)
    input3 = torch.randn(5, 10, requires_grad=True, dtype=torch.double)
    triplet = (input1, input2, input3)

    def loss_fn(x1, x2, x3):
        return F.triplet_margin_loss(x1, x2, x3, reduction='none')

    self.assertTrue(gradcheck(loss_fn, triplet))

    actual = F.triplet_margin_loss(input1, input2, input3, reduction='none')
    reference = loss_reference_fns['TripletMarginLoss'](input1, input2, input3, reduction='none')
    self.assertEqual(actual, reference)
| |
def test_triplet_margin_loss_swap_no_reduce(self):
    """Gradcheck unreduced triplet_margin_loss with swap=True against the reference fn."""
    input1 = torch.randn(5, 10, requires_grad=True, dtype=torch.double)
    input2 = torch.randn(5, 10, requires_grad=True, dtype=torch.double)
    input3 = torch.randn(5, 10, requires_grad=True, dtype=torch.double)
    triplet = (input1, input2, input3)

    def loss_fn(x1, x2, x3):
        return F.triplet_margin_loss(x1, x2, x3, swap=True, reduction='none')

    self.assertTrue(gradcheck(loss_fn, triplet))

    actual = F.triplet_margin_loss(input1, input2, input3, swap=True, reduction='none')
    reference = loss_reference_fns['TripletMarginLoss'](
        input1, input2, input3, swap=True, reduction='none')
    self.assertEqual(actual, reference)
| |
| def test_pointwise_loss_target_grad_none_reduction(self): |
| i = torch.randn(5, 10) |
| t = torch.randn(5, 10, requires_grad=True) |
| self.assertEqual(F.mse_loss(i, t, reduction='none').size(), t.size()) |
| self.assertEqual(F.l1_loss(i, t, reduction='none').size(), t.size()) |
| |
def test_pointwise_loss_broadcast(self):
    """Pointwise losses must broadcast a (2, 1) input against a (2, 10) target.

    For mse, l1, smooth_l1 and huber losses, checks the unreduced output
    size and runs gradcheck for every reduction, with the target both
    requiring and not requiring grad.
    """
    def with_positional_reduction(loss_fn):
        # gradcheck passes the reduction positionally; forward it as a keyword.
        return lambda x, y, r: loss_fn(x, y, reduction=r)

    wrapped_losses = [
        with_positional_reduction(loss_fn)
        for loss_fn in (F.mse_loss, F.l1_loss, F.smooth_l1_loss, F.huber_loss)
    ]

    input = torch.randn(2, 1, requires_grad=True, dtype=torch.double)
    for fn, requires_grad in product(wrapped_losses, [True, False]):
        # Exercise targets both with and without requires_grad.
        target = torch.randn(2, 10, requires_grad=requires_grad, dtype=torch.double)
        for reduction in ('none', 'mean', 'sum'):
            l = fn(input, target, reduction)
            if reduction == 'none':
                self.assertEqual(l.size(), target.size())
            self.assertTrue(gradcheck(fn, (input, target, reduction)))
| |
# Regression test for https://github.com/pytorch/pytorch/issues/27692, which
# reports that l1_loss gives a wrong result for big batch sizes.
def test_l1_loss_correct(self):
    """L1Loss against an all-zero target must equal the mean absolute input."""
    criterion = torch.nn.L1Loss()
    for dtype, N in product([torch.float, torch.cfloat], range(1, 50, 10)):
        input = torch.rand(N, 3, 1024, 1024, dtype=dtype)
        zeros = torch.zeros_like(input)
        self.assertEqual(criterion(input, zeros), input.abs().mean())
| |
def test_smoothl1loss_intergral_target(self):
    """smooth_l1_loss input grads must match for an integral target and its float copy."""
    def grad_wrt_input(inp, tgt, reduction):
        # Sum the (possibly unreduced) loss so backward always starts from a scalar.
        loss = F.smooth_l1_loss(inp, tgt, reduction=reduction, beta=0.5)
        loss.sum().backward()
        return inp.grad

    devices = device_()
    int_dtypes = integral_types()
    reductions = ('none', 'sum', 'mean')
    for device in devices:
        for dtype in int_dtypes:
            for reduction in reductions:
                input = torch.randn(2, 2, device=device, requires_grad=True)
                target = torch.randint(0, 9, (2, 2), device=device, dtype=dtype)

                grad_float_target = grad_wrt_input(input, target.float(), reduction)

                # Fresh leaf copy so the second backward does not accumulate into input.grad.
                fresh_input = input.detach().clone().requires_grad_(True)
                grad_int_target = grad_wrt_input(fresh_input, target, reduction)

                self.assertEqual(grad_int_target, grad_float_target)
| |
| def test_smoothl1loss_negative_beta_not_supported(self): |
| with self.assertRaises(RuntimeError): |
| F.smooth_l1_loss(torch.randn(2, 2), torch.randn(2, 2), beta=-1.0) |
| |
| def test_huber_loss_invalid_delta(self): |
| def _test_huber_loss_delta_error_helper(delta): |
| input, target = torch.randn(2, 2), torch.randn(2, 2) |
| loss = torch.nn.HuberLoss(delta=delta) |
| with self.assertRaises(RuntimeError): |
| loss(input, target) |
| |
| def test_huber_loss_negative_delta(): |
| _test_huber_loss_delta_error_helper(delta=-0.5) |
| |
| def test_huber_loss_zero_delta(): |
| _test_huber_loss_delta_error_helper(delta=0.0) |
| |
| test_huber_loss_negative_delta() |
| test_huber_loss_zero_delta() |
| |
@set_default_dtype(torch.double)
def test_cosine_similarity(self):
    """Check cosine_similarity shapes, precision, zero-norm grads, promotion and broadcasting."""
    # Output shape: reducing dim=1 of (1, 3, 2, 1) inputs leaves (1, 2, 1).
    input_size = (1, 3, 2, 1)
    expected_size = (1, 2, 1)
    input1 = torch.randn(input_size, requires_grad=True)
    input2 = torch.randn(input_size, requires_grad=True)
    similarity = F.cosine_similarity(input1, input2, dim=1)
    self.assertEqual(similarity.size(), expected_size)

    # Numerical precision, issue #18057: identical vectors must not exceed 1.0.
    ramp = [float(i) for i in range(84)]
    vv1 = torch.tensor(ramp).unsqueeze(0)
    vv2 = torch.tensor(ramp).unsqueeze(0)
    out = F.cosine_similarity(vv1, vv2)
    self.assertLessEqual(out, 1.0)

    # Division by zero in the backward formula.
    # previous behavior: <x,y>/max(eps, ||x|| * ||y||)
    # current: <x/max(eps, ||x||), y/max(eps,||y||)>
    # With f(x,y) the cosine similarity,
    # df/dx = y/(||x|| * ||y||) - (x * <x,y> * ||y||/||x||)/(||x|| * ||y||)^2
    # Here x := input2 = 0 and y := input1 != 0, so the gradient wrt x
    # simplifies to g(x,y) := y/(||x|| * ||y||).
    # The previous test checked g(x,y) == y/eps,
    # the current test checks g(x,y) == (y/||y||)/eps.
    input1 = torch.randn(10).requires_grad_()
    input2 = torch.zeros_like(input1).requires_grad_()
    torch.cosine_similarity(input1, input2, 0).sum().backward()
    self.assertEqual(input1.grad, torch.zeros_like(input1))
    # 1e8 is presumably 1/eps for the default eps -- TODO confirm against the docs.
    self.assertEqual(input2.grad, input1 / input1.norm() * 1e8)

    # Type promotion, issue #61454: int8 against a floating tensor.
    input = torch.tensor(12.)
    out = F.cosine_similarity(input.to(torch.int8), input, dim=-1)
    self.assertEqual(out, 1.)

    # Broadcasting, #109333: a (2, 3) tensor against (1, 1) and (1,) tensors.
    for b_shape in ((1, 1), (1,)):
        a = torch.ones(2, 3, dtype=torch.float)
        b = torch.ones(*b_shape, dtype=torch.float)
        out = F.cosine_similarity(a, b)
        self.assertEqual(out, torch.ones(2, dtype=torch.float))
| |
| |
def test_grid_sample_error_checking(self):
    """grid_sample must accept a valid call and reject malformed arguments.

    Each malformed call is expected to raise the listed exception type with
    a message matching the listed pattern.
    """
    input = torch.empty(1, 1, 2, 2)
    grid = torch.empty(1, 1, 1, 2)

    # assert no error
    F.grid_sample(input, grid, align_corners=False)

    # (exception type, message pattern, zero-arg callable making the bad call).
    # Callables keep argument construction inside the assertRaisesRegex context.
    failing_calls = [
        (ValueError, "but got: 'garbage'",
         lambda: F.grid_sample(input, grid, mode='garbage', align_corners=False)),
        (ValueError, "but got: 'garbage'",
         lambda: F.grid_sample(input, grid, padding_mode='garbage', align_corners=False)),
        (RuntimeError, "expected grid to have size 1 in last dimension",
         lambda: F.grid_sample(input[0], grid, align_corners=False)),
        (RuntimeError, "expected grid to have size 2 in last dimension",
         lambda: F.grid_sample(input, torch.empty(1, 1, 1, 1, 3), align_corners=False)),
        (RuntimeError, "expected grid and input to have same batch size",
         lambda: F.grid_sample(input, torch.empty(2, 1, 1, 2), align_corners=False)),
        (RuntimeError, "expected grid to have size 2 in last dimension",
         lambda: F.grid_sample(input, torch.empty(1, 1, 1, 3), align_corners=False)),
        (RuntimeError, "expected input to have non-empty spatial dimensions",
         lambda: F.grid_sample(torch.empty(1, 1, 0, 2), grid, align_corners=False)),
        (RuntimeError, "bicubic interpolation only supports 4D input",
         lambda: F.grid_sample(torch.empty(1, 1, 2, 2, 2), torch.empty(1, 1, 1, 1, 3), mode='bicubic')),
    ]
    for exc_type, pattern, bad_call in failing_calls:
        with self.assertRaisesRegex(exc_type, pattern):
            bad_call()

    if TEST_CUDA:
        # CUDA input with a CPU grid.
        with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"):
            F.grid_sample(input.cuda(), grid, align_corners=False)
| |
| def test_affine_grid_error_checking(self): |
| # 2D affine |
| theta = torch.empty(1, 2, 3, dtype=torch.double) |
| size = torch.Size([1, 1, 2, 2]) |
| |
| # assert no error |
| F.affine_grid(theta, size, align_corners=False) |
| |
| # check for warning for empty span along dimension |
| with warnings.catch_warnings(record=True) as w: |
| # Ensure warnings are being shown |
| warnings.simplefilter("always") |
| # Should not trigger warning |
| F.affine_grid(theta, torch.Size([1, 1, 2, 1]), align_corners=False) |
| # Check no warning occurs |
| self.assertNotIn('See the documentation of affine_grid for details.', ' '.join(map(str, w))) |
| # Should trigger warning |
| F.affine_grid(theta, torch.Size([1, 1, 2, 1]), align_corners=True) |
| # Check warning occurs |
| self.assertIn('See the documentation of affine_grid for details.', ' '.join(map(str, w))) |
| |
| with self.assertRaisesRegex(ValueError, "Expected theta to have floating point type"): |
| F.affine_grid(theta.int(), size, align_corners=False) |
| |
| with self.assertRaisesRegex(ValueError, "Expected a batch of 2D affine matrices of shape Nx2x3"): |
| F.affine_grid(theta[0], size, align_corners=False) |
| |
| with self.assertRaisesRegex(ValueError, "Expected a batch of 2D affine matrices of shape Nx2x3"): |
| F.affine_grid(theta.unsqueeze(0), size, align_corners=False) |
| |
| with self.assertRaisesRegex(ValueError, "Expected a batch of 2D affine matrices of shape Nx2x3"): |
| F.affine_grid(theta.repeat(1, 2, 1), size, align_corners=False) |
| |
| with self.assertRaisesRegex(ValueError, "Expected a batch of 2D affine matrices of shape Nx2x3"): |
| F.affine_grid(theta.repeat(1, 1, 2), size, align_corners=False) |
| |
| # 3D affine |
| theta = torch.empty(1, 3, 4, dtype=torch.double) |
| size = torch.Size([1, 1, 2, 2, 2]) |
| |
| # assert no error |
| F.affine_grid(theta, size, align_corners=False) |
| |
| # check for warning for empty span along dimension |
| with warnings.catch_warnings(record=True) as w: |
| # Ensure warnings are being shown |
| warnings.simplefilter("always") |
| # Should not trigger warning |
| F.affine_grid(theta, torch.Size([1, 1, 3, 2, 1]), align_corners=False) |
| # Check no warning occurs |
| self.assertNotIn('See the documentation of affine_grid for details.', ' '.join(map(str, w))) |
| # Should trigger warning |
| F.affine_grid(theta, torch.Size([1, 1, 3, 2, 1]), align_corners=True) |
| # Check warning occurs |
| self.assertIn('See the documentation of affine_grid for details.', ' '.join(map(str, w))) |
| |
| with self.assertRaisesRegex(ValueError, "Expected a batch of 3D affine matrices of shape Nx3x4"): |
| F.affine_grid(theta[0], size, align_corners=False) |
| |
| with self.assertRaisesRegex(ValueError, "Expected a batch of 3D affine matrices of shape Nx3x4"): |
| F.affine_grid(theta.unsqueeze(0), size, align_corners=False) |
| |
| with self.assertRaisesRegex(ValueError, "Expected a batch of 3D affine matrices of shape Nx3x4"): |
| F.affine_grid(theta.repeat(1, 2, 1), size, align_corners=False) |
| |
| with self.assertRaisesRegex(ValueError, "Expected a batch of 3D affine matrices of shape Nx3x4"): |
| F.affine_grid(theta.repeat(1, 1, 2), size, align_corners=False) |
| |
| with self.assertRaisesRegex(NotImplementedError, "affine_grid only supports 4D and 5D sizes"): |
| F.affine_grid(theta, torch.Size([1, 2, 2]), align_corners=False) |
| |
| with self.assertRaisesRegex(NotImplementedError, "affine_grid only supports 4D and 5D sizes"): |
| F.affine_grid(theta, torch.Size([1, 1, 2, 2, 2, 2]), align_corners=False) |
| |
@parametrize_test('device', ['cpu'] + (['cuda'] if TEST_CUDA else []))
@parametrize_test('nd', [2, 3])
def test_affine_grid_backward_cl_cf_consistency(self, device, nd):
    """affine_grid backward must give the same theta grad for both grad layouts.

    Backpropagates the same upstream gradient twice through one affine_grid
    output: once made contiguous in channels_last (or channels_last_3d for
    nd == 3) memory format, once in its original layout. The two resulting
    theta gradients are compared.
    """
    # Test based on reported issue: https://github.com/pytorch/pytorch/issues/124154

    theta = torch.rand([6, nd, nd + 1], requires_grad=True, device=device)
    size = [6, 3, 4, 5] if nd == 2 else [6, 3, 4, 5, 5]
    grid = torch.nn.functional.affine_grid(theta, size, align_corners=False)

    grad_tensor = torch.rand(grid.shape, device=device)

    memory_format_cl = torch.channels_last if nd == 2 else torch.channels_last_3d
    grad_tensor_cl = grad_tensor.contiguous(memory_format=memory_format_cl)

    # A unittest assertion, unlike a bare `assert`, is still checked under `python -O`.
    self.assertIsNone(theta.grad)
    grid.backward(grad_tensor_cl)
    theta_grad_cl = theta.grad.clone().contiguous()

    # Reset the accumulated grad so the second backward starts from zero.
    theta.grad.zero_()
    grid.backward(grad_tensor)
    theta_grad_cf = theta.grad

    self.assertEqual(theta_grad_cf, theta_grad_cl)
| |
| @set_default_dtype(torch.double) |
| def test_grid_sample(self): |
| # Backward pass of native C++ and CUDA kernels branch depending on whether input requires gradient, |
| # so we test both cases. |
| def test(N, C, H, W, mode, padding_mode, align_corners, input_requires_grad): |
| def test_shape(N, C, IH, IW, H, W, mode, padding_mode, align_corners): |
| for grid_dim_contig_order in [(0, 1, 2, 3), (0, 3, 1, 2), (3, 0, 1, 2), (0, 2, 1, 3)]: |
| # grid_dim_contig_order specifies the dimension order that can |
| # make grid to be contiguous. |
| # i.e., grid.permute(grid_dim_contig_order) is contiguous. |
| # e.g., with grid_dim_contig_order=[0, 3, 1, 2], grid should be |
| # initialized with contiguous tensor of shape [N, 2, H, W] |
| # and permuted to [N, H, W, 2] afterwards. |
| grid_shape = [N, H, W, 2] |
| grid_init_shape = [grid_shape[d] for d in grid_dim_contig_order] |
| grid_fwd_permute = [None, None, None, None] |
| for i, d in enumerate(grid_dim_contig_order): |
| grid_fwd_permute[d] = i |
| |
| def get_grid(device='cpu', data=None): |
| if data is not None: |
| assert list(data.shape) == grid_shape |
| data = data.permute(grid_dim_contig_order).to(device) |
| else: |
| data = torch.randn(grid_init_shape, device=device) |
| grid = data.permute(grid_fwd_permute) |
| assert grid.permute(grid_dim_contig_order).is_contiguous() |
| return grid |
| |
| input_cpu = torch.randn(C, N, IH, IW).transpose(0, 1).requires_grad_(input_requires_grad) |
| grid_cpu = get_grid().requires_grad_() |
| out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode, |
| align_corners=align_corners) |
| self.assertTrue(out_cpu.size() == torch.Size([N, C, H, W])) |
| |
| gradients = torch.randn_like(out_cpu) |
| out_cpu.backward(gradients) |
| |
| |
| # Compare against unvectorized CPU fallback |
| |
| # NOTE [ grid_sample CPU fallback ] |
| # grid_sample uses AVX for 2d images, but that requires 32-bit indexing for |
| # 32-bit floats. So we also have a fallback that is used only for float tensors |
| # requiring 64-bit indexing. That requires too much memory to run on CI, so we |
| # also export the fallback and test it here to ensure feature parity with |
| # the vectorized version. |
| input_fallback = input_cpu.float().detach_().requires_grad_() |
| grid_fallback = grid_cpu.float().detach_().requires_grad_() |
| out_fallback = torch._grid_sampler_2d_cpu_fallback( |
| input_fallback, grid_fallback, |
| F.GRID_SAMPLE_PADDING_MODES[padding_mode], |
| align_corners) |
| self.assertEqual(out_fallback, out_cpu.float(), atol=1e-5, rtol=5e-5) |
| |
| out_fallback.backward(gradients.float()) |
| if input_requires_grad: |
| self.assertEqual(input_fallback.grad, input_cpu.grad.float(), atol=1e-4, rtol=5e-5) |
| self.assertEqual(grid_fallback.grad, grid_cpu.grad.float(), atol=1e-4, rtol=5e-5) |
| |
| if TEST_CUDA: |
| input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_(input_requires_grad) |
| grid_cuda = get_grid('cuda', grid_cpu.detach()).requires_grad_() |
| out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode, |
| align_corners=align_corners) |
| self.assertEqual(out_cpu, out_cuda) |
| |
| out_cuda.backward(gradients.cuda()) |
| if input_requires_grad: |
| self.assertEqual(input_cpu.grad, input_cuda.grad) |
| self.assertEqual(grid_cpu.grad, grid_cuda.grad, atol=5e-5, rtol=0) |
| |
| # check that zero-dimensional input strides don't error out |
| base_input = torch.randn(N, C, 1, IW) |
| input_cpu = base_input.expand_as(input_cuda).requires_grad_(input_requires_grad) |
| out_cpu = F.grid_sample(input_cpu, grid_cpu, mode=mode, padding_mode=padding_mode, |
| align_corners=align_corners) |
| |
| input_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_(input_requires_grad) |
| out_cuda = F.grid_sample(input_cuda, grid_cuda, mode=mode, padding_mode=padding_mode, |
| align_corners=align_corners) |
| self.assertEqual(out_cpu, out_cuda) |
| |
| # test same size output |
| test_shape(N, C, H, W, H, W, mode, padding_mode, align_corners) |
| |
| # test larger output |
| N = random.randint(2, 8) |
| C = random.randint(2, 8) |
| IH = random.randint(2, 8) |
| IW = random.randint(2, 8) |
| H = random.randint(IH + 1, 12) |
| W = random.randint(IW + 1, 12) |
| test_shape(N, C, IH, IW, H, W, mode, padding_mode, align_corners) |
| |
| # test smaller output |
| N = random.randint(2, 8) |
| C = random.randint(2, 8) |
| IH = random.randint(2, 8) |
| IW = random.randint(2, 8) |
| H = random.randint(2, IH) |
| W = random.randint(2, IW) |
| test_shape(N, C, IH, IW, H, W, mode, padding_mode, align_corners) |
| |
| # test 1x1 input |
| N = random.randint(2, 8) |
| C = random.randint(2, 8) |
| IH = 1 |
| IW = 1 |
| H = random.randint(2, 5) |
| W = random.randint(2, 5) |
| test_shape(N, C, IH, IW, H, W, mode, padding_mode, align_corners) |
| |
| # testing empty grid |
| N = random.randint(2, 8) |
| C = random.randint(2, 8) |
| IH = random.randint(2, 8) |
| IW = random.randint(2, 8) |
| W = random.randint(3, IW + 2) |
| test_shape(N, C, IH, IW, 0, W, mode, padding_mode, align_corners) |
| |
| # testing empty channel |
| N = random.randint(2, 8) |
| IH = random.randint(2, 8) |
| IW = random.randint(2, 8) |
| H = random.randint(3, IH + 2) |
| W = random.randint(3, IW + 2) |
| test_shape(N, 0, IH, IW, H, W, mode, padding_mode, align_corners) |
| |
| # testing empty batch |
| C = random.randint(2, 8) |
| IH = random.randint(2, 8) |
| IW = random.randint(2, 8) |
| H = random.randint(3, IH + 2) |
| W = random.randint(3, IW + 2) |
| test_shape(0, C, IH, IW, H, W, mode, padding_mode, align_corners) |
| |
| for mode in ('bilinear', 'nearest', 'bicubic'): |
| for padding_mode in ('zeros', 'border', 'reflection'): |
| for align_corners in (True, False): |
| # test known input on CPU |
| input = torch.arange(1., 11).view(1, 1, 2, 5) |
| grid = torch.tensor( |
| [[[-0.9, -4.1], [0, 0.2000], [1, -1], [-0.333, 1e-6], [0.5, 1.0]], |
| [[-1.0, -0.5], [0, 0.3333], [1, -1], [-0.200, 1e-6], [1.5, 0.5]]]).view(1, 2, 5, 2) |
| if mode == 'bilinear': |
| if padding_mode == 'zeros': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[0.0000, 6.0000000000, 5.0000, 4.8340, 9.0000], |
| [2.2500, 6.3332500450, 5.0000, 5.1000, 0.0000]]).view(1, 1, 2, 5) |
| else: |
| groundtruth = torch.tensor( |
| [[0.0000, 6.5000000000, 1.2500, 4.6675000191, 4.6250], |
| [0.5000, 7.1665000916, 1.2500, 5.0000000000, 0.0000]]).view(1, 1, 2, 5) |
| elif padding_mode == 'border': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[1.2000, 6.0000000000, 5.0000, 4.8340, 9.0000], |
| [2.2500, 6.3332500450, 5.0000, 5.1000, 8.7500]]).view(1, 1, 2, 5) |
| else: |
| groundtruth = torch.tensor( |
| [[1.0000, 6.5000000000, 5.0000, 4.6675000191, 9.2500], |
| [1.0000, 7.1665000916, 5.0000, 5.0000000000, 10.0000]]).view(1, 1, 2, 5) |
| elif padding_mode == 'reflection': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[3.4500, 6.0000000000, 5.0000, 4.8340, 9.0000], |
| [2.2500, 6.3332500450, 5.0000, 5.1000, 7.7500]]).view(1, 1, 2, 5) |
| else: |
| groundtruth = torch.tensor( |
| [[3.0000004768, 6.5000000000, 5.0000, 4.6675000191, 9.2500], |
| [1.0000000000, 7.1665000916, 5.0000, 5.0000000000, 9.2500]]).view(1, 1, 2, 5) |
| else: |
| raise AssertionError(f"missing groundtruth test for padding mode '{padding_mode}'") |
| elif mode == 'nearest': |
| if padding_mode == 'zeros': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[0., 8., 5., 7., 9.], |
| [1., 8., 5., 8., 0.]]).view(1, 1, 2, 5) |
| else: |
| groundtruth = torch.tensor( |
| [[0., 8., 5., 7., 0.], |
| [1., 8., 5., 8., 0.]]).view(1, 1, 2, 5) |
| elif padding_mode == 'border': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[1., 8., 5., 7., 9.], |
| [1., 8., 5., 8., 10.]]).view(1, 1, 2, 5) |
| else: |
| groundtruth = torch.tensor( |
| [[1., 8., 5., 7., 9.], |
| [1., 8., 5., 8., 10.]]).view(1, 1, 2, 5) |
| elif padding_mode == 'reflection': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[1., 8., 5., 7., 9.], |
| [1., 8., 5., 8., 9.]]).view(1, 1, 2, 5) |
| else: |
| groundtruth = torch.tensor( |
| [[1., 8., 5., 7., 9.], |
| [1., 8., 5., 8., 9.]]).view(1, 1, 2, 5) |
| else: |
| raise AssertionError(f"missing groundtruth test for padding mode '{padding_mode}'") |
| elif mode == 'bicubic': |
| if padding_mode == 'zeros': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[-0.10424726, 7.1400003, 5.0000, 5.7842274, 9.0000], |
| [2.4492188, 7.4814040, 5.0000, 6.0277520, 0.0000]]).view(1, 1, 2, 5) |
| else: |
| groundtruth = torch.tensor( |
| [[0.00000, 7.6287503, 1.0625, 5.5977230, 5.3270264], |
| [0.40625, 8.0288770, 1.0625, 5.9375067, -0.3515625]]).view(1, 1, 2, 5) |
| elif padding_mode == 'border': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[1.1520010, 6.0599990, 5.0000, 4.870930, 9.0000000], |
| [2.1328125, 6.4258375, 5.0000, 5.076003, 8.8671875]]).view(1, 1, 2, 5) |
| else: |
| groundtruth = torch.tensor( |
| [[0.894531, 6.6050020, 4.625, 4.7138715, 9.800781], |
| [0.906250, 7.2822485, 4.625, 5.0000052, 10.00000]]).view(1, 1, 2, 5) |
| elif padding_mode == 'reflection': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[3.1822524, 6.239998, 5.0000, 4.8709273, 9.00000], |
| [1.7812500, 6.703594, 5.0000, 5.0760007, 8.21875]]).view(1, 1, 2, 5) |
| else: |
| groundtruth = torch.tensor( |
| [[2.7993753, 6.6050020, 4.25, 4.7138715, 10.269531], |
| [0.8125000, 7.2822485, 4.25, 5.0000052, 9.332031]]).view(1, 1, 2, 5) |
| else: |
| raise AssertionError(f"missing groundtruth test for padding mode '{padding_mode}'") |
| |
| else: |
| raise AssertionError(f"missing groundtruth test for interpolation mode '{mode}'") |
| output = F.grid_sample(input, grid, mode=mode, padding_mode=padding_mode, |
| align_corners=align_corners) |
| self.assertEqual(output, groundtruth, atol=1e-5, rtol=0, |
| msg=f"groundtruth comparison failed for mode={mode}, " |
| f"padding_mode={padding_mode}") |
| |
| # See NOTE [ grid_sample CPU fallback ] |
| output = torch._grid_sampler_2d_cpu_fallback( |
| input.float(), grid.float(), |
| F.GRID_SAMPLE_PADDING_MODES[padding_mode], |
| align_corners) |
| self.assertEqual(output, groundtruth.float(), atol=1e-5, rtol=0) |
| |
| # explicit check for gradient edge cases |
| input = torch.arange(0., 5).expand((1, 1, 5, 5)) |
| grid = torch.tensor( |
| [[[1.0, 1.0], [1.0, -1.0], [0.8, 0.8], [0.8, -0.8]], |
| [[-1.0, -1.0], [-1.0, 1.0], [-0.8, -0.8], [-0.8, 0.8]]]).view(1, 2, 4, 2).requires_grad_() |
| if mode == 'bilinear': |
| if padding_mode == 'zeros': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[[[-8., -8.], [-8., 0.], [2., 0.], [2., 0.]], |
| [[2., 0.], [2., 0.], [2., 0.], [2., 0.]]]]).view(1, 2, 4, 2) |
| else: |
| groundtruth = torch.tensor( |
| [[[[-5., -5.], [-5., 5.], [-10., -10.], [-10., 10.]], |
| [[0., 0.], [0., 0.], [0., 0.], [0., 0.]]]]).view(1, 2, 4, 2) |
| elif padding_mode == 'border': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[[[-0., -0.], [-0., 0.], [2., 0.], [2., 0.]], |
| [[0., 0.], [0., 0.], [2., 0.], [2., 0.]]]]).view(1, 2, 4, 2) |
| else: |
| groundtruth = torch.tensor( |
| [[[[-0., -0.], [-0., 0.], [-0., -0.], [-0., 0.]], |
| [[0., 0.], [0., 0.], [0., 0.], [0., 0.]]]]).view(1, 2, 4, 2) |
| elif padding_mode == 'reflection': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[[[-0., -0.], [-0., 0.], [2., 0.], [2., 0.]], |
| [[0., 0.], [0., 0.], [2., 0.], [2., 0.]]]]).view(1, 2, 4, 2) |
| else: |
| groundtruth = torch.tensor( |
| [[[[-0., -0.], [-0., 0.], [-0., -0.], [-0., 0.]], |
| [[0., 0.], [0., 0.], [0., 0.], [0., 0.]]]]).view(1, 2, 4, 2) |
| else: |
| raise AssertionError(f"missing gradient groundtruth test for padding mode '{padding_mode}'") |
| elif mode == 'nearest': |
| groundtruth = torch.tensor( |
| [[[[-0., -0.], [-0., 0.], [-0., -0.], [-0., 0.]], |
| [[0., 0.], [0., 0.], [0., 0.], [0., 0.]]]]).view(1, 2, 4, 2) |
| elif mode == 'bicubic': |
| if padding_mode == 'zeros': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[[[-4.5, -6.], [-4.5, 6.], [2.725679, 0.740878], [2.725679, -0.740878]], |
| [[1.5, 0.], [1.5, 0.], [1.927921, -0.05688], [1.927921, 0.05688]]]]).view(1, 2, 4, 2) |
| else: |
| groundtruth = torch.tensor( |
| [[[[-5.859375, -5.888672], [-5.859375, 5.888672], [-5.6250, -7.5000], [-5.6250, 7.5000]], |
| [[-0.234375, -0.263672], [-0.234375, 0.263672], [1.8750, 0.], [1.8750, 0.]]]] |
| ).view(1, 2, 4, 2) |
| elif padding_mode == 'border': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[[[1.5, 0.], [1.5, 0.], [1.74, 0.], [1.74, 0.]], |
| [[1.5, 0.], [1.5, 0.], [1.74, 0.], [1.74, 0.]]]]).view(1, 2, 4, 2) |
| else: |
| groundtruth = torch.tensor( |
| [[[[-0.46875, 0.], [-0.46875, 0.], [1.8750, 0.], [1.8750, 0.]], |
| [[-0.46875, 0.], [-0.46875, 0.], [1.8750, 0.], [1.8750, 0.]]]]).view(1, 2, 4, 2) |
| elif padding_mode == 'reflection': |
| if align_corners: |
| groundtruth = torch.tensor( |
| [[[[0., 0.], [0., 0.], [1.92, 0.], [1.92, 0.]], |
| [[0., 0.], [0., 0.], [1.92, 0.], [1.92, 0.]]]]).view(1, 2, 4, 2) |
| else: |
| groundtruth = torch.tensor( |
| [[[[0., 0.], [0., 0.], [1.875, 0.], [1.875, 0.]], |
| [[0., 0.], [0., 0.], [1.875, 0.], [1.875, 0.]]]]).view(1, 2, 4, 2) |
| else: |
| raise AssertionError(f"missing gradient groundtruth test for padding mode '{padding_mode}'") |
| else: |
| raise AssertionError(f"missing gradient groundtruth test for interpolation mode '{mode}'") |
| for input_requires_grad in [False, True]: |
| input = input.requires_grad_(input_requires_grad) |
| F.grid_sample(input, grid, mode=mode, padding_mode=padding_mode, |
| align_corners=align_corners).sum().backward() |
| self.assertEqual(grid.grad, groundtruth, atol=1e-5, rtol=0, |
| msg=f"gradient groundtruth comparison failed for mode={mode}, " |
| f"padding_mode={padding_mode}, input_requires_grad={input_requires_grad}") |
| grid.grad.zero_() |
| |
| # See NOTE [ grid_sample CPU fallback ] |
| torch._grid_sampler_2d_cpu_fallback( |
| input.float(), grid.float(), |
| F.GRID_SAMPLE_PADDING_MODES[padding_mode], |
| align_corners).sum().backward() |
| self.assertEqual(grid.grad, groundtruth, atol=1e-5, rtol=0) |
| |
| # do gradcheck |
| N = random.randint(2, 8) |
| C = random.randint(2, 6) |
| H = random.randint(2, 8) |
| W = random.randint(2, 8) |
| input = torch.randn(N, C, H, W, requires_grad=True) |
| grid = torch.randn(N, H, W, 2, requires_grad=True) |
| |
| for input_requires_grad in [False, True]: |
| input.requires_grad_(input_requires_grad) |
| self.assertTrue(gradcheck( |
| lambda inp, grd: F.grid_sample(inp, grd, mode=mode, padding_mode=padding_mode, |
| align_corners=align_corners), |
| (input, grid))) |
| test(N, C, H, W, mode, padding_mode, align_corners, input_requires_grad) |
| if TEST_CUDNN: |
| with cudnn.flags(enabled=False): |
| test(N, C, H, W, mode, padding_mode, align_corners, input_requires_grad) |
| |
@set_default_dtype(torch.double)
def test_grid_sample_3d(self):
    # Exercises volumetric (5-D input) F.grid_sample: gradcheck, output shapes and,
    # when CUDA is available, CPU/CUDA agreement of outputs and gradients.
    #
    # The backward pass of the native C++ and CUDA kernels branches on whether the
    # input requires gradient, so every configuration is run with both settings.
    def test(N, C, D, H, W, mode, padding_mode, align_corners, input_requires_grad):
        def test_shape(N, C, ID, IH, IW, D, H, W, mode, padding_mode, align_corners):
            sample = partial(F.grid_sample, mode=mode, padding_mode=padding_mode,
                             align_corners=align_corners)

            # Operands are transposed views of (C, N, ...) / (D, N, ...) tensors.
            input_cpu = torch.randn(C, N, ID, IH, IW).transpose(0, 1).requires_grad_(input_requires_grad)
            grid_cpu = torch.randn(D, N, H, W, 3).transpose(0, 1).requires_grad_()
            out_cpu = sample(input_cpu, grid_cpu)
            self.assertTrue(out_cpu.size() == torch.Size([N, C, D, H, W]))

            gradients = torch.randn_like(out_cpu)
            out_cpu.backward(gradients)

            # Everything below compares against CUDA results.
            if not TEST_CUDA:
                return

            # Rebuild the same transposed layout on the GPU.
            input_cuda = input_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_(input_requires_grad)
            grid_cuda = grid_cpu.detach().transpose(0, 1).cuda().transpose(0, 1).requires_grad_()
            out_cuda = sample(input_cuda, grid_cuda)
            self.assertEqual(out_cpu, out_cuda)

            out_cuda.backward(gradients.cuda())
            if input_requires_grad:
                self.assertEqual(input_cpu.grad, input_cuda.grad)
            self.assertEqual(grid_cpu.grad, grid_cuda.grad, atol=5e-5, rtol=0)

            # check that zero-dimensional input strides don't error out
            base_input = torch.randn(N, C, 1, IH, IW)
            expanded_cpu = base_input.expand_as(input_cuda).requires_grad_(input_requires_grad)
            dense_grid_cpu = torch.randn(N, D, H, W, 3, requires_grad=True)
            out_cpu = sample(expanded_cpu, dense_grid_cpu)

            expanded_cuda = base_input.cuda().expand_as(input_cuda).requires_grad_(input_requires_grad)
            dense_grid_cuda = dense_grid_cpu.detach().cuda().requires_grad_()
            out_cuda = sample(expanded_cuda, dense_grid_cuda)
            self.assertEqual(out_cpu, out_cuda)

        draw = random.randint

        # test same size output
        test_shape(N, C, D, H, W, D, H, W, mode, padding_mode, align_corners)

        # test larger output
        N, C = draw(2, 7), draw(2, 5)
        ID, IH, IW = draw(2, 7), draw(2, 7), draw(2, 7)
        D, H, W = draw(ID + 1, 10), draw(IH + 1, 10), draw(IW + 1, 10)
        test_shape(N, C, ID, IH, IW, D, H, W, mode, padding_mode, align_corners)

        # test smaller output
        N, C = draw(2, 7), draw(2, 5)
        ID, IH, IW = draw(2, 7), draw(2, 7), draw(2, 7)
        D, H, W = draw(2, ID), draw(2, IH), draw(2, IW)
        test_shape(N, C, ID, IH, IW, D, H, W, mode, padding_mode, align_corners)

        # test 1x1 input (D is not redrawn here; it keeps the value from the previous case)
        N, C = draw(2, 7), draw(2, 7)
        ID = IH = IW = 1
        H, W = draw(2, 5), draw(2, 5)
        test_shape(N, C, ID, IH, IW, D, H, W, mode, padding_mode, align_corners)

        # testing empty grid (output height of 0)
        N, C = draw(2, 7), draw(2, 5)
        ID, IH, IW = draw(2, 7), draw(2, 7), draw(2, 7)
        D = draw(3, ID + 2)
        W = draw(3, IW + 2)
        test_shape(N, C, ID, IH, IW, D, 0, W, mode, padding_mode, align_corners)

        # testing empty channel
        N = draw(2, 7)
        ID = draw(2, 5)
        IH, IW = draw(2, 7), draw(2, 7)
        D, H, W = draw(3, ID + 2), draw(3, IH + 2), draw(3, IW + 2)
        test_shape(N, 0, ID, IH, IW, D, H, W, mode, padding_mode, align_corners)

        # testing empty batch
        C = draw(2, 5)
        ID, IH, IW = draw(2, 7), draw(2, 7), draw(2, 7)
        D, H, W = draw(3, ID + 2), draw(3, IH + 2), draw(3, IW + 2)
        test_shape(0, C, ID, IH, IW, D, H, W, mode, padding_mode, align_corners)

    configurations = product(('bilinear', 'nearest'),
                             ('zeros', 'border', 'reflection'),
                             (True, False))
    for mode, padding_mode, align_corners in configurations:
        sample = partial(F.grid_sample, mode=mode, padding_mode=padding_mode,
                         align_corners=align_corners)

        # do gradcheck
        N = random.randint(2, 5)
        C = random.randint(2, 4)
        D = random.randint(2, 5)
        H = random.randint(2, 5)
        W = random.randint(2, 5)
        volume = torch.randn(N, C, D, H, W, requires_grad=True)
        grid = torch.randn(N, D, H, W, 3, requires_grad=True)

        # First differentiate w.r.t. both operands ...
        self.assertTrue(gradcheck(sample, (volume, grid)))

        # ... then w.r.t. the grid only, with the input held constant.
        volume = volume.requires_grad_(False)
        self.assertTrue(gradcheck(partial(sample, volume), (grid,)))

        for input_requires_grad in [False, True]:
            test(N, C, D, H, W, mode, padding_mode, align_corners, input_requires_grad)
| |
| def test_grid_sample_nearest_neighbor_rounding_mode_consistency(self): |
| |
| device_list = ['cpu'] |
| if TEST_CUDA: |
| device_list.append('cuda') |
| |
| def normalize_indices(indices_unnormalized: torch.Tensor, dim_size: int, align_corners: bool): |
| if align_corners: |
| indices_normalized = 2 * indices_unnormalized / (dim_size - 1) - 1 |
| else: |
| indices_normalized = (indices_unnormalized * 2 + 1) / dim_size - 1 |
| return indices_normalized |
| |
| test_dim_size = 10 |
| non_test_dim_size = 9 |
| step_size = 0.1 |
| |
| batch_size = 1 |
| channel_size = 1 |
| |
| mode = 'nearest' |
| for device in device_list: |
| for padding_mode in ('zeros', 'border', 'reflection'): |
| for align_corners in (True, False): |
| # Unnormalized inquiry indices |
| inquiry_indices_unnormalized = torch.arange( |
| 0, |
| test_dim_size - 1 + step_size, step_size, |
| dtype=torch.float32, |
| device=device |
| ) |
| # Note that even though we are trying to create normalized indices |
| # which results in x.0 and x.5 indices after unnormalization, |
| # because of the numerical error, |
| # the rounding direction might not always be expected as designed. |
| # The best we could do is to ensure the rounding behaviors across |
| # different implementations for different dimensions are |
| # exactly the same. |
| inquiry_indices = normalize_indices( |
| indices_unnormalized=inquiry_indices_unnormalized, |
| dim_size=test_dim_size, |
| align_corners=align_corners |
| ) |
| num_inqueries = inquiry_indices.shape[0] |
| inquiry_fixed_indices = torch.full((num_inqueries,), 0.5, dtype=torch.float32, device=device) |
| array_data = torch.rand(test_dim_size, dtype=torch.float32, device=device) |
| # 2D grid sample x-dim interpolation |
| # The input_tensor_2d_x is of shape |
| # [batch_size, channel_size, non_test_dim_size, test_dim_size] |
| input_tensor_2d_x = array_data.reshape(1, test_dim_size).repeat( |
| batch_size, |
| channel_size, |
| non_test_dim_size, |
| 1 |
| ) |
| # The grid_tensor_2d_x is of shape |
| # [batch_size, 1, num_inqueries] |
| grid_tensor_2d_x = torch.cat( |
| tensors=( |
| inquiry_indices.reshape(num_inqueries, 1), |
| inquiry_fixed_indices.reshape(num_inqueries, 1), |
| ), |
| dim=1 |
| ).repeat(batch_size, 1, 1, 1) |
| # The output_tensor_2d_x is of shape |
| # [batch_size, channel_size, 1, num_inqueries] |
| output_tensor_2d_x = F.grid_sample( |
| input=input_tensor_2d_x, |
| grid=grid_tensor_2d_x, |
| mode=mode, |
| padding_mode=padding_mode, |
| align_corners=align_corners, |
| ) |
| # 2D grid sample y-dim interpolation |
| # The input_tensor_2d_y is of shape |
| # [batch_size, channel_size, test_dim_size, non_test_dim_size] |
| input_tensor_2d_y = torch.transpose(input_tensor_2d_x, 3, 2) |
| # The grid_tensor_2d_y is of shape |
| # [batch_size, 1, num_inqueries] |
| grid_tensor_2d_y = torch.index_select( |
| grid_tensor_2d_x, |
| -1, |
| torch.tensor([1, 0], dtype=torch.int64, device=device) |
| ) |
| # The output_tensor_2d_y is of shape |
| # [batch_size, channel_size, 1, num_inqueries] |
| output_tensor_2d_y = F.grid_sample( |
| input=input_tensor_2d_y, |
| grid=grid_tensor_2d_y, |
| mode=mode, |
| padding_mode=padding_mode, |
| align_corners=align_corners, |
| ) |
| self.assertEqual(output_tensor_2d_x[0, 0, 0, :], output_tensor_2d_y[0, 0, 0, :], atol=0, rtol=0) |
| # 3D grid sample x-dim interpolation |
| # The input_tensor_3d_x is of shape |
| # [batch_size, channel_size, non_test_dim_size, non_test_dim_size, test_dim_size] |
| input_tensor_3d_x = array_data.reshape(1, test_dim_size).repeat( |
| batch_size, channel_size, non_test_dim_size, non_test_dim_size, 1) |
| # The grid_tensor_3d_x is of shape |
| # [batch_size, 1, 1, num_inqueries] |
| grid_tensor_3d_x = torch.cat( |
| tensors=( |
| inquiry_indices.reshape(num_inqueries, 1), |
| inquiry_fixed_indices.reshape(num_inqueries, 1), |
| inquiry_fixed_indices.reshape(num_inqueries, 1), |
| ), |
| dim=1 |
| ).repeat(batch_size, 1, 1, 1, 1) |
| # The output_tensor_3d_x is of shape |
| # [batch_size, channel_size, 1, 1, num_inqueries] |
| output_tensor_3d_x = F.grid_sample( |
| input=input_tensor_3d_x, |
| grid=grid_tensor_3d_x, |
| mode=mode, |
| padding_mode=padding_mode, |
| align_corners=align_corners, |
| ) |
| self.assertEqual(output_tensor_2d_x[0, 0, 0, :], output_tensor_3d_x[0, 0, 0, 0, :], atol=0, rtol=0) |
| # 3D grid sample y-dim interpolation |
| # The input_tensor_3d_y is of shape |
| # [batch_size, channel_size, non_test_dim_size, test_dim_size, non_test_dim_size] |
| input_tensor_3d_y = torch.transpose(input_tensor_3d_x, 4, 3) |
| # The grid_tensor_3d_y is of shape |
| # [batch_size, 1, 1, num_inqueries] |
| grid_tensor_3d_y = torch.index_select( |
| grid_tensor_3d_x, |
| -1, |
| torch.tensor([1, 0, 2], dtype=torch.int64, device=device) |
| ) |
| # The output_tensor_3d_y is of shape |
| # [batch_size, channel_size, 1, 1, num_inqueries] |
| output_tensor_3d_y = F.grid_sample( |
| input=input_tensor_3d_y, |
| grid=grid_tensor_3d_y, |
| mode=mode, |
| padding_mode=padding_mode, |
| align_corners=align_corners, |
| ) |
| self.assertEqual(output_tensor_2d_x[0, 0, 0, :], output_tensor_3d_y[0, 0, 0, 0, :], atol=0, rtol=0) |
| # 3D grid sample z-dim interpolation |
| # The input_tensor_3d_z is of shape |
| # [batch_size, channel_size, non_test_dim_size, non_test_dim_size, test_dim_size] |
| input_tensor_3d_z = torch.transpose(input_tensor_3d_x, 4, 2) |
| # The grid_tensor_3d_z is of shape |
| # [batch_size, 1, 1, num_inqueries] |
| grid_tensor_3d_z = torch.index_select( |
| grid_tensor_3d_x, |
| -1, |
| torch.tensor([1, 2, 0], dtype=torch.int64, device=device) |
| ) |
| # The output_tensor_3d_z is of shape |
| # [batch_size, channel_size, 1, 1, num_inqueries] |
| output_tensor_3d_z = F.grid_sample( |
| input=input_tensor_3d_z, |
| grid=grid_tensor_3d_z, |
| mode=mode, |
| padding_mode=padding_mode, |
| align_corners=align_corners, |
| ) |
| self.assertEqual(output_tensor_2d_x[0, 0, 0, :], output_tensor_3d_z[0, 0, 0, 0, :], atol=0, rtol=0) |
| |
| @set_default_dtype(torch.double) |
| def test_affine_grid(self): |
| # test known input on CPU |
| input = torch.arange(1., 7).view(1, 2, 3) |
| output = F.affine_grid(input, torch.Size([1, 1, 2, 2]), align_corners=True) |
| groundtruth = torch.tensor( |
| [[[0., -3.], [2., 5.]], [[4., 7.], [6., 15.]]]).view(1, 2, 2, 2) |
| self.assertEqual(output, groundtruth) |
| output = F.affine_grid(input, torch.Size([1, 1, 2, 2]), align_corners=False) |
| groundtruth = torch.tensor( |
| [[[1.5, 1.5], [2.5, 5.5]], [[3.5, 6.5], [4.5, 10.5]]]).view(1, 2, 2, 2) |
| self.assertEqual(output, groundtruth) |
| |
| for align_corners in (True, False): |
| # do gradcheck |
| N = random.randint(1, 8) |
| C = random.randint(1, 8) |
| H = random.randint(1, 8) |
| W = random.randint(1, 8) |
| sz = torch.Size([N, C, H, W]) |
| inp = torch.randn(N, 2, 3, requires_grad=True) |
| with warnings.catch_warnings(record=True): |
| warnings.simplefilter("always") # python2 requires this so other tests can trigger |
| self.assertTrue(gradcheck( |
| lambda inp: F.affine_grid(inp, sz, align_corners=align_corners), |
| (inp,))) |
| |
| # test CPU against CUDA |
| if TEST_CUDA: |
| N = random.randint(1, 8) |
| C = random.randint(1, 8) |
| H = random.randint(1, 8) |
| W = random.randint(1, 8) |
| sz = torch.Size([N, C, H, W]) |
| for align_corners in (True, False): |
| input_cpu = torch.randn(N, 2, 3, requires_grad=True) |
| with warnings.catch_warnings(record=True): |
| warnings.simplefilter("always") # python2 requires this so other tests can trigger |
| out_cpu = F.affine_grid(input_cpu, sz, align_corners=align_corners) |
| gradients = torch.randn(out_cpu.size()) |
| out_cpu.backward(gradients) |
| input_gpu = input_cpu.detach().cuda().requires_grad_() |
| with warnings.catch_warnings(record=True): |
| warnings.simplefilter("always") # python2 requires this so other tests can trigger |
| out_cuda = F.affine_grid(input_gpu, sz, align_corners=align_corners) |
| out_cuda.backward(gradients.cuda()) |
| self.assertEqual(out_cpu, out_cuda) |
| self.assertEqual(input_cpu.grad, input_gpu.grad) |
| |
| @set_default_dtype(torch.double) |
| def test_affine_grid_3d(self): |
| # test known input on CPU |
| input = torch.arange(1., 13).view(1, 3, 4) |
| output = F.affine_grid(input, torch.Size([1, 1, 2, 2, 2]), align_corners=True) |
| groundtruth = torch.tensor( |
| [[[[[-2., -10., -18.], [0., 0., 0.]], [[2., 2., 2.], [4., 12., 20.]]], |
| [[[4., 4., 4.], [6., 14., 22.]], [[8., 16., 24.], [10., 26., 42.]]]]]).view(1, 2, 2, 2, 3) |
| self.assertEqual(output, groundtruth) |
| output = F.affine_grid(input, torch.Size([1, 1, 2, 2, 2]), align_corners=False) |
| groundtruth = torch.tensor( |
| [[[[[1., -1., -3.], [2., 4., 6.]], [[3., 5., 7.], [4., 10., 16.]]], |
| [[[4., 6., 8.], [5., 11., 17.]], [[6., 12., 18.], [7., 17., 27.]]]]]).view(1, 2, 2, 2, 3) |
| self.assertEqual(output, groundtruth) |
| |
| for align_corners in (True, False): |
| # do gradcheck |
| N = random.randint(1, 8) |
| C = random.randint(1, 8) |
| D = random.randint(1, 8) |
| H = random.randint(1, 8) |
| W = random.randint(1, 8) |
| sz = torch.Size([N, C, D, H, W]) |
| inp = torch.randn(N, 3, 4, requires_grad=True) |
| with warnings.catch_warnings(record=True): |
| warnings.simplefilter("always") # python2 requires this so other tests can trigger |
| self.assertTrue(gradcheck( |
| lambda inp: F.affine_grid(inp, sz, align_corners=align_corners), |
| (inp,))) |
| |
| # test CPU against CUDA |
| if TEST_CUDA: |
| N = random.randint(1, 8) |
| C = random.randint(1, 8) |
| D = random.randint(1, 8) |
| H = random.randint(1, 8) |
| W = random.randint(1, 8) |
| sz = torch.Size([N, C, D, H, W]) |
| for align_corners in (True, False): |
| input_cpu = torch.randn(N, 3, 4, requires_grad=True) |
| with warnings.catch_warnings(record=True): |
| warnings.simplefilter("always") # python2 requires this so other tests can trigger |
| out_cpu = F.affine_grid(input_cpu, sz, align_corners=align_corners) |
| gradients = torch.randn(out_cpu.size()) |
| out_cpu.backward(gradients) |
| input_gpu = input_cpu.detach().cuda().requires_grad_() |
| with warnings.catch_warnings(record=True): |
| warnings.simplefilter("always") # python2 requires this so other tests can trigger |
| out_cuda = F.affine_grid(input_gpu, sz, align_corners=align_corners) |
| out_cuda.backward(gradients.cuda()) |
| self.assertEqual(out_cpu, out_cuda) |
| self.assertEqual(input_cpu.grad, input_gpu.grad) |
| |
| def test_channel_shuffle_return_alias_of_self(self): |
| # gh-76616: nn.ChannelShuffle will return alias of self with an empty input tensor |
| groups = 3 |
| input_tensor = torch.rand([0, 9, 4, 4]) |
| output = torch.nn.ChannelShuffle(groups)(input_tensor) |
| torch.testing.assert_close(output, input_tensor) |
| |
| @skipIfTorchDynamo("TorchDynamo fails here for unknown reasons") |
| def test_native_channel_shuffle_return_alias_of_self(self): |
| groups = 3 |
| input_tensor = torch.rand([0, 9, 4, 4]) |
| output = torch.native_channel_shuffle(input_tensor, groups) |
| torch.testing.assert_close(output, input_tensor) |
| |
| @set_default_dtype(torch.double) |
| def test_upsamplingLinear1d(self): |
| for align_corners in [True, False]: |
| for recompute_scale_factor in [True, False]: |
| kwargs = dict( |
| mode='linear', align_corners=align_corners, recompute_scale_factor=recompute_scale_factor |
| ) |
| # test float scale factor up & downsampling |
| for scale_factor in [0.5, 1.5, 2]: |
| m = nn.Upsample(scale_factor=scale_factor, **kwargs) |
| in_t = torch.ones(1, 1, 2) |
| out_size = int(math.floor(in_t.shape[-1] * scale_factor)) |
| with warnings.catch_warnings(record=True) as w: |
| out_t = m(in_t) |
| self.assertEqual(torch.ones(1, 1, out_size), out_t.data) |
| |
| input = torch.randn(1, 1, 2, requires_grad=True) |
| if not recompute_scale_factor: |
| gradcheck(lambda x: F.interpolate(x, out_size, **kwargs), (input,)) |
| else: |
| gradcheck(lambda x: F.interpolate(x, scale_factor=scale_factor, **kwargs), (input,)) |
| |
| def test_upsamplingLinear1d_spatial_invariance(self): |
| m = nn.Upsample(scale_factor=3, mode='linear', align_corners=False) |
| in_t_9 = torch.zeros(1, 1, 9) |
| in_t_9[:, :, :4].normal_() |
| with warnings.catch_warnings(record=True) as w: |
| out_t_9 = m(in_t_9) |
| out_t_5 = m(in_t_9[:, :, :5]) |
| self.assertEqual(out_t_9[:, :, :15], out_t_5) |
| |
| @set_default_dtype(torch.double) |
| def test_upsampling_not_recompute_scale_factor(self): |
| # test output against known input: result must match opencv |
| in_t = torch.arange(8.).view(1, 2, 2, 2) |
| expected_out_t = torch.tensor( |
| [[[[-0.32725, -0.08843, 0.37933, 0.79744], |
| [0.15039, 0.38921, 0.85697, 1.27508], |
| [1.08591, 1.32473, 1.79249, 2.21060], |
| [1.92213, 2.16095, 2.62871, 3.04682]], |
| |
| [[3.67275, 3.91157, 4.37933, 4.79744], |
| [4.15039, 4.38921, 4.85697, 5.27508], |
| [5.08591, 5.32473, 5.79249, 6.21060], |
| [5.92213, 6.16095, 6.62871, 7.04682]]]]) |
| if IS_PPC: |
| # Both OpenCV and PyTorch give a slightly different result on PPC |
| expected_out_t = torch.tensor( |
| [[[[-0.32725, -0.08843, 0.37933, 0.79744], |
| [0.15039, 0.38921, 0.85697, 1.27508], |
| [1.08591, 1.32473, 1.79249, 2.21060], |
| [1.92212, 2.16094, 2.62870, 3.04681]], |
| |
| [[3.67275, 3.91157, 4.37933, 4.79743], |
| [4.15039, 4.38921, 4.85697, 5.27508], |
| [5.08591, 5.32473, 5.79249, 6.21059], |
| [5.92212, 6.16094, 6.62870, 7.04680]]]]) |
| out_t = F.interpolate(in_t, scale_factor=2.3, mode='bicubic', align_corners=False, recompute_scale_factor=False) |
| torch.set_printoptions(precision=5) |
| self.assertEqual(out_t, expected_out_t, atol=1e-4, rtol=0) |
| |
| device_list = ['cpu'] |
| if TEST_CUDA: |
| device_list.append('cuda') |
| |
| for align_corners in [True, False]: |
| kwargs = dict(mode='bicubic', align_corners=align_corners) |
| # test float scale factor up & downsampling |
| for device in device_list: |
| for scale_factor in [0.6, 1.6, 2.3]: |
| in_t = torch.ones(2, 2, 2, 2).to(device) |
| out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs) |
| out_size = int(math.floor(in_t.shape[-1] * scale_factor)) |
| self.assertEqual(torch.ones(2, 2, out_size, out_size), out_t.data, atol=1e-5, rtol=0) |
| |
| input = torch.randn(2, 2, 2, 2, requires_grad=True) |
| gradcheck(lambda x: F.interpolate(x, out_size, **kwargs), [input]) |
| |
| def test_upsamplingBilinear2d_spatial_invariance(self): |
| m = nn.Upsample(scale_factor=3, mode='bilinear', align_corners=False) |
| in_t_9 = torch.zeros(1, 1, 9, 9) |
| in_t_9[:, :, :4, :4].normal_() |
| with warnings.catch_warnings(record=True) as w: |
| out_t_9 = m(in_t_9) |
| out_t_5 = m(in_t_9[:, :, :5, :5]) |
| self.assertEqual(out_t_9[:, :, :15, :15], out_t_5) |
| |
| def test_upsamplingTrilinear3d_spatial_invariance(self): |
| m = nn.Upsample(scale_factor=3, mode='trilinear', align_corners=False) |
| in_t_9 = torch.zeros(1, 1, 9, 9, 9) |
| in_t_9[:, :, :4, :4, :4].normal_() |
| with warnings.catch_warnings(record=True) as w: |
| out_t_9 = m(in_t_9) |
| out_t_5 = m(in_t_9[:, :, :5, :5, :5]) |
| self.assertEqual(out_t_9[:, :, :15, :15, :15], out_t_5) |
| |
| def test_upsampling_small_scale(self): |
| m = torch.nn.Upsample(scale_factor=0.5, mode="bilinear") |
| in_t = torch.arange(1, 5, dtype=torch.get_default_dtype()).reshape(1, 1, 2, 2) |
| out_t = m(in_t) |
| expected_out_t = torch.tensor([[[[2.5]]]]) |
| self.assertEqual(expected_out_t, out_t) |
| |
def test_upsampling_bfloat16(self, dtype=torch.bfloat16):
    # Runs nn.Upsample forward and backward in ``dtype`` and in float32 on the
    # same data and requires the results to agree within loose tolerances.
    def helper(size, scale_factor, mode, device, memory_format=torch.contiguous_format):
        low = torch.randn(size, device=device, dtype=dtype)
        low = low.to(memory_format=memory_format).detach().requires_grad_(True)
        # float32 reference copy, always in contiguous layout
        ref = low.to(torch.float32).to(memory_format=torch.contiguous_format).detach().requires_grad_(True)
        upsample = nn.Upsample(scale_factor=scale_factor, mode=mode)

        out_ref = upsample(ref)
        out_low = upsample(low)
        self.assertEqual(out_low.to(torch.float32), out_ref, atol=0.05, rtol=0)

        grad_low = torch.randn(out_low.shape, device=device, dtype=dtype).to(memory_format=memory_format)
        grad_ref = grad_low.to(torch.float32).to(memory_format=torch.contiguous_format)
        out_low.backward(grad_low)
        out_ref.backward(grad_ref)
        self.assertEqual(low.grad.to(torch.float32), ref.grad, atol=0.01, rtol=0.01)

    # Each case is (size, scale_factor, mode[, memory_format]).
    doubling_cases = [
        ([3, 20, 11, 7], 2, 'nearest'),
        ([3, 20, 11, 7], 2, 'nearest', torch.channels_last),
        ([3, 20, 11, 7, 3], 2, 'nearest'),
        ([3, 20, 30], 2, 'linear'),
        ([3, 20, 11, 7], 2, 'bilinear'),
        ([3, 20, 11, 7], 2, 'bilinear', torch.channels_last),
        ([1, 3, 11, 7], 2, 'bicubic'),
        ([1, 3, 11, 7], 2, 'bicubic', torch.channels_last),
        ([3, 20, 11, 7, 3], 2, 'trilinear'),
    ]
    # Much larger (or much smaller) scale factors.
    extreme_scale_cases = [
        ([3, 5, 5], 257., 'nearest'),
        ([3, 20, 11, 7], 20, 'nearest'),
        ([3, 20, 11, 7, 3], 20, 'nearest'),
        ([1, 2, 11, 7], 257, 'nearest', torch.channels_last),
        ([1, 2, 2000, 2000], 1 / 377., 'nearest'),
        ([1, 2, 2000, 2000], 1 / 257., 'nearest', torch.channels_last),
        ([3, 2, 11, 7, 3], 20, 'nearest', torch.channels_last_3d),
        ([3, 5, 5], 10, 'linear'),
        ([3, 5, 5], 257, 'linear'),
        ([1, 2, 11, 7], 257, 'bilinear'),
        ([1, 2, 11, 7], 257, 'bilinear', torch.channels_last),
        ([1, 3, 11, 7], 10, 'bicubic'),
        ([1, 3, 11, 7], 10, 'bicubic', torch.channels_last),
        ([1, 1, 11, 7], 257, 'bicubic'),
        ([3, 2, 11, 7, 3], 20, 'trilinear'),
        ([3, 2, 11, 7, 3], 20, 'trilinear', torch.channels_last_3d),
    ]

    for device in ['cpu']:
        for size, scale_factor, mode, *layout in doubling_cases + extreme_scale_cases:
            helper(size, scale_factor, mode, device, *layout)
| |
@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
def test_interpolate_illegal_memory_access(self):
    # Nearest-neighbor 1-D interpolation on CUDA must produce the same output and
    # input gradient as the CPU reference when grad_output is a slice of a
    # larger buffer.
    in_s, out_s = 45, 14
    shrink = partial(F.interpolate, size=(out_s,), mode='nearest')

    input = torch.ones((1, 1, in_s), device='cuda', requires_grad=True)
    # note we allocated grad_output to be larger so out of bound access
    # would be visible in grad_input
    oversized_grad = torch.ones((1, 1, out_s * 2), device='cuda', requires_grad=True)
    grad = oversized_grad[:, :, :out_s]

    # CPU copies of the same operands serve as the reference.
    input_ref = input.detach().cpu().requires_grad_()
    grad_ref = grad.cpu()

    out = shrink(input)
    out.backward(grad)

    out_ref = shrink(input_ref)
    out_ref.backward(grad_ref)

    self.assertEqual(out_ref, out)
    self.assertEqual(input_ref.grad, input.grad)
| |
| def test_interpolate_undefined_behavior_casting(self): |
| x = torch.ones([1, 1, 16, 16]) |
| self.assertRaises(RuntimeError, lambda: F.interpolate(x, scale_factor=-1e20, mode="bilinear")) |
| self.assertRaises(RuntimeError, lambda: F.interpolate(x, scale_factor=1e20, mode="bilinear")) |
| |
def test_interpolate_buffer_overflow(self):
    # Test buffer overflow issue due to inaccurate floating point
    # representation for integer values. See issue below for details.
    # https://github.com/pytorch/pytorch/issues/88939

    def helper(size, dtype, mode, device, is_channels_last):
        rank = len(size)
        probe = torch.ones(size, dtype=dtype, device=device)
        if is_channels_last and rank == 3:
            # 3-D tensors have no channels_last memory format; emulate it with strides.
            probe = probe.transpose(1, 2).contiguous().transpose(1, 2)
        elif is_channels_last:
            layout = torch.channels_last if rank == 4 else torch.channels_last_3d
            probe = probe.to(memory_format=layout)

        before = F.interpolate(probe, 2, mode=mode, align_corners=True)
        # reset the corner value and expect the output is changed as well
        # the output won't be changed on buffer overflow
        probe[(-1,) * rank] = 0.5
        after = F.interpolate(probe, 2, mode=mode, align_corners=True)
        self.assertNotEqual(before, after)

    # We set the size larger than the floating point exactly representable range:
    #   float:    exact representable range (-2**24, 2**24)
    #   bfloat16: exact representable range (-2**8, 2**8)
    #   half:     exact representable range (-2**11, 2**11)
    exact_limits = ((torch.float, 2**24), (torch.bfloat16, 2**8), (torch.half, 2**11))
    # For each dtype: a 3-D, a 4-D and a 5-D size whose last dim exceeds the limit.
    size_dtype_list = [
        ([1, 10] + [2] * extra_dims + [limit + 4], dtype)
        for dtype, limit in exact_limits
        for extra_dims in range(3)
    ]

    # TODO: turn on cuda test after buffer overflow issue is fixed in cuda kernel
    # devices = ['cpu'] + (['cuda'] if torch.cuda.is_available() else [])
    devices = ['cpu']

    # Input rank each interpolation mode accepts.
    rank_for_mode = {'linear': 3, 'bilinear': 4, 'bicubic': 4, 'trilinear': 5}

    for mode in ('linear', 'bilinear', 'bicubic', 'trilinear'):
        for size, dtype in size_dtype_list:
            if len(size) != rank_for_mode[mode]:
                continue
            for device in devices:
                # no half precision support on cpu or bfloat16 on cuda yet
                unsupported = (
                    (device == 'cpu' and dtype == torch.half)
                    or (device == 'cuda' and dtype == torch.bfloat16)
                )
                if unsupported:
                    continue
                for is_channels_last in (True, False):
                    helper(size, dtype, mode, device, is_channels_last)
| |
| |
@set_default_dtype(torch.double)
def test_interpolate(self):
    # For every interpolation mode, scale factor and device: an all-ones input
    # must stay all ones, size-based and scale-based calls must agree, the op
    # must pass gradcheck / gradgradcheck, and non-integer sizes must be rejected.

    def _test_interpolate_non_integer_size_warning(in_t, out_size, dim, **kwargs):
        # Despite the name, a float (or float tensor) size must raise TypeError.
        for size in (float(out_size), torch.tensor(out_size, dtype=torch.float)):
            with self.assertRaisesRegex(TypeError, "(expected size to be one of int or).*"):
                F.interpolate(in_t, size=(size,) * dim, **kwargs)

    def _test_interpolate_helper(in_t, scale_factor, layer, kwargs):
        dim = len(in_t.shape) - 2
        out_size = int(math.floor(in_t.shape[-1] * scale_factor))
        with warnings.catch_warnings(record=True):
            out_t = layer(in_t)
        self.assertEqual(torch.ones([1, 1] + [out_size] * dim), out_t)

        by_size = F.interpolate(in_t, (out_size,) * dim, **kwargs)
        by_scale = F.interpolate(in_t, scale_factor=scale_factor, **kwargs)
        self.assertEqual(by_size, by_scale)

        def resize(x):
            return F.interpolate(x, out_size, **kwargs)

        gradcheck(resize, [in_t], nondet_tol=GRADCHECK_NONDET_TOL)
        gradgradcheck(resize, [in_t], nondet_tol=GRADCHECK_NONDET_TOL)
        _test_interpolate_non_integer_size_warning(in_t, out_size, dim, **kwargs)

    def _make_input(dim, device):
        # All-ones tensor of shape [1, 1, 2, ..., 2] with ``dim`` spatial dims.
        return torch.ones([1, 1] + [2] * dim, requires_grad=True, device=device)

    device_list = ['cpu'] + (['cuda'] if TEST_CUDA else [])

    # (mode, number of spatial dims) for the modes that take align_corners.
    aligned_modes = (('linear', 1), ('bilinear', 2), ('bicubic', 2), ('trilinear', 3))

    for device, scale_factor in product(device_list, [0.5, 1.5, 2]):
        for mode in ['nearest', 'area']:
            kwargs = dict(mode=mode)
            layer = nn.Upsample(scale_factor=scale_factor, **kwargs).to(device)
            for in_t in [_make_input(1, device), _make_input(2, device), _make_input(3, device)]:
                _test_interpolate_helper(in_t, scale_factor, layer, kwargs)

        for align_corners in [True, False]:
            for mode, dim in aligned_modes:
                kwargs = dict(mode=mode, align_corners=align_corners)
                if mode == 'bicubic':
                    # bicubic goes through the functional API rather than nn.Upsample
                    def layer(t, kwargs=kwargs):
                        return F.interpolate(t, scale_factor=scale_factor, **kwargs).to(device)
                else:
                    layer = nn.Upsample(scale_factor=scale_factor, **kwargs).to(device)
                _test_interpolate_helper(_make_input(dim, device), scale_factor, layer, kwargs)
| |
def test_linear_broadcasting(self):
    """nn.Linear on a 3-D input must match applying it to the flattened 2-D view."""
    linear = nn.Linear(5, 8)
    batched = torch.randn(2, 3, 5)
    # Reference path: collapse the two leading dims, apply the layer,
    # then restore the leading dims on the result.
    flat_out = linear(batched.view(6, 5))
    reference = flat_out.view(2, 3, 8)
    self.assertEqual(reference, linear(batched))
| |
| def test_linear_raise_on_scalar_input(self): |
| # This used to cause an int underflow issue when reshaping the input |
| # see https://github.com/pytorch/pytorch/issues/119161 |
| m = nn.Linear(1, 1) |
| inp = torch.ones(1).squeeze() |
| with self.assertRaisesRegex(RuntimeError, ".*both arguments.*1D.*"): |
| m(inp) |
| |
@parametrize_test('device', ['cpu'] + (['cuda'] if TEST_CUDA else []))
@parametrize_test('bias', [
    subtest(False, name='nobias'), subtest(True, name='bias')])
@parametrize_test('weight_layout', [
    subtest(torch.strided, name='weightStrided'),
    subtest(torch.sparse_coo, name='weightCOO'),
    subtest(torch.sparse_csr, name='weightCSR'),
    subtest(torch.sparse_csc, name='weightCSC'),
    # TODO: addmm: computation on CPU is not implemented for Strided + Strided @ SparseBsr
    # subtest(torch.sparse_bsr, name='weightBSR'),
    # subtest(torch.sparse_bsc, name='weightBSC'),
])
def test_linear_autograd(self, device, bias, weight_layout):
    """Check nn.Linear forward/backward for strided and sparse weight layouts.

    The module output and its gradients w.r.t. the weight and the input are
    compared against an einsum computed on the densified weight.
    """
    module = nn.Linear(4, 4, bias=bias, device=device)

    # (layout, converter) pairs; ``None`` means the weight is left untouched.
    layout_converters = (
        (torch.strided, None),
        (torch.sparse_csr, lambda w: w.to_sparse_csr()),
        (torch.sparse_csc, lambda w: w.to_sparse_csc()),
        (torch.sparse_bsr, lambda w: w.to_sparse_bsr((2, 2))),
        (torch.sparse_bsc, lambda w: w.to_sparse_bsc((2, 2))),
        (torch.sparse_coo, lambda w: w.to_sparse_coo()),
    )
    for layout, convert in layout_converters:
        if weight_layout == layout:
            if convert is not None:
                module.weight = nn.Parameter(convert(module.weight))
            break
    else:
        # Unknown layout requested by the parametrization.
        raise AssertionError

    inp = torch.randn(4, requires_grad=True, device=device)
    res = module(inp)
    expected = (torch.einsum("i,ji->j", inp, module.weight.to_dense()))
    if bias:
        expected = expected + module.bias
    self.assertEqual(res, expected)

    grad_output = torch.randn(4, device=device)
    wrt = [module.weight, inp]
    grads = torch.autograd.grad(res, wrt, grad_output)
    grads_expected = torch.autograd.grad(expected, wrt, grad_output)

    # The reference weight gradient must carry the requested layout.
    self.assertEqual(grads_expected[0].layout, weight_layout)

    for actual_grad, expected_grad in zip(grads, grads_expected):
        self.assertEqual(actual_grad, expected_grad)
| |
def test_bilinear(self):
    """Compare nn.Bilinear output and gradients against an einsum formulation."""
    bilinear = nn.Bilinear(10, 10, 8)
    left = torch.randn(4, 10, requires_grad=True)
    right = torch.randn(4, 10, requires_grad=True)
    upstream = torch.randn(4, 8)

    actual = bilinear(left, right)
    contraction = torch.einsum("bi,kij,bj->bk", left, bilinear.weight, right)
    reference = contraction + bilinear.bias
    self.assertEqual(actual, reference)

    # Gradients are taken w.r.t. both parameters and both inputs.
    wrt = [bilinear.weight, bilinear.bias, left, right]
    actual_grads = torch.autograd.grad(actual, wrt, upstream)
    reference_grads = torch.autograd.grad(reference, wrt, upstream)
    for got, want in zip(actual_grads, reference_grads):
        self.assertEqual(got, want)
| |
def test_bilinear_non_contiguous(self):
    """nn.Bilinear must give identical results for transposed and contiguous inputs."""
    bilinear = nn.Bilinear(7, 7, 5)
    leaf_a = torch.randn(4, 7, 10, requires_grad=True)
    leaf_b = torch.randn(4, 7, 10, requires_grad=True)
    strided_a = leaf_a.transpose(1, 2)
    strided_b = leaf_b.transpose(1, 2)

    upstream = torch.randn(4, 10, 5)

    def forward_backward(lhs, rhs):
        # Clear gradients left on the leaves by a previous run.
        leaf_a.grad = None
        leaf_b.grad = None
        result = bilinear(lhs, rhs)
        result.backward(upstream)
        return result.data, leaf_a.grad.data, leaf_b.grad.data

    noncontig_results = forward_backward(strided_a, strided_b)
    contig_results = forward_backward(strided_a.contiguous(), strided_b.contiguous())

    # Output first, then the gradients of the two leaf inputs.
    for contig_value, noncontig_value in zip(contig_results, noncontig_results):
        self.assertEqual(contig_value, noncontig_value)
| |
def test_bilinear_no_bias(self):
    """A zero-bias nn.Bilinear must behave like one constructed without a bias."""
    with_bias = nn.Bilinear(10, 10, 8, dtype=torch.double)
    without_bias = nn.Bilinear(10, 10, 8, False, dtype=torch.double)

    # Make the two modules numerically equivalent: zero bias, shared weights.
    with_bias.bias.data.zero_()
    with_bias.weight.data.copy_(without_bias.weight)

    lhs = torch.randn(4, 10, requires_grad=True, dtype=torch.double)
    rhs = torch.randn(4, 10, requires_grad=True, dtype=torch.double)
    upstream = torch.randn(4, 8, dtype=torch.double)

    def forward_backward(net):
        # Clear gradients left on the inputs by a previous run.
        lhs.grad = None
        rhs.grad = None
        result = net(lhs, rhs)
        result.backward(upstream)
        return result.data, lhs.grad.data, rhs.grad.data

    biased_results = forward_backward(with_bias)
    unbiased_results = forward_backward(without_bias)

    for biased_value, unbiased_value in zip(biased_results, unbiased_results):
        self.assertEqual(biased_value, unbiased_value)

    def functional_no_bias(x1, x2):
        return F.bilinear(x1, x2, without_bias.weight, without_bias.bias)

    _assertGradAndGradgradChecks(self, functional_no_bias, (lhs, rhs))
| |
def test_bilinear_broadcasting(self):
    """nn.Bilinear on 3-D inputs must match applying it to the flattened 2-D views."""
    bilinear = nn.Bilinear(5, 6, 8)
    first = torch.randn(2, 3, 5)
    second = torch.randn(2, 3, 6)
    # Reference path: collapse the two leading dims, apply, then restore them.
    flat_out = bilinear(first.view(6, 5), second.view(6, 6))
    reference = flat_out.view(2, 3, 8)
    self.assertEqual(reference, bilinear(first, second))
| |
| def test_fold_invalid_arg(self): |
| # input.size(1) not divisible by \prod(kernel_size) |
| |
| fold = nn.Fold(output_size=(4, 5), kernel_size=(2, 3)) |
| with self.assertRaisesRegex(RuntimeError, r"be divisible by the product of kernel_size"): |
| fold(torch.randn(1, 5, 9)) |
| |
| with self.assertRaisesRegex(RuntimeError, r"be divisible by the product of kernel_size"): |
| fold(torch.randn(1, 19, 9)) |
| |
| # input.size(2) not matching the total number of sliding blocks |
| |
| with self.assertRaisesRegex(RuntimeError, r"match the calculated number of sliding blocks"): |
| fold = nn.Fold(output_size=(4, 5), kernel_size=(2, 3)) |
| fold(torch.randn(1, 6, 10)) |
| |
| with self.assertRaisesRegex(RuntimeError, r"match the calculated number of sliding blocks"): |
| fold = nn.Fold(output_size=(4, 5), kernel_size=(2, 3), stride=(2, 2)) |
| fold(torch.randn(1, 6, 5)) |
| |
| with self.assertRaisesRegex(RuntimeError, r"match the calculated number of sliding blocks"): |
| fold = nn.Fold(output_size=(4, 5), kernel_size=(2, 3), stride=(2, 2), dilation=(1, 2), padding=(2, 0)) |
| fold(torch.randn(1, 6, 5)) # should be 4 * 1 = 4 sliding blocks |
| |
| fold = nn.Fold(output_size=(4, 5), kernel_size=(2, 2), stride=1, dilation=8, padding=0) |
| with self.assertRaisesRegex(RuntimeError, r"calculated shape of the array of sliding blocks as"): |
| fold(torch.randn(1, 12, 12)) |
| |
| def test_unfold_invalid_arg(self): |
| # input wrong dimension |
| |
| unfold = nn.Unfold(kernel_size=(2, 3)) |
| |
| # calculated output shape is too small |
| with self.assertRaisesRegex(RuntimeError, r"its components must be at least one"): |
| unfold = nn.Unfold(kernel_size=(2, 3)) |
| unfold(torch.randn(1, 2, 2, 2)) |
| |
| with self.assertRaisesRegex(RuntimeError, r"its components must be at least one"): |
| unfold = nn.Unfold(kernel_size=(5, 3), padding=(1, 1)) |
| unfold(torch.randn(1, 2, 2, 3)) |
| |
| with self.assertRaisesRegex(RuntimeError, r"its components must be at least one"): |
| unfold = nn.Unfold(kernel_size=(1, 3), padding=(1, 1), dilation=(1, 2)) |
| unfold(torch.randn(1, 2, 2, 2)) |
| |
def test_softmin(self):
    """F.softmin(x, dim) must equal F.softmax(-x, dim) along either dimension."""
    sample = torch.randn(2, 16)
    for dim in (1, 0):
        self.assertEqual(F.softmin(sample, dim), F.softmax(-sample, dim))
| |
def test_adaptive_log_softmax(self):
    """Exercise nn.AdaptiveLogSoftmaxWithLoss: validation, shapes, log_prob and predict."""

    def build(in_features, n_classes, cutoffs, **extra):
        # Every module in this test uses div_value=2.
        return nn.AdaptiveLogSoftmaxWithLoss(in_features, n_classes, cutoffs, div_value=2., **extra)

    # args validation
    for bad_cutoffs in ([5, 15, 15], [5, 15, 10], [5, 10, 25]):
        with self.assertRaises(ValueError):
            build(16, 20, bad_cutoffs)

    with self.assertRaisesRegex(ValueError, "cutoffs should be a sequence of unique,"):
        build(16, 20, [5, 10, 20])

    # not raise
    build(16, 20, [5, 10, 19])

    # input shapes
    with self.assertRaisesRegex(RuntimeError, r"Input and target should have the same size"):
        model = build(16, 20, [5, 10, 15])
        features = torch.randn(2, 16)
        targets = torch.tensor([0, 5, 10])
        model(features, targets)

    # out-of-bound targets
    with self.assertRaisesRegex(RuntimeError, r"Target values should be in"):
        model = build(16, 20, [5, 10, 15])
        features = torch.randn(2, 16)
        targets = torch.tensor([0, 20])
        model(features, targets)

    # cluster sizes
    model = build(16, 20, [5, 10, 15])
    features = torch.randn(2, 16)
    targets = torch.tensor([0, 17])

    # 5 targets in head, 3 clusters, dimensionality 16
    self.assertEqual(model.head.weight.size(), (5 + 3, 16))
    # 5 targets in each cluster; dimensionality 8, then 4, then 2
    for cluster_idx, cluster_dim in enumerate((8, 4, 2)):
        self.assertEqual(model.tail[cluster_idx][1].weight.size(), (5, cluster_dim))
    self.assertEqual(model(features, targets).output.size(), (2, ))

    # test no_batch_dim support
    model = build(16, 20, [5, 10, 15])
    features = torch.randn(1, 16)
    targets = torch.tensor([17])
    unbatched_features = features.squeeze(0)
    unbatched_targets = targets.squeeze(0)
    self.assertEqual(
        model(features, targets).output.squeeze(0),
        model(unbatched_features, unbatched_targets).output)

    # log_probs actually returns log_proba
    model = build(8, 4, [2])
    features = torch.randn(4, 8)
    all_log_probs = model.log_prob(features)

    self.assertEqual(torch.exp(all_log_probs).data.sum(1), torch.ones(4))

    # forward returns the same thing as log_probs
    for class_idx in range(4):
        targets = torch.full((4,), class_idx, dtype=torch.long)
        out, loss = model(features, targets)

        picked = all_log_probs.gather(1, targets.unsqueeze(1)).squeeze()
        self.assertEqual(out, picked)
        self.assertEqual(loss, F.nll_loss(all_log_probs, targets))

    # predict
    features = torch.randn(64, 8).abs_()

    def build_nonnegative_head():
        # Fresh module whose head weight and bias are made non-negative in place.
        fresh = build(8, 10, [4, 8], head_bias=True)
        fresh.head.weight.data.abs_()
        fresh.head.bias.data.abs_()
        return fresh

    def check_predict(candidate):
        # predict() must agree with the argmax over the full log-probabilities.
        predicted = candidate.predict(features)
        self.assertEqual(predicted, candidate.log_prob(features).argmax(dim=1))

    # argmax in shortlist
    model = build_nonnegative_head()
    model.head.weight.data[model.shortlist_size:, :].zero_()
    check_predict(model)

    # argmax outside of shortlist
    model = build_nonnegative_head()
    model.head.weight.data[:model.shortlist_size, :].zero_()
    check_predict(model)

    # half of the argmax in shortlist, half in clusters
    model = build_nonnegative_head()
    shortlist = model.shortlist_size

    features[:32, :shortlist].zero_()
    features[32:, shortlist:].zero_()

    model.head.weight.data[:shortlist, shortlist:].zero_()
    model.head.weight.data[shortlist:, :shortlist].zero_()
    check_predict(model)
| |
def test_cross_entropy_loss(self, dtype=torch.bfloat16):
    """CPU CrossEntropyLoss in ``dtype`` must track the float32 result within 1e-1."""
    criterion = nn.CrossEntropyLoss().cpu()
    logits_fp32 = torch.randn(15, 10, device="cpu", dtype=torch.float, requires_grad=True)
    # Independent leaf holding the same values in the reduced-precision dtype.
    logits_lowp = logits_fp32.to(dtype).detach().requires_grad_(True)
    labels = torch.empty(15, dtype=torch.long).random_(10)
    tolerance = dict(atol=1e-1, rtol=0)

    loss_fp32 = criterion(logits_fp32, labels)
    loss_lowp = criterion(logits_lowp, labels)
    self.assertEqual(loss_lowp, loss_fp32.to(dtype=dtype), **tolerance)

    loss_fp32.backward()
    loss_lowp.backward()
    self.assertEqual(logits_lowp.grad, logits_fp32.grad.to(dtype=dtype), **tolerance)
| |
def test_cross_entropy_loss_precision(self):
    """Float32 and float64 CrossEntropyLoss must agree on a large CPU input."""
    # Regression test for #55657
    criterion = nn.CrossEntropyLoss().cpu()
    logits_single = torch.randn(128, 2, 768, 768, device="cpu", dtype=torch.float)
    logits_double = logits_single.double()
    labels = torch.randint(2, (128, 768, 768), dtype=torch.long)

    loss_single = criterion(logits_single, labels)
    loss_double = criterion(logits_double, labels)
    # The dtypes differ on purpose; only the values are compared.
    self.assertEqual(loss_single, loss_double, exact_dtype=False)
| |
| def test_cross_entropy_loss_zero_div(self): |
| # Test for issue #73165 |
| input_1 = torch.rand([5, 0], dtype=torch.float32) |
| input_2 = torch.rand([5, 0], dtype=torch.float32) |
| torch.nn.CrossEntropyLoss()(input_1, input_2) |
| |
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
def test_convert_sync_batchnorm(self):
    """convert_sync_batchnorm must swap BatchNorm1d only and keep state and device."""

    def make_net():
        return torch.nn.Sequential(
            torch.nn.BatchNorm1d(100),
            torch.nn.InstanceNorm1d(100)
        ).cuda()

    module = make_net()

    # necessary to have an anchor point for comparison, in case the
    # convert_sync_batchnorm updates in place
    comp_module = make_net()
    comp_module.load_state_dict(module.state_dict())

    sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module)
    converted_children = list(sync_bn_module.children())
    expected_classes = (torch.nn.SyncBatchNorm, torch.nn.InstanceNorm1d)
    for position, expected_class in enumerate(expected_classes):
        self.assertEqual(converted_children[position].__class__, expected_class)

    for reference_layer, converted_layer in zip(comp_module.children(), sync_bn_module.children()):
        reference_state = reference_layer.state_dict()
        converted_state = converted_layer.state_dict()
        for key in reference_state:
            self.assertEqual(reference_state[key].device, converted_state[key].device)
            self.assertEqual(reference_state[key], converted_state[key])
| |
@unittest.skipIf(not TEST_CUDA, "CUDA not available")
def test_sync_batchnorm_backward_elemt(self):
    """batch_norm_backward_elemt must not depend on the memory format of its inputs."""
    device = 'cuda'
    saved_input = torch.rand(2, 3, 2, 1, device=device)
    grad_output = torch.rand(2, 3, 2, 1, device=device)
    # Five length-3 tensors, in order: mean, invstd, weight, sum_dy, sum_dy_xmu.
    per_channel_stats = [torch.rand(3, device=device) for _ in range(5)]
    count_tensor = torch.tensor([5, 5, 5], dtype=torch.int32, device=device)

    def backward_elemt(grad, inp):
        return torch.batch_norm_backward_elemt(grad, inp, *per_channel_stats, count_tensor)

    gI_contiguous = backward_elemt(grad_output, saved_input)

    # Test batch_norm_backward_elemt gives the same answer for all
    # combinations of contiguous as channels_last input
    channels_last = torch.channels_last
    contiguous = torch.contiguous_format
    format_pairs = [
        (channels_last, contiguous),
        (contiguous, channels_last),
        (channels_last, channels_last),
    ]
    for grad_format, input_format in format_pairs:
        gI_actual = backward_elemt(
            grad_output.contiguous(memory_format=grad_format),
            saved_input.contiguous(memory_format=input_format),
        )
        self.assertEqual(gI_actual, gI_contiguous)
| |
@unittest.skipIf(not TEST_CUDA, "CUDA not available")
def test_sync_batchnorm_accuracy_cuda(self):
    """Check the mean from torch.batch_norm_stats against torch.mean on CUDA."""
    # The target of this test is to test the functionality and accuracy of
    #   those single-GPU cuda kernels used in SyncBatchNorm
    # They are:
    #   fwd: torch.batch_norm_stats, torch.batch_norm_gather_stats_with_counts, torch.batch_norm_elemt
    #   bwd: torch.batch_norm_backward_reduce, torch.batch_norm_backward_elemt

    # Each entry: (input shape, alternative memory format, axes reduced by the reference mean).
    cases = (
        ((1, 96, 112, 112), torch.channels_last, (0, 2, 3)),
        ((1, 96, 112, 112, 112), torch.channels_last_3d, (0, 2, 3, 4)),
    )
    for shape, memory_format, mean_axes in cases:
        data = torch.randn(*shape, dtype=torch.float, device='cuda')
        mean_default, _ = torch.batch_norm_stats(data, 1e-5)
        mean_reformatted, _ = torch.batch_norm_stats(data.to(memory_format=memory_format), 1e-5)
        mean_ref = torch.mean(data, mean_axes, keepdim=False)

        self.assertEqual(mean_ref, mean_default)
        self.assertEqual(mean_ref, mean_reformatted)
| |
| def test_flatten(self): |
| tensor_input = torch.randn(2, 1, 2, 3) |
| |
| # Flatten Tensor |
| |
| flatten = nn.Flatten(start_dim=1, end_dim=-1) |
| tensor_output = flatten(tensor_input) |
| self.assertEqual(tensor_output.size(), torch.Size([2, 6])) |
| |
| def test_unflatten(self): |
| tensor_input = torch.randn(2, 50) |
| |
| # Unflatten Tensor (unflattened_size as a tuple of ints and list of ints) |
| |
| for us in ((2, 5, 5), [2, 5, 5]): |
| unflatten = nn.Unflatten(dim=1, unflattened_size=us) |
| tensor_output = unflatten(tensor_input) |
| self.assertEqual(tensor_output.size(), torch.Size([2, 2, 5, 5])) |
| |
| # Unflatten NamedTensor |
| |
| unflatten = nn.Unflatten(dim='features', unflattened_size=(('C', 2), ('H', 5), ('W', 5))) |
| named_tensor_input = tensor_input.refine_names('N', 'features') |
| named_tensor_output = unflatten(named_tensor_input) |
| self.assertEqual(named_tensor_output.size(), torch.Size([2, 2, 5, 5])) |
| |
| def test_unflatten_invalid_arg(self): |
| # Wrong type for unflattened_size (tuple of floats) |
| |
| with self.assertRaisesRegex( |
| TypeError, |
| r"unflattened_size must be tuple of ints, but found element of type float at pos 2"): |
| nn.Unflatten(dim=1, unflattened_size=(2, 5, 5.0)) |
| |
| # Wrong type for unflattened_size (list of lists and list of tuples) |
| for us in ([['C', 2], ['W', 5], ['H', 5]], [('C', 2), ('W', 5), ('H', 5)]): |
| with self.assertRaisesRegex( |
| TypeError, |
| r"unflattened_size must be a tuple of tuples, but found type list"): |
| nn.Unflatten(dim='features', unflattened_size=us) |
| |
| # Wrong type for unflattened_size (tuple of lists) |
| |
| with self.assertRaisesRegex( |
| TypeError, |
| r"unflattened_size must be tuple of tuples, but found element of type list at pos 0"): |
| nn.Unflatten(dim='features', unflattened_size=(['C', 2], ['W', 5], ['H', 5])) |
| |
| # Wrong type for unflattened_size (tuple of dicts) |
| |
| with self.assertRaisesRegex( |
| TypeError, |
| r"unflattened_size must be tuple of tuples, but found element of type dict at pos 0"): |
| nn.Unflatten(dim='features', unflattened_size=({'C': 2}, {'W': 5}, {'H': 5})) |
| |
def test_layer_norm_grads_with_create_graph_flag(self):
    """LayerNorm input gradients must not depend on the ``create_graph`` flag."""
    atol = 1e-5
    rtol = 1e-3

    x = torch.randn((4, 4, 16), requires_grad=True)
    layer_norm = nn.LayerNorm((16,), 1e-5, True)
    with torch.no_grad():
        layer_norm.weight = torch.nn.Parameter(0.1 * torch.ones_like(layer_norm.weight))

    def check_grads_match(inp, norm):
        # Same scalar objective, differentiated with and without graph retention.
        plain = torch.autograd.grad(norm(inp).sum(), inp, create_graph=False)[0]
        with_graph = torch.autograd.grad(norm(inp).sum(), inp, create_graph=True)[0]
        self.assertEqual(plain, with_graph, rtol=rtol, atol=atol)

    check_grads_match(x, layer_norm)

    if TEST_CUDA:
        # Repeat the comparison with both the input and the module on CUDA.
        check_grads_match(x.to('cuda'), layer_norm.to('cuda'))
| |
def test_layer_norm_eps(self):
    """LayerNorm over rows of identical values must output exactly zeros."""
    # test for https://github.com/pytorch/pytorch/issues/108072
    constant_rows = [[2.0, 2.0], [14.0, 14.0]]
    x = torch.Tensor([constant_rows, constant_rows])
    ln = torch.nn.LayerNorm(2, eps=1e-6, elementwise_affine=False)
    normalized = ln.forward(x)
    self.assertEqual(normalized, torch.zeros_like(x))
| |
| def test_padding_list(self): |
| # Padding can be a list, or tuple (regression test for gh-54452) |
| x = torch.randn(4, 8, 32, 32) |
| net = torch.nn.ConvTranspose2d(8, 16, kernel_size=3, padding=[3, 3]) |
| y = net(x) |
| |
| net = torch.nn.ConvTranspose2d(8, 16, kernel_size=3, padding=(3, 3)) |
| y = net(x) |
| |
| def test_fractional_max_pool2d_invalid_output_ratio(self): |
| arg_1 = [2, 1] |
| arg_2 = [0.5, 0.5, 0.6] |
| arg_class = torch.nn.FractionalMaxPool2d(kernel_size=arg_1, output_ratio=arg_2,) |
| arg_3_0_tensor = torch.rand([20, 16, 50, 32], dtype=torch.float32) |
| arg_3_0 = arg_3_0_tensor.clone() |
| arg_3 = [arg_3_0,] |
| |
| with self.assertRaisesRegex(ValueError, |
| "fractional_max_pool2d requires output_ratio to either be a single Int or tuple of Ints."): |
| res = arg_class(*arg_3) |
| |
| def test_max_pool1d_invalid_output_size(self): |
| arg_1 = 3 |
| arg_2 = 255 |
| arg_3 = False |
| arg_class = torch.nn.MaxPool1d(kernel_size=arg_1, stride=arg_2, return_indices=arg_3) |
| arg_4_0 = torch.as_tensor([[0.3204]]) |
| arg_4 = [arg_4_0,] |
| |
| with self.assertRaises(RuntimeError): |
| res = arg_class(*arg_4) |
| |
| def test_pickle_module_no_weights_only_warning(self): |
| with warnings.catch_warnings(record=True) as w: |
| pickle.loads(pickle.dumps(torch.nn.Linear(10, 10))) |
| self.assertEqual(len(w), 0) |
| |
class TestFusionEval(TestCase):
    """Numerical check of Conv2d + BatchNorm2d eval-mode fusion."""

    @set_default_dtype(torch.double)
    @given(X=hu.tensor(shapes=((5, 3, 5, 5),), dtype=np.double),
           running_mean=hu.tensor(shapes=(6,), dtype=np.double),
           running_var=hu.tensor(shapes=(6,), dtype=np.double))
    def test_fuse_module_eval_numerics(self, X, running_mean, running_var):
        """The fused module must reproduce conv -> bn for affine and non-affine BN.

        ``X``, ``running_mean`` and ``running_var`` are supplied by the
        ``given`` decorator; only their first element (the array) is used.
        """
        sample, _ = X
        mean_array, var_array = running_mean[0], running_var[0]

        in_channels, out_channels = sample.shape[1], len(mean_array)
        sample = torch.from_numpy(sample)

        conv_ref = torch.nn.Conv2d(in_channels, out_channels, bias=True, kernel_size=(3, 3))
        conv_ref.eval()

        # (affine flag for the BatchNorm, failure message) for each variant.
        variants = (
            (True, "Conv+BN fusion results are off"),
            (False, "Conv+BN(non-affine) fusion results are off"),
        )
        for affine, failure_message in variants:
            bn_ref = torch.nn.BatchNorm2d(out_channels, affine=affine)
            bn_ref.running_mean = torch.from_numpy(mean_array)
            bn_ref.running_var = torch.from_numpy(var_array)
            bn_ref.eval()

            Y_ref = bn_ref(conv_ref(sample))
            fused = torch.nn.utils.fusion.fuse_conv_bn_eval(conv_ref, bn_ref)
            Y_hat = fused(sample)

            self.assertEqual(Y_ref, Y_hat, msg=failure_message)
| |
| |
class TestConstantPadNd(TestCase):
    """Tests for torch.constant_pad_nd."""

    def test_constant_pad_nd(self):
        """Pad a 2x2 tensor with 9s: one column left, two right, one row on top."""
        source = torch.tensor([[1, 2], [3, 4]])
        padded = torch.constant_pad_nd(source, [1, 2, 1, 0], 9)
        expected = torch.tensor([
            [9, 9, 9, 9, 9],
            [9, 1, 2, 9, 9],
            [9, 3, 4, 9, 9]
        ])
        self.assertEqual(padded, expected)

    def test_preserves_memory_format(self):
        """Padding must keep the memory format (contiguous or channels_last) of its input."""
        nchw_tensor = torch.rand((1, 2, 5, 3))
        nhwc_tensor = nchw_tensor.contiguous(memory_format=torch.channels_last)

        # (input tensor, memory format the padded result is expected to have)
        cases = (
            (nchw_tensor, torch.contiguous_format),
            (nhwc_tensor, torch.channels_last),
        )
        for tensor, memory_format in cases:
            padded = torch.constant_pad_nd(tensor, [1, 2], 0.5)
            self.assertTrue(padded.is_contiguous(memory_format=memory_format))
| |
| |
class TestAddRelu(TestCase):
    """Tests for the fused torch._VF._add_relu op."""

    def test_add_relu(self):
        """_add_relu(a, b) must equal relu(a + b)."""
        lhs = torch.rand((7, 11))
        rhs = torch.rand((7, 11)).float()
        # Rescale the first operand from [0, 1) to (-5, 5] so some sums are negative.
        lhs = lhs.float() * -10 + 5

        unfused = torch.relu(lhs + rhs)
        fused = torch._VF._add_relu(lhs, rhs)

        self.assertEqual(fused, unfused)

    def test_add_relu_broadcasting(self):
        """_add_relu with a Python scalar must match the all-ones tensor operand."""
        lhs = torch.rand((1, 32))
        scalar_operand = 1
        tensor_operand = torch.ones(1, 32)

        scalar_result = torch._VF._add_relu(lhs, scalar_operand)
        tensor_result = torch._VF._add_relu(lhs, tensor_operand)

        self.assertEqual(tensor_result, scalar_result)
| |
| |
def add_test(test, decorator=None):
    """Register ``test`` on ``TestNN`` as a CPU test plus its CUDA variants.

    Args:
        test: test object exposing ``get_name()`` and ``test_cuda``; it is
            called as ``test(self)`` for the CPU variant. ``test_cpu``,
            ``with_tf32``, ``tf32_precision``, ``extra_args`` and the
            ``check_*`` attributes are read where used below.
        decorator: optional callable applied to every generated test
            function before it is attached to ``TestNN``.

    Raises:
        RuntimeError: if ``TestNN`` already has an attribute with one of
            the generated names.
    """
    def add(test_name, fn):
        # Attach ``fn`` to TestNN under ``test_name``, refusing duplicates.
        if hasattr(TestNN, test_name):
            raise RuntimeError('Found two tests with the same name: ' + test_name)
        if decorator is not None:
            fn = decorator(fn)
        setattr(TestNN, test_name, fn)

    test_name = test.get_name()
    # The CPU variant is registered unless the test explicitly sets test_cpu falsy.
    if not hasattr(test, 'test_cpu') or test.test_cpu:
        add(test_name, lambda self, test=test: test(self))
    cuda_test_name = test_name + '_cuda'
    # With dtype enable, it's good enough to test against three floating types
    kwargs = {}
    # Forward extra_args only when test_cuda declares that parameter.
    if 'extra_args' in get_function_arglist(test.test_cuda):
        kwargs['extra_args'] = test.extra_args

    if 'dtype' in get_function_arglist(test.test_cuda):
        # test_cuda takes a dtype: register one variant per dtype.
        if tf32_is_not_fp32() and test.with_tf32:
            # float runs twice: inside tf32_off() and inside tf32_on(...).

            def with_tf32_off(self, test=test, kwargs=kwargs):
                with tf32_off():
                    test.test_cuda(self, dtype=torch.float, **kwargs)

            add(cuda_test_name + '_fp32', with_tf32_off)

            def with_tf32_on(self, test=test, kwargs=kwargs):
                with tf32_on(self, test.tf32_precision):
                    test.test_cuda(self, dtype=torch.float, **kwargs)

            add(cuda_test_name + '_tf32', with_tf32_on)
        else:
            add(cuda_test_name + '_float', lambda self,
                test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.float, **kwargs))
        add(cuda_test_name + '_double', lambda self,
            test=test, kwargs=kwargs: test.test_cuda(self, dtype=torch.double, **kwargs))

        def test_half(self, test=test, kwargs=kwargs):
            test.test_cuda(self, dtype=torch.half, **kwargs)
        # half is registered unless check_half is set falsy on the test.
        if getattr(test, 'check_half', True):
            add(cuda_test_name + '_half', test_half)

        def test_bfloat16(self, test=test, kwargs=kwargs):
            test.test_cuda(self, dtype=torch.bfloat16, **kwargs)
        # bfloat16 is registered unless check_bfloat16 is set falsy on the test.
        if getattr(test, 'check_bfloat16', True):
            add(cuda_test_name + '_bfloat16', test_bfloat16)

        def test_cfloat(self, test=test, kwargs=kwargs):
            test.test_cuda(self, dtype=torch.cfloat, **kwargs)

        def test_cdouble(self, test=test, kwargs=kwargs):
            test.test_cuda(self, dtype=torch.cdouble, **kwargs)
        # Complex variants are opt-in: check_complex defaults to False.
        if getattr(test, 'check_complex', False):
            add(cuda_test_name + '_cfloat', test_cfloat)
            add(cuda_test_name + '_cdouble', test_cdouble)

    else:
        # test_cuda takes no dtype: register either the fp32/tf32 pair or
        # a single CUDA test that runs inside tf32_off().
        def with_tf32_off(self, test=test, kwargs=kwargs):
            with tf32_off():
                test.test_cuda(self, **kwargs)

        if tf32_is_not_fp32() and test.with_tf32:
            add(cuda_test_name + '_fp32', with_tf32_off)

            def with_tf32_on(self, test=test, kwargs=kwargs):
                with tf32_on(self, test.tf32_precision):
                    test.test_cuda(self, **kwargs)

            add(cuda_test_name + '_tf32', with_tf32_on)
        else:
            add(cuda_test_name, with_tf32_off)
| |
# Register a TestNN test for every module test spec, plus an eval-mode
# variant ('check_eval') and a long-tensor variant ('check_with_long_tensor')
# when the spec contains those keys. ``test_params`` is mutated in place, so
# each variant builds on the changes made for the previous one.
for test_params in module_tests + new_module_tests:
    # TODO: CUDA is not implemented yet
    if 'constructor' not in test_params:
        # Resolve the constructor from torch.nn by module name.
        name = test_params.pop('module_name')
        test_params['constructor'] = getattr(nn, name)
    # Removed before construction so NewModuleTest never receives it.
    decorator = test_params.pop('decorator', None)
    test = NewModuleTest(**test_params)
    add_test(test, decorator)
    if 'check_eval' in test_params:
        # create a new test that is identical but that sets module.training to False
        desc = test_params.get('desc', None)
        test_params['desc'] = 'eval' if desc is None else desc + '_eval'

        def gen_eval_constructor(constructor):
            # Wrap ``constructor`` so the built module has training = False.
            def eval_constructor(*args, **kwargs):
                cons = constructor(*args, **kwargs)
                cons.training = False
                return cons
            # Keep the wrapped constructor's name on the wrapper.
            eval_constructor.__name__ = constructor.__name__
            return eval_constructor

        test_params['constructor'] = gen_eval_constructor(test_params['constructor'])
        test = NewModuleTest(**test_params)
        add_test(test, decorator)
    if 'check_with_long_tensor' in test_params:
        # Suffix 'fullname' when present, otherwise 'desc'.
        fullname = test_params.get('fullname', None)
        if fullname:
            test_params['fullname'] = fullname + '_with_long_tensor'
        else:
            desc = test_params.get('desc', None)
            test_params['desc'] = 'with_long_tensor' if desc is None else desc + '_with_long_tensor'

        def double_equivalent_of_long_tensor(size):
            # Random integers in [-1000, 1000) stored as double.
            return torch.randint(-1000, 1000, size=size).double()

        def apply_to_cons(t):
            # Replace floating-point tensors/parameters with integer-valued
            # doubles of the same size; other tensors are returned unchanged.
            if t.is_floating_point():
                if isinstance(t, Parameter):
                    return Parameter(double_equivalent_of_long_tensor(t.size()))
                elif isinstance(t, torch.Tensor):
                    return double_equivalent_of_long_tensor(t.size())
            else:
                return t

        def gen_long_tensor_constructor(constructor):
            # Wrap ``constructor`` so the built module goes through apply_to_cons.
            def long_tensor_constructor(*args, **kwargs):
                cons = constructor(*args, **kwargs)
                cons._apply(apply_to_cons)
                return cons
            long_tensor_constructor.__name__ = constructor.__name__
            return long_tensor_constructor

        def gen_long_tensor_input(input_size):
            # Input factory producing integer-valued doubles of ``input_size``.
            def input_func():
                return double_equivalent_of_long_tensor(input_size)
            return input_func

        def reference_fn(i, p, m):
            # For bad reasons this would create LongTensors that requires gradients
            # Remove requires_grad to avoid this
            # NOTE: the loop variable below shadows the ``p`` argument.
            for p in m.parameters():
                p.requires_grad_(False)
            m._apply(lambda t: t.long())
            input = i.long()
            out = m.forward(input)
            return out

        test_params['constructor'] = gen_long_tensor_constructor(test_params['constructor'])
        test_params['input_fn'] = gen_long_tensor_input(test_params['input_size'])
        test_params['reference_fn'] = reference_fn
        test_params['check_forward_only'] = True
        # Currently we don't support conv2d/conv3d for LongTensor in CUDA
        test_params['test_cuda'] = False
        test = NewModuleTest(**test_params)

        add_test(test, decorator)
| |
# Register a TestNN test for every criterion test spec, plus a
# reduction='sum' variant when the spec contains 'check_sum_reduction'.
# ``test_params`` is mutated in place between the two registrations.
for test_params in criterion_tests:
    if 'constructor' not in test_params:
        # Resolve the constructor from torch.nn by module name.
        name = test_params.pop('module_name')
        test_params['constructor'] = getattr(nn, name)
    test = CriterionTest(**test_params)
    # NOTE(review): 'decorator' is popped only after the first CriterionTest
    # is built, so that instance still receives it as a keyword argument while
    # the sum-reduction variant below does not. The module-test loop pops it
    # first; presumably CriterionTest ignores the extra key. Confirm before
    # reordering.
    decorator = test_params.pop('decorator', None)
    add_test(test, decorator)
    if 'check_sum_reduction' in test_params:
        desc = test_params.get('desc', None)
        test_params['desc'] = 'sum_reduction' if desc is None else desc + '_sum_reduction'

        def gen_sum_reduction_constructor(constructor):
            # Wrap ``constructor`` so it is always called with reduction='sum'.
            def sum_reduction_constructor(*args, **kwargs):
                cons = constructor(*args, reduction='sum', **kwargs)
                return cons
            # Keep the wrapped constructor's name on the wrapper.
            sum_reduction_constructor.__name__ = constructor.__name__
            return sum_reduction_constructor

        test_params['constructor'] = gen_sum_reduction_constructor(test_params['constructor'])
        test = CriterionTest(**test_params)
        add_test(test, decorator)
| |
| |
class UnpoolingNet(nn.Module):
    """Feed the result of ``pool`` straight into ``unpool``.

    Whatever ``pool`` returns is unpacked into positional arguments for
    ``unpool``.
    """

    def __init__(self, pool, unpool):
        super().__init__()
        self.pool = pool
        self.unpool = unpool

    def forward(self, input):
        pool_outputs = self.pool(input)
        return self.unpool(*pool_outputs)
| |
| |
# MaxPool -> MaxUnpool round-trip tests on batched inputs (1d, 2d, 3d).
# Each pooling layer uses return_indices=True so UnpoolingNet can forward
# the indices to the unpooling layer.
add_test(NewModuleTest(
    constructor=lambda: UnpoolingNet(
        nn.MaxPool1d(2, return_indices=True),
        nn.MaxUnpool1d(2)),
    input_size=(1, 1, 4),
    fullname='MaxUnpool1d_net',
    default_dtype=torch.double,))
add_test(NewModuleTest(
    constructor=lambda: UnpoolingNet(
        nn.MaxPool2d(2, return_indices=True),
        nn.MaxUnpool2d(2)),
    input_size=(1, 1, 2, 4),
    fullname='MaxUnpool2d_net',
    default_dtype=torch.double,))
# The 3d variant is registered with check_gradgrad=False.
add_test(NewModuleTest(
    constructor=lambda: UnpoolingNet(
        nn.MaxPool3d(2, return_indices=True),
        nn.MaxUnpool3d(2)),
    input_size=(1, 1, 2, 4, 6),
    fullname='MaxUnpool3d_net',
    check_gradgrad=False,
    default_dtype=torch.double,))

# Same networks with one fewer leading input dimension, registered with
# single_batch_reference_fn as the reference.
add_test(NewModuleTest(
    constructor=lambda: UnpoolingNet(
        nn.MaxPool1d(2, return_indices=True),
        nn.MaxUnpool1d(2)),
    input_size=(1, 4),
    reference_fn=single_batch_reference_fn,
    fullname='MaxUnpool1d_net_no_batch_dim',
    default_dtype=torch.double,))
add_test(NewModuleTest(
    constructor=lambda: UnpoolingNet(
        nn.MaxPool2d(2, return_indices=True),
        nn.MaxUnpool2d(2)),
    input_size=(1, 2, 4),
    reference_fn=single_batch_reference_fn,
    fullname='MaxUnpool2d_net_no_batch_dim',
    default_dtype=torch.double,))

add_test(NewModuleTest(
    constructor=lambda: UnpoolingNet(
        nn.MaxPool3d(2, return_indices=True),
        nn.MaxUnpool3d(2)),
    input_size=(1, 2, 4, 6),
    reference_fn=single_batch_reference_fn,
    fullname='MaxUnpool3d_net_no_batch_dim',
    check_gradgrad=False,
    default_dtype=torch.double,))
| |
class _AdaptiveLogSoftmaxWithLoss(nn.AdaptiveLogSoftmaxWithLoss):
    """AdaptiveLogSoftmaxWithLoss callable with the input alone.

    A fixed target tensor ``[0, 1, 4, 8]`` is supplied on the input's device
    and only the ``output`` field of the result is returned.
    """

    def __call__(self, input):
        fixed_targets = torch.tensor([0, 1, 4, 8]).to(input.device)
        result = nn.AdaptiveLogSoftmaxWithLoss.__call__(self, input, fixed_targets)
        return result.output
| |
# Register the AdaptiveLogSoftmax module test. The wrapper class supplies the
# targets itself, so the test only needs to provide an input of size (4, 16).
add_test(NewModuleTest(
    constructor=lambda: _AdaptiveLogSoftmaxWithLoss(16, 10, [2, 6]),
    input_size=(4, 16),
    fullname='AdaptiveLogSoftmax',
    with_tf32=True,
    tf32_precision=0.005,
    default_dtype=torch.double))
| |
| |
| # The following are helpers for TestNN.test_affine_* |
# CUDA availability is resolved once, at import time.
_AFFINE_TEST_DEVICES = ('cpu', 'cuda') if torch.cuda.is_available() else ('cpu',)


def device_():
    """Return a fresh list of the device names usable by the affine tests."""
    return list(_AFFINE_TEST_DEVICES)
| |
| |
def angle_rad_():
    """Return rotation angles in radians: four fixed fractions of a turn plus one random one."""
    turn_fractions = (0.0, 0.5, 0.25, 0.125, random.random())
    angles = []
    for fraction in turn_fractions:
        angles.append(fraction * math.pi * 2)
    return angles
| |
| |
def axis_vector_():
    """Return the three coordinate axes followed by one random unit-length axis."""
    raw = (random.random(), random.random(), random.random())
    norm = sum(component ** 2 for component in raw) ** 0.5
    random_axis = tuple(component / norm for component in raw)

    coordinate_axes = [(1.0, 0.0, 0.0), (0.0, 1.0, 0.0), (0.0, 0.0, 1.0)]
    return coordinate_axes + [random_axis]
| |
| |
def input_size2d_():
    """Return the 4D input sizes (two leading 1s, then two spatial extents) for the 2D affine tests."""
    spatial_extents = ((3, 5), (3, 3), (4, 4), (3, 4))
    return [[1, 1, rows, cols] for rows, cols in spatial_extents]
| |
| |
def output_size2d_():
    """Return the 4D output sizes (two leading 1s, then two spatial extents) for the 2D affine tests."""
    spatial_extents = ((5, 3), (3, 5), (4, 3), (5, 5), (6, 6))
    return [[1, 1, rows, cols] for rows, cols in spatial_extents]
| |
| |
def input_size2dsq_():
    """Return square 4D input sizes (two leading 1s, then side x side)."""
    return [[1, 1, side, side] for side in (2, 3, 4, 6)]
| |
| |
def output_size2dsq_():
    """Return square 4D output sizes (two leading 1s, then side x side)."""
    return [[1, 1, side, side] for side in (2, 3, 4, 5, 6)]
| |
| |
def input_size3d_():
    """Return the 5D input sizes (two leading 1s, then three spatial extents) for the 3D affine tests."""
    spatial_extents = ((2, 2, 2), (2, 3, 4), (3, 3, 3), (4, 4, 4), (3, 4, 5))
    return [[1, 1, *extent] for extent in spatial_extents]
| |
| |
def input_size3dsq_():
    """Return cubic 5D input sizes (two leading 1s, then side x side x side)."""
    return [[1, 1, side, side, side] for side in (2, 3, 4, 6)]
| |
| |
def output_size3dsq_():
    """Return cubic 5D output sizes (two leading 1s, then side x side x side)."""
    return [[1, 1, side, side, side] for side in (2, 3, 4, 5, 6)]
| |
| |
def output_size3d_():
    """Return the 5D output sizes (two leading 1s, then three spatial extents) for the 3D affine tests."""
    spatial_extents = ((2, 2, 2), (3, 3, 3), (3, 4, 5), (4, 3, 2), (5, 5, 5), (6, 6, 6))
    return [[1, 1, *extent] for extent in spatial_extents]
| |
| |
def _buildEquivalentAffineTransforms2d(device, input_size, output_size, angle_rad):
    """Build matching 2D rotation transforms as a torch tensor and numpy matrices.

    Args:
        device: device the returned torch tensor is moved to.
        input_size: size sequence; entries 2 and 3 are the input extents used.
        output_size: size sequence; entries 2 and 3 are the output extents used.
        angle_rad: rotation angle in radians.

    Returns:
        ``(transform_tensor, transform_ary, grid_ary)``: a float32 tensor of
        shape ``(1, 2, 3)`` holding the top two rows of the rotation matrix,
        and two 3x3 float64 homogeneous numpy matrices composed from the same
        rotation together with centering/scaling terms.
    """
    in_center = [(extent - 1) / 2.0 for extent in input_size]
    out_center = [(extent - 1) / 2.0 for extent in output_size]

    sin_a = math.sin(angle_rad)
    cos_a = math.cos(angle_rad)

    def as_f64(rows):
        return np.array(rows, dtype=np.float64)

    # Translation by the input center.
    intrans_ary = as_f64([
        [1, 0, in_center[2]],
        [0, 1, in_center[3]],
        [0, 0, 1],
    ])
    # Scaling by the input center.
    inscale_ary = as_f64([
        [in_center[2], 0, 0],
        [0, in_center[3], 0],
        [0, 0, 1],
    ])
    rotation_ary = as_f64([
        [cos_a, -sin_a, 0],
        [sin_a, cos_a, 0],
        [0, 0, 1],
    ])
    # Inverse scaling by the output center.
    outscale_ary = as_f64([
        [1.0 / out_center[2], 0, 0],
        [0, 1.0 / out_center[3], 0],
        [0, 0, 1],
    ])
    # Translation by minus the output center.
    outtrans_ary = as_f64([
        [1, 0, -out_center[2]],
        [0, 1, -out_center[3]],
        [0, 0, 1],
    ])
    # Swaps the first two coordinates.
    reorder_ary = as_f64([
        [0, 1, 0],
        [1, 0, 0],
        [0, 0, 1],
    ])

    rotation_t = rotation_ary.T

    # Left-to-right products, associated exactly as nested np.dot calls.
    transform_ary = intrans_ary
    for factor in (inscale_ary, rotation_t, outscale_ary, outtrans_ary):
        transform_ary = np.dot(transform_ary, factor)

    grid_ary = reorder_ary
    for factor in (rotation_t, outscale_ary, outtrans_ary):
        grid_ary = np.dot(grid_ary, factor)

    transform_tensor = torch.from_numpy(rotation_ary).to(device, torch.float32)[:2].unsqueeze(0)

    return transform_tensor, transform_ary, grid_ary
| |
| |
def _buildEquivalentAffineTransforms3d(device, input_size, output_size, angle_rad, axis_vector):
    """Build matching 3D axis-angle rotation transforms as a torch tensor and numpy matrices.

    Args:
        device: device the returned torch tensor is moved to.
        input_size: size sequence; entries 2, 3 and 4 are the input extents used.
        output_size: size sequence; entries 2, 3 and 4 are the output extents used.
        angle_rad: rotation angle in radians.
        axis_vector: three-component rotation axis.

    Returns:
        ``(transform_tensor, transform_ary, grid_ary)``: a float32 tensor of
        shape ``(1, 3, 4)`` holding the top three rows of the rotation built
        from the axis components in reversed order, and two 4x4 float64
        homogeneous numpy matrices built from the inverse of the rotation
        that uses the axis components in their given order.
    """
    in_center = [(extent - 1) / 2.0 for extent in input_size]
    out_center = [(extent - 1) / 2.0 for extent in output_size]

    s = math.sin(angle_rad)
    c = math.cos(angle_rad)
    c1 = 1 - c

    def as_f64(rows):
        return np.array(rows, dtype=np.float64)

    def axis_angle_rotation(p, q, r):
        # Homogeneous rotation about the axis (p, q, r) by angle_rad.
        return as_f64([
            [p * p * c1 + c, q * p * c1 - r * s, r * p * c1 + q * s, 0],
            [p * q * c1 + r * s, q * q * c1 + c, r * q * c1 - p * s, 0],
            [p * r * c1 - q * s, q * r * c1 + p * s, r * r * c1 + c, 0],
            [0, 0, 0, 1],
        ])

    # Translation by the input center.
    intrans_ary = as_f64([
        [1, 0, 0, in_center[2]],
        [0, 1, 0, in_center[3]],
        [0, 0, 1, in_center[4]],
        [0, 0, 0, 1],
    ])
    # Scaling by the input center.
    inscale_ary = as_f64([
        [in_center[2], 0, 0, 0],
        [0, in_center[3], 0, 0],
        [0, 0, in_center[4], 0],
        [0, 0, 0, 1],
    ])

    first, second, third = axis_vector
    # Axis components in the given order for the numpy matrices ...
    scipyRotation_ary = axis_angle_rotation(first, second, third)
    # ... and in reversed order for the torch tensor.
    torchRotation_ary = axis_angle_rotation(third, second, first)

    # Inverse scaling by the output center.
    outscale_ary = as_f64([
        [1.0 / out_center[2], 0, 0, 0],
        [0, 1.0 / out_center[3], 0, 0],
        [0, 0, 1.0 / out_center[4], 0],
        [0, 0, 0, 1],
    ])
    # Translation by minus the output center.
    outtrans_ary = as_f64([
        [1, 0, 0, -out_center[2]],
        [0, 1, 0, -out_center[3]],
        [0, 0, 1, -out_center[4]],
        [0, 0, 0, 1],
    ])
    # Reverses the order of the three spatial coordinates.
    reorder_ary = as_f64([
        [0, 0, 1, 0],
        [0, 1, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 1],
    ])

    inverse_rotation = np.linalg.inv(scipyRotation_ary)

    # Left-to-right products, associated exactly as nested np.dot calls.
    transform_ary = intrans_ary
    for factor in (inscale_ary, inverse_rotation, outscale_ary, outtrans_ary):
        transform_ary = np.dot(transform_ary, factor)

    grid_ary = reorder_ary
    for factor in (inverse_rotation, outscale_ary, outtrans_ary):
        grid_ary = np.dot(grid_ary, factor)

    transform_tensor = torch.from_numpy(torchRotation_ary).to(device, torch.float32)[:3].unsqueeze(0)

    return transform_tensor, transform_ary, grid_ary
| # end TestNN.test_affine_* helpers |
| |
| |
| class TestNNDeviceType(NNTestCase): |
def _test_InstanceNorm_general(self, cls, input, device, dtype=torch.float):
    """Check the basic invariants of the instance-norm module class ``cls``.

    Covers normalized output statistics, train/eval agreement without running
    stats, running-stat values with ``momentum=1``, and the eval-mode shift
    property when running stats are tracked.
    """
    batch, channels = input.size(0), input.size(1)
    x = input.to(device=device, dtype=dtype).requires_grad_()

    # Default configuration (track_running_stats=False).
    norm = cls(channels, eps=0).to(device, dtype)

    out = norm(x)
    flat_out = out.view(batch * channels, -1)
    out_mean = flat_out.mean(1)
    out_var = flat_out.var(1, unbiased=False)

    self.assertEqual(torch.abs(out_mean.data).mean(), 0, atol=1e-5, rtol=0)
    self.assertEqual(torch.abs(out_var.data).mean(), 1, atol=1e-5, rtol=0)

    # Eval mode must give the same output and the same input gradient.
    grad_out = torch.randn_like(out)
    train_out = out.data.clone()
    out.backward(grad_out)
    train_grad = x.grad.data.clone()

    norm.eval()
    out = norm(x)
    x.grad = None
    out.backward(grad_out)
    self.assertEqual(train_out, out.data)
    self.assertEqual(train_grad, x.grad.data)

    # With track_running_stats=True and momentum=1 the running mean/var should
    # equal the mean/var of the input (with unbiased correction).
    norm = cls(channels, momentum=1, eps=0, track_running_stats=True).to(device, dtype)

    out = norm(x)

    in_mean = x.transpose(1, 0).reshape(channels, -1).mean(1)
    in_var = x.transpose(1, 0).reshape(channels, batch, -1).var(2, unbiased=True)[:, :]

    self.assertEqual(torch.abs(in_mean.data - norm.running_mean).mean(), 0, atol=1e-5, rtol=0)
    self.assertEqual(torch.abs(in_var.data.mean(1) - norm.running_var).mean(), 0, atol=1e-5, rtol=0)

    # In eval mode, adding X * std to a channel of the input should give the
    # matching output channel a mean of X.
    norm.eval()
    shift = norm.running_var.sqrt() * torch.arange(channels, device=device, dtype=dtype)
    shift = shift.view(-1, *([1] * (input.dim() - 2)))
    out = norm(x + shift)
    self.assertEqual(out.transpose(0, 1).reshape(channels, -1).mean(1), torch.arange(channels, dtype=dtype))
| |
def _test_InstanceNorm_cuda_half(self, cls, input, device):
    """Run ``cls`` forward/backward on a half-precision input.

    The module is first run with half parameters; when ``TEST_CUDNN`` is set
    it is run again after ``.float()`` and the outputs and input gradients of
    both runs are compared.
    """
    half_input = input.to(device=device, dtype=torch.half).random_(1, 10).requires_grad_(True)
    module = cls(half_input.size(1), affine=True, track_running_stats=True).to(device, torch.half)

    # THNN
    thnn_output = module(half_input)
    thnn_output.sum().backward()
    thnn_input_grad = half_input.grad.data.clone()
    self.assertEqualTypeString(thnn_output, half_input)

    if not TEST_CUDNN:
        return

    # cuDNN
    half_input.grad = None
    module = module.float()
    cudnn_output = module(half_input)
    cudnn_output.sum().backward()
    cudnn_input_grad = half_input.grad.data.clone()
    self.assertEqualTypeString(cudnn_output, half_input)
    self.assertEqual(cudnn_output, thnn_output, atol=1e-4, rtol=0)
    self.assertEqual(cudnn_input_grad, thnn_input_grad, atol=1e-3, rtol=0)
| |
def _test_LayerNorm_general(self, device, dtype=torch.float):
    """Check LayerNorm statistics on random shapes and its rejection of bad shapes."""
    # Reduced-precision dtypes get a much looser absolute tolerance.
    tol = 1e-1 if (dtype == torch.bfloat16 or dtype == torch.half) else 1e-5

    def abs_mean_and_var(out, leading_shape):
        # Mean of |per-row mean| and of |per-row biased variance| over the
        # flattened normalized dimensions.
        flat = out.view(*leading_shape, -1)
        row_mean = flat.mean(-1)
        row_var = flat.var(-1, unbiased=False)
        return torch.abs(row_mean.data).mean(), torch.abs(row_var.data).mean()

    for ndim in range(2, 6):
        shape = torch.randint(3, 6, (ndim,), dtype=torch.long).tolist()
        x = torch.empty(*shape, device=device, dtype=dtype).uniform_(0, 10)
        normalized_ndim = random.randint(1, ndim - 1)  # inclusive
        normalized_shape = shape[-normalized_ndim:]
        leading_shape = shape[:-normalized_ndim]

        # LN with unit weight and zero bias normalizes to mean 0 and stddev 1.
        ln = nn.LayerNorm(normalized_shape, eps=0).to(device, dtype)
        ln.weight.data.fill_(1)
        ln.bias.data.fill_(0)
        got_mean, got_var = abs_mean_and_var(ln(x), leading_shape)
        self.assertEqual(got_mean, 0, atol=tol, rtol=0)
        self.assertEqual(got_var, 1, atol=tol, rtol=0)

        # LN applies weight and bias correctly.
        scale, bias = torch.empty(2).uniform_(0.2, 2).tolist()
        ln.weight.data.fill_(scale)
        ln.bias.data.fill_(bias)
        got_mean, got_var = abs_mean_and_var(ln(x), leading_shape)
        self.assertEqual(got_mean, bias, atol=tol, rtol=0)
        self.assertEqual(got_var, scale ** 2, atol=tol, rtol=0)

    # (normalized_shape, input_shape) pairs that must be rejected.
    bad_norm_shape_input_shape = [
        ((), ()),
        ((2, 3), (3,)),
        ((2,), (1, 2, 3)),
        ((10,), (2, 3)),
        (10, (2, 3)),
    ]
    for norm_shape, input_shape in bad_norm_shape_input_shape:
        # NOTE(review): the module is left on its default device/dtype while
        # the input uses `device`/`dtype` - confirm the RuntimeError really
        # comes from the shape check on every backend.
        ln = nn.LayerNorm(norm_shape)
        bad_input = torch.empty(input_shape, device=device, dtype=dtype).uniform_(0, 10)
        with self.assertRaises(RuntimeError):
            ln(bad_input)
| |
def _test_LayerNorm_cuda_half(self, device):
    """Run a half-precision LayerNorm forward/backward and check the output type string matches the input's."""
    x = torch.empty(2, 3, 3, 2, device=device, dtype=torch.half).random_(1, 10).requires_grad_(True)
    layer = nn.LayerNorm([3, 2]).to(device, torch.half)
    y = layer(x)
    y.sum().backward()
    self.assertEqualTypeString(y, x)
| |
def _test_LayerNorm_cpu_mixed_dtype(self, device, dtype):
    """Compare LayerNorm across fp32, pure low-precision and mixed-precision runs.

    ``dtype`` is the low-precision dtype of the input; the mixed run feeds
    that input to a module whose parameters were converted to float.
    """
    def forward_backward(module, x):
        out = module(x)
        out.sum().backward()
        return out

    for elementwise_affine in (True, False):
        # layer norm input shape is normalized to m x n, cpu vectorized on n,
        # so make sure n exceeds vector length
        base_input = torch.empty(2, 3, 11, 3, device=device, dtype=dtype).random_(1, 10)
        base_module = nn.LayerNorm([11, 3], elementwise_affine=elementwise_affine).to(device, dtype)

        # fp32 input with fp32 parameters
        m_fp32 = deepcopy(base_module).to(device, torch.float)
        x_fp32 = base_input.clone().detach().float().requires_grad_()
        out_fp32 = forward_backward(m_fp32, x_fp32)

        # bf16/half input with bf16/half parameters
        m_low = deepcopy(base_module)
        x_low = base_input.clone().detach().requires_grad_()
        out_low = forward_backward(m_low, x_low)

        # bf16/half input with fp32 parameters (mixed type)
        m_mix = deepcopy(base_module).to(device, torch.float)
        x_mix = base_input.clone().detach().requires_grad_()
        out_mix = forward_backward(m_mix, x_mix)

        self.assertEqual(out_fp32.to(dtype=dtype), out_low)
        self.assertEqual(out_fp32.to(dtype=dtype), out_mix)
        self.assertEqual(x_fp32.grad.to(dtype=dtype), x_low.grad, atol=1e-1, rtol=1e-1)
        self.assertEqual(x_fp32.grad.to(dtype=dtype), x_mix.grad, atol=1e-1, rtol=1e-1)
| |
def _test_GroupNorm_general(self, device, dtype=torch.float):
    """Check GroupNorm statistics on valid shapes and constructor errors on invalid group counts."""
    # input shape -> number of groups (the group count divides shape[1])
    good_shape_g = {
        (1, 2, 3, 4): 2,
        (2, 3, 10): 3,
        (3, 1, 1, 1, 2): 1,
        (2, 6, 4, 2, 2): 3,
        (1, 256, 1, 1): 32,
    }

    def abs_group_stats(values, batch, groups):
        # Mean of |per-group mean| and of |per-group biased variance|.
        grouped = values.view(batch, groups, -1)
        group_mean = grouped.mean(-1)
        group_var = grouped.var(-1, unbiased=False)
        return torch.abs(group_mean).mean(), torch.abs(group_var).mean()

    for shape, groups in good_shape_g.items():
        for needs_grad in (True, False):
            x = torch.empty(*shape, device=device, dtype=dtype).uniform_(0, 10)
            x.requires_grad_(needs_grad)
            batch, channels = shape[0], shape[1]

            # GN with unit weight and zero bias normalizes to mean 0 and stddev 1.
            gn = nn.GroupNorm(groups, channels, eps=0).to(device, dtype)
            gn.weight.data.fill_(1)
            gn.bias.data.fill_(0)
            output = gn(x)
            got_mean, got_var = abs_group_stats(output, batch, groups)
            self.assertEqual(got_mean, 0, atol=1e-5, rtol=0)
            self.assertEqual(got_var, 1, atol=1e-5, rtol=0)

            output.backward(torch.randn_like(output))
            if output.is_cuda:
                torch.cuda.synchronize()

            # GN applies weight and bias correctly: undoing the affine part
            # must give back normalized groups.
            scale = torch.empty(channels, device=device, dtype=dtype).uniform_(0.2, 2)
            bias = torch.empty(channels, device=device, dtype=dtype).uniform_(0.2, 2)
            gn.weight.data.copy_(scale)
            gn.bias.data.copy_(bias)
            output = gn(x)
            per_channel = output.view(batch, channels, -1)
            de_affined = (per_channel - bias.view(channels, 1)) / scale.view(channels, 1)
            got_mean, got_var = abs_group_stats(de_affined, batch, groups)
            self.assertEqual(got_mean, 0, atol=1e-5, rtol=0)
            self.assertEqual(got_var, 1, atol=1e-5, rtol=0)

    # input shape -> group count that does not divide shape[1]
    bad_shape_g = {
        (1, 2, 3, 4): 3,
        (2, 3, 10): 2,
        (3, 1, 1, 1, 2): 10,
        (2, 6, 4, 2, 2): 4,
    }
    for shape, groups in bad_shape_g.items():
        with self.assertRaises(ValueError):
            nn.GroupNorm(groups, shape[1])
| |
def _test_GroupNorm_cuda_half(self):
    """Run a half-precision GroupNorm forward/backward on CUDA and check the output type string matches the input's."""
    x = torch.zeros(2, 4, 3, 2, requires_grad=True).cuda().half().random_(1, 10)
    group_norm = nn.GroupNorm(2, 4).to("cuda", torch.half)
    y = group_norm(x)
    y.sum().backward()
    self.assertEqualTypeString(y, x)
| |
def _test_GroupNorm_cpu_mixed_dtype(self):
    """Compare CPU GroupNorm across low-precision, mixed and float configurations.

    For each dtype (bfloat16, half), shape/group pair and memory format, the
    helper runs three modules - low-precision input with low-precision
    parameters, low-precision input with float parameters, and float input
    with float parameters - and compares their outputs and gradients.
    """
    def helper(self, size, groups, memory_format, dtype):
        # size[1] is used as the channel count of the GroupNorm modules.
        channels = size[1]
        input = torch.randn(size).cpu().to(dtype=dtype)
        # Three independent leaf inputs holding the same values.
        input_bf1 = input.contiguous(memory_format=memory_format).detach().requires_grad_(True)
        input_bf2 = input_bf1.clone().detach().requires_grad_(True)
        input_f = input_bf1.float().detach().requires_grad_(True)
        m_bf = nn.GroupNorm(groups, channels).cpu().to(dtype=dtype)
        m_f = deepcopy(m_bf).float()
        m_f2 = deepcopy(m_f)
        # bfloat16 input and bfloat16 parameters
        out = m_bf(input_bf1)
        # bfloat16 input and float parameters
        out2 = m_f(input_bf2)
        # float input and float parameters
        out3 = m_f2(input_f)
        self.assertEqual(out, out2, atol=5e-3, rtol=5e-3)
        self.assertEqual(out2.float(), out3, atol=5e-3, rtol=5e-3)
        # One random upstream gradient, replicated for each of the three graphs.
        grad_out = torch.randn(out2.shape).cpu().to(dtype=dtype)
        grad_out_bf1 = grad_out.contiguous(memory_format=memory_format).detach().requires_grad_(True)
        grad_out_bf2 = grad_out_bf1.clone().detach().requires_grad_(True)
        grad_out_f = grad_out_bf2.clone().float().detach().requires_grad_(True)
        # bfloat16/half input grad and float parameters
        out2.backward(grad_out_bf2, retain_graph=True)
        # float input grad and float parameters
        out3.backward(grad_out_f, retain_graph=True)
        # bfloat16/half input grad and bfloat16/half parameters
        out.backward(grad_out_bf1, retain_graph=True)
        # Need higher tolerances atol=1e-4 and rtol=1e-4 on macos
        self.assertEqual(m_f.weight.grad, m_f2.weight.grad, atol=1e-4, rtol=1e-4)
        self.assertEqual(m_f.bias.grad, m_f2.bias.grad, atol=1e-5, rtol=1e-5)
        self.assertEqual(input_bf2.grad.float(), input_f.grad, atol=5e-5, rtol=5e-3)
        # Full bf16/half has lower precision compared with mixed bf16/half and fp32.
        # Use Amp to keep module parameters in acc dtype, i.e. float, for better numerical stability
        atol = None
        rtol = None
        # Tolerances for the fully low-precision comparison depend on dtype.
        if dtype == torch.bfloat16:
            atol = 1e-2
            rtol = 1.2e-1
        else:
            assert dtype == torch.half
            atol = 5e-3
            rtol = 1.5e-2
        self.assertEqual(m_bf.weight.grad, m_f.weight.grad.to(dtype=dtype), atol=atol, rtol=rtol)
        self.assertEqual(m_bf.bias.grad, m_f.bias.grad.to(dtype=dtype), atol=atol, rtol=rtol)
        self.assertEqual(input_bf1.grad, input_bf2.grad, atol=atol, rtol=rtol)

    # Channels-last memory format keyed by the number of input dimensions.
    cl_formats = {4: torch.channels_last, 5: torch.channels_last_3d}
    for dtype in [torch.bfloat16, torch.half]:
        for shape, g in [((1, 8, 4, 3), 2), ((1, 8, 3, 4), 4),
                         ((4, 40, 40, 40), 2), ((4, 8, 40, 40), 4),
                         ((1, 8, 40, 40), 4), ((1, 8, 40, 40), 2),
                         ((1, 8, 50, 50), 2), ((1, 8, 50, 50), 4),
                         ((1, 40, 50, 50), 2), ((1, 9, 3, 4, 5), 3),
                         ((1, 60, 10, 10, 10), 3), ((1, 9, 10, 50, 50), 3),
                         ((1, 60, 10, 50, 50), 3), ((1, 8, 65, 55), 2),
                         ((1, 3, 65, 55), 1), ((1, 3, 20, 20), 1)]:
            # Each case runs in both contiguous and channels-last layouts.
            for is_cl in [False, True]:
                format = cl_formats[len(shape)] if is_cl else torch.contiguous_format
                helper(self, shape, g, format, dtype)
| |
def _test_module_empty_inputs(self, module, inputs):
    """Run ``module`` forward/backward on ``inputs`` and check that all gradients are zero.

    Every input is switched to ``requires_grad`` in place before the call.
    """
    for tensor in inputs:
        tensor.requires_grad_(True)

    result = module(*inputs)
    upstream_grad = torch.rand_like(result)
    result.backward(upstream_grad)

    for param in module.parameters():
        if not param.requires_grad:
            continue
        self.assertEqual(param.grad, torch.zeros_like(param.grad))

    for tensor in inputs:
        self.assertEqual(tensor.grad, torch.zeros_like(tensor))
| |
| @unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'), |
| "Scipy v1.0 and/or numpy not found") |
| @expectedFailureMPS # Unsupported Border padding mode https://github.com/pytorch/pytorch/issues/125098 |
| @tf32_on_and_off() |
| @bf32_on_and_off() |
def test_affine_2d_rotate0(self, device):
    """Compare affine_grid + grid_sample with scipy's affine_transform for a zero-angle rotation."""
    # scipy before 1.0.0 do not support homogeneous coordinate
    # scipy.ndimage.affine_transform, so we need to skip.
    input_size = [1, 1, 3, 3]
    output_size = [1, 1, 5, 5]
    input_ary = np.array(np.random.random(input_size), dtype=np.float32)

    theta, scipy_matrix, scipy_offset = _buildEquivalentAffineTransforms2d(
        device, input_size, output_size, 0.)

    expected = torch.from_numpy(scipy.ndimage.affine_transform(
        input_ary[0, 0],
        scipy_matrix,
        offset=scipy_offset,
        output_shape=output_size[2:],
        order=1,
        mode='nearest',
        prefilter=False))

    grid = F.affine_grid(theta, torch.Size(output_size), align_corners=True)
    sampled = F.grid_sample(
        torch.tensor(input_ary, device=device).to(device),
        grid,
        padding_mode='border',
        align_corners=True,
    ).to('cpu')

    self.assertEqual(expected.mean(), sampled.mean())
    self.assertEqual(expected, sampled.reshape_as(expected))
| |
| @unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'), |
| "Scipy v1.0 and/or numpy not found") |
| @expectedFailureMPS # Unsupported Border padding mode https://github.com/pytorch/pytorch/issues/125098 |
| @tf32_on_and_off(0.001) |
| @bf32_on_and_off(0.001) |
def test_affine_2d_rotate90(self, device):
    """Compare affine_grid + grid_sample with scipy for a quarter-turn over square sizes."""
    # scipy before 1.0.0 do not support homogeneous coordinate
    # scipy.ndimage.affine_transform, so we need to skip.
    quarter_turn = 0.25 * math.pi * 2
    size_pairs = itertools.product(input_size2dsq_(), output_size2dsq_())
    for input_size, output_size in size_pairs:
        input_ary = np.array(np.random.random(input_size), dtype=np.float32)

        theta, scipy_matrix, scipy_offset = _buildEquivalentAffineTransforms2d(
            device, input_size, output_size, quarter_turn)

        expected = torch.from_numpy(scipy.ndimage.affine_transform(
            input_ary[0, 0],
            scipy_matrix,
            offset=scipy_offset,
            output_shape=output_size[2:],
            order=1,
            mode='nearest',
            prefilter=True))

        if input_size == output_size:
            self.assertEqual(expected.mean(), input_ary.mean())
        # The four corners of the result are the rotated corners of the input.
        self.assertEqual(expected[0, 0], input_ary[0, 0, 0, -1])
        self.assertEqual(expected[0, -1], input_ary[0, 0, -1, -1])
        self.assertEqual(expected[-1, -1], input_ary[0, 0, -1, 0])
        self.assertEqual(expected[-1, 0], input_ary[0, 0, 0, 0])

        grid = F.affine_grid(theta, torch.Size(output_size), align_corners=True)
        sampled = F.grid_sample(
            torch.tensor(input_ary, device=device).to(device),
            grid,
            padding_mode='border',
            align_corners=True,
        ).to('cpu')

        self.assertEqual(expected.mean(), sampled.mean())
        self.assertEqual(expected, sampled.reshape_as(expected))
| |
| @unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'), |
| "Scipy v1.0 and/or numpy not found") |
| @expectedFailureMPS # Unsupported Border padding mode https://github.com/pytorch/pytorch/issues/125098 |
| @tf32_on_and_off(0.005) |
| @bf32_on_and_off(0.005) |
def test_affine_2d_rotate45(self, device):
    """Compare affine_grid + grid_sample with scipy for an eighth-turn on a fixed 3x3 pattern."""
    # scipy before 1.0.0 do not support homogeneous coordinate
    # scipy.ndimage.affine_transform, so we need to skip.
    input_size = [1, 1, 3, 3]
    output_size = [1, 1, 3, 3]
    eighth_turn = 0.125 * math.pi * 2

    # Zero image with a 0.5 first row and a single 1.0 in the last corner.
    input_ary = np.array(np.zeros(input_size), dtype=np.float32)
    input_ary[0, 0, 0, :] = 0.5
    input_ary[0, 0, 2, 2] = 1.0

    theta, scipy_matrix, scipy_offset = _buildEquivalentAffineTransforms2d(
        device, input_size, output_size, eighth_turn)

    expected = torch.from_numpy(scipy.ndimage.affine_transform(
        input_ary[0, 0],
        scipy_matrix,
        offset=scipy_offset,
        output_shape=output_size[2:],
        order=1,
        mode='nearest',
        prefilter=False))

    grid = F.affine_grid(theta, torch.Size(output_size), align_corners=True)
    sampled = F.grid_sample(
        torch.tensor(input_ary, device=device).to(device),
        grid,
        padding_mode='border',
        align_corners=True,
    ).to('cpu')

    self.assertEqual(expected, sampled.reshape_as(expected))
| |
| @onlyCUDA |
| @largeTensorTest("60GB", "cpu") |
| @largeTensorTest("16GB", "cuda") |
def test_avg_pool_large_tensor(self, device):
    """Compare AvgPool2d input gradients between a large half tensor on ``device`` and a float CPU copy."""
    # test for https://github.com/pytorch/pytorch/issues/113833
    dev_input = torch.randn(128, 256, 256, 256, dtype=torch.half, device=device, requires_grad=True)
    cpu_input = dev_input.detach().cpu().float()
    pool = torch.nn.AvgPool2d(2)

    dev_out = pool(dev_input)
    cpu_input.requires_grad = True
    dev_out.sum().backward()

    cpu_out = pool(cpu_input)
    cpu_out.sum().backward()

    # workaround for memory usage overhead of assertEqual
    grads_match = torch.allclose(dev_input.grad.cpu(), cpu_input.grad.half())
    self.assertTrue(grads_match)
| |
| @onlyCUDA |
| @largeTensorTest("48GB", "cpu") |
| @largeTensorTest("48GB", "cuda") |
def test_avg_pool_large_tensor2(self, device):
    """Check AvgPool2d output shapes and summed input gradients for a large float tensor on ``device`` versus CPU."""
    # test for https://github.com/pytorch/pytorch/issues/129785
    in_shape = [2048, 64, 209, 159]
    expected_out_shape = [2048, 64, 104, 79]

    dev_input = torch.randn(in_shape, device=device, requires_grad=True, dtype=torch.float)
    cpu_input = dev_input.detach().cpu()
    pool = torch.nn.AvgPool2d([2, 2], [2, 2], [0, 0], False, True, None)

    dev_out = pool(dev_input)
    cpu_input.requires_grad = True
    dev_out.sum().backward()

    cpu_out = pool(cpu_input)
    cpu_out.sum().backward()

    self.assertEqual(dev_out.shape, expected_out_shape)
    self.assertEqual(cpu_out.shape, expected_out_shape)
    # reduce memory usage
    self.assertEqual(dev_input.grad.sum(), cpu_input.grad.sum())
| |
| @unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'), |
| "Scipy v1.0 and/or numpy not found") |
| @expectedFailureMPS # Unsupported Border padding mode https://github.com/pytorch/pytorch/issues/125098 |
| @tf32_on_and_off(0.005) |
| @bf32_on_and_off(0.005) |
def test_affine_2d_rotateRandom(self, device):
    """Compare affine_grid + grid_sample with scipy over all angle/input-size/output-size combinations."""
    # scipy before 1.0.0 do not support homogeneous coordinate
    # scipy.ndimage.affine_transform, so we need to skip.
    # Distinct marker values written to the four corners of every input.
    corner_markers = (((0, 0), 2), ((0, -1), 4), ((-1, 0), 6), ((-1, -1), 8))

    cases = itertools.product(angle_rad_(), input_size2d_(), output_size2d_())
    for angle_rad, input_size, output_size in cases:
        input_ary = np.array(np.random.random(input_size), dtype=np.float32).round(3)
        for (row, col), marker in corner_markers:
            input_ary[0, 0, row, col] = marker

        theta, scipy_matrix, grid_ary = _buildEquivalentAffineTransforms2d(
            device, input_size, output_size, angle_rad)

        expected = torch.from_numpy(scipy.ndimage.affine_transform(
            input_ary[0, 0],
            scipy_matrix,
            output_shape=output_size[2:],
            order=1,
            mode='nearest',
            prefilter=False))

        grid = F.affine_grid(theta, torch.Size(output_size), align_corners=True)
        sampled = F.grid_sample(
            torch.tensor(input_ary, device=device).to(device),
            grid,
            padding_mode='border',
            align_corners=True,
        ).to('cpu')

        grid = grid.to('cpu')

        # Every grid entry must match the numpy grid matrix applied to its index.
        for r, c in itertools.product(range(grid.size(1)), range(grid.size(2))):
            grid_out = np.dot(grid_ary, [r, c, 1])
            self.assertEqual(grid[0, r, c], grid_out[:2], exact_dtype=False)

        self.assertEqual(expected, sampled.reshape_as(expected))
| |
| @unittest.skipIf((not TEST_NUMPY) or (not TEST_SCIPY) or (scipy.__version__ < '1.0.0'), |
| "Scipy v1.0 and/or numpy not found") |
| @expectedFailureMPS # aten::grid_sampler_3d not implemented https://github.com/pytorch/pytorch/issues/77764 |
| @tf32_on_and_off(0.005) |
| @bf32_on_and_off(0.005) |
def test_affine_3d_rotateRandom(self, device):
    """Compare 3D affine_grid + grid_sample with scipy over all angle/axis/size combinations."""
    # scipy before 1.0.0 do not support homogeneous coordinate
    # scipy.ndimage.affine_transform, so we need to skip.
    cases = itertools.product(angle_rad_(), axis_vector_(), input_size3d_(), output_size3d_())
    for angle_rad, axis_vector, input_size, output_size in cases:
        input_ary = np.array(np.random.random(input_size), dtype=np.float32)

        # Mark the eight corners with the distinct values 2..9.
        corners = itertools.product((0, -1), repeat=3)
        for marker, (i, j, k) in enumerate(corners, start=2):
            input_ary[0, 0, i, j, k] = marker

        theta, scipy_matrix, grid_ary = _buildEquivalentAffineTransforms3d(
            device, input_size, output_size, angle_rad, axis_vector)

        expected = torch.from_numpy(scipy.ndimage.affine_transform(
            input_ary[0, 0],
            scipy_matrix,
            output_shape=output_size[2:],
            order=1,
            mode='nearest',
            prefilter=False))

        grid = F.affine_grid(theta, torch.Size(output_size), align_corners=True)
        sampled = F.grid_sample(
            torch.tensor(input_ary, device=device).to(device),
            grid,
            padding_mode='border',
            align_corners=True,
        ).to('cpu')

        grid = grid.to('cpu')

        # Every grid entry must match the numpy grid matrix applied to its index.
        index_ranges = (range(grid.size(1)), range(grid.size(2)), range(grid.size(3)))
        for i, r, c in itertools.product(*index_ranges):
            grid_out = np.dot(grid_ary, [i, r, c, 1])
            self.assertEqual(grid[0, i, r, c], grid_out[:3], exact_dtype=False)

        self.assertEqual(expected, sampled.reshape_as(expected))
| |
| |
| @onlyCUDA |
| @dtypes(torch.float, torch.half) |
def test_batchnorm_large_batch(self, device, dtype):
    """Run BatchNorm2d forward and backward on a batch of 880801 single-element samples."""
    batch_norm = nn.BatchNorm2d(1).to(device, dtype)
    samples = torch.rand(880801, 1, 1, 1, device=device, dtype=dtype)
    total = batch_norm(samples).sum()
    total.backward()
| |
| @dtypesIfCUDA(torch.float, torch.double, torch.half, torch.complex128) |
| @dtypesIfMPS(torch.float, torch.half, torch.complex64) |
| @dtypes(torch.float, torch.double, torch.bfloat16, torch.complex128) |
def test_conv_empty_input(self, device, dtype):
    """Exercise convolutions on zero-batch inputs and check argument validation errors."""
    def on_target(tensor):
        return tensor.to(device=device, dtype=dtype)

    def check_memory_format(x, conv, memory_format):
        # All combinations of converted / unconverted module and input must agree.
        reference = conv(x)
        conv_cl = conv.to(memory_format=memory_format)
        out_cl = conv_cl(x)
        self.assertEqual(reference, out_cl)
        x_cl = x.to(memory_format=memory_format)
        self.assertEqual(out_cl, conv(x_cl))
        self.assertEqual(out_cl, conv_cl(x_cl))

    # channels_last case
    input2d = on_target(torch.randn((0, 4, 20, 20)))
    conv2d = on_target(torch.nn.Conv2d(4, 4, 3, 1))
    check_memory_format(input2d, conv2d, torch.channels_last)

    # channels_last_3d case
    input3d = on_target(torch.randn((0, 4, 20, 20, 20)))
    conv3d = on_target(torch.nn.Conv3d(4, 4, 3, 1))
    check_memory_format(input3d, conv3d, torch.channels_last_3d)

    # non-contiguous case: a strided weight must behave like its contiguous copy
    strided_weight = on_target(torch.rand(4, 8, 3, 3)[:, ::2, :, :])
    bias = on_target(torch.rand(4))
    out = F.conv2d(input2d, strided_weight, bias, (1, 1), 0, (1, 1), 1)
    out_ref = F.conv2d(input2d, strided_weight.contiguous(), bias, (1, 1), 0, (1, 1), 1)
    self.assertEqual(out_ref, out)

    # sigfpe reported in https://github.com/pytorch/pytorch/issues/94125
    with self.assertRaises(RuntimeError):
        inp = torch.empty([1, 1, 1, 0], dtype=dtype, device=device)
        weight = torch.empty([1, 0, 1], dtype=dtype, device=device)
        torch._C._nn.slow_conv3d(inp, weight, 1)

    # Each geometry argument of thnn_conv2d is rejected when passed as an empty list.
    bad_geometry_cases = (
        ("2D kernel_size expected", "kernel_size"),
        ("2D stride expected", "stride"),
        ("2D padding expected", "padding"),
    )
    for message, emptied_arg in bad_geometry_cases:
        geometry = {"kernel_size": [1, 1], "padding": [1, 1], "stride": [1, 1]}
        geometry[emptied_arg] = []
        with self.assertRaisesRegex(RuntimeError, re.escape(message)):
            torch._C._nn.thnn_conv2d(torch.rand([1, 1, 1, 1]), weight=torch.rand([1, 1]), **geometry)
| |
def test_InstanceNorm1d_general(self, device):
    """Run the shared InstanceNorm checks for InstanceNorm1d on a randomly sized input."""
    batch = random.randint(3, 5)
    channels = random.randint(3, 5)
    length = random.randint(8, 10)

    sample = torch.rand(batch, channels, length)
    self._test_InstanceNorm_general(nn.InstanceNorm1d, sample, device)

    # The half-precision variant only runs on CUDA.
    if self.device_type == 'cuda':
        self._test_InstanceNorm_cuda_half(nn.InstanceNorm1d, sample, device)
| |
def test_InstanceNorm2d_general(self, device):
    """Run the shared InstanceNorm checks for InstanceNorm2d on a randomly sized input."""
    batch = random.randint(3, 5)
    channels = random.randint(3, 5)
    width = random.randint(3, 6)
    height = random.randint(6, 8)

    sample = torch.rand(batch, channels, height, width)
    self._test_InstanceNorm_general(nn.InstanceNorm2d, sample, device)

    # The half-precision variant only runs on CUDA.
    if self.device_type == 'cuda':
        self._test_InstanceNorm_cuda_half(nn.InstanceNorm2d, sample, device)
| |
def test_InstanceNorm3d_general(self, device):
    """Run the shared InstanceNorm checks for InstanceNorm3d on a randomly sized input."""
    batch = random.randint(3, 5)
    channels = random.randint(3, 5)
    width = random.randint(2, 5)
    height = random.randint(2, 5)
    depth = random.randint(2, 5)

    sample = torch.rand(batch, channels, height, width, depth)
    self._test_InstanceNorm_general(nn.InstanceNorm3d, sample, device)

    # The half-precision variant only runs on CUDA.
    if self.device_type == 'cuda':
        self._test_InstanceNorm_cuda_half(nn.InstanceNorm3d, sample, device)
| |
| @parametrize_test("instance_norm_cls", [nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d], name_fn=lambda c: c.__name__) |
| @parametrize_test("no_batch_dim", [True, False]) |
| @parametrize_test("affine", [True, False]) |
def test_instancenorm_raises_error_if_input_channels_is_not_num_features(self, device, instance_norm_cls, no_batch_dim, affine):
    """Check InstanceNorm's reaction to an input whose sizes do not match ``num_features``.

    The module is built with 4 features while every non-batch dimension of
    the input has size 2. With ``affine=True`` a ValueError is expected;
    with ``affine=False`` a warning is expected instead.
    """
    inst_norm = instance_norm_cls(4, affine=affine)
    size = [2] * inst_norm._get_no_batch_dim()
    if not no_batch_dim:
        size = [3] + size
    t = torch.randn(size)
    if affine:
        with self.assertRaisesRegex(ValueError, "expected input's size at dim="):
            inst_norm(t)
    else:
        with warnings.catch_warnings(record=True) as w:
            # Record every warning regardless of the ambient filters (e.g.
            # `-W ignore`); otherwise `w` can be empty and `w[0]` would raise
            # IndexError instead of reporting a meaningful failure.
            warnings.simplefilter("always")
            inst_norm(t)
        self.assertIn("which is not used because affine=False", str(w[0].message))
| |
def test_instancenorm_raises_error_if_less_than_one_value_per_channel(self, device):
    # An input of shape (1, 10, 1) has a single value per channel, which
    # InstanceNorm1d must reject with a ValueError.
    #
    # Both the input and the module are created on ``device``. Moving only the
    # result (which is never produced because the call raises) would leave the
    # whole check running on CPU regardless of the device under test.
    x = torch.rand(10, device=device)[None, :, None]
    norm = torch.nn.InstanceNorm1d(10).to(device)
    with self.assertRaises(ValueError):
        norm(x)
| |
def test_instancenorm_raises_error_for_single_spatial_element_during_training(self, device):
    # For each InstanceNorm dimensionality, an input whose spatial extent is a
    # single element must raise ValueError in training mode and be accepted
    # in eval mode.
    BATCH_SIZE = 10
    # Defined locally so the test does not depend on a module-level name.
    NUM_CHANNELS = 3
    norms = [torch.nn.InstanceNorm1d, torch.nn.InstanceNorm2d, torch.nn.InstanceNorm3d]
    for spatial_dims, norm in enumerate(norms, start=1):
        m = norm(NUM_CHANNELS, track_running_stats=True)
        m.to(device)

        # Create an appropriately-sized input with a single spatial element.
        input = torch.randn(BATCH_SIZE, NUM_CHANNELS, *([1] * spatial_dims),
                            device=device)
        with self.assertRaises(ValueError):
            m(input)

        # Single spatial element should be fine in eval.
        m.eval()
        m(input)
| |
def test_LayerNorm_general(self, device):
    # Dispatch the shared LayerNorm helpers: the default-dtype run always,
    # reduced-precision runs on cuda/cpu, plus one device-specific helper each.
    reduced_precision = [torch.half, torch.bfloat16]
    on_cuda = self.device_type == 'cuda'
    on_cpu = self.device_type == 'cpu'

    self._test_LayerNorm_general(device)

    if on_cuda or on_cpu:
        for dtype in reduced_precision:
            self._test_LayerNorm_general(device, dtype=dtype)

    if on_cuda:
        self._test_LayerNorm_cuda_half(device)

    if on_cpu:
        for dtype in reduced_precision:
            self._test_LayerNorm_cpu_mixed_dtype(device, dtype=dtype)
| |
@onlyNativeDeviceTypes
def test_LayerNorm_numeric(self, device):
    # Compare nn.LayerNorm against a straightforward reference computation
    # on a large input, and (on CUDA) against the CPU result.
    def layer_norm_ref(X, gamma, beta, normalized_shape, eps):
        # Flatten every normalized dimension into one trailing axis.
        flat = X.view(-1, np.prod(normalized_shape))
        mu = flat.mean(dim=-1, keepdim=True)
        sigma_sq = flat.var(dim=-1, unbiased=False, keepdim=True)
        normalized = (flat - mu) / torch.sqrt(sigma_sq + eps)
        scaled = normalized * gamma.view(-1) + beta.view(-1)
        return scaled.view(*X.size())

    normalized_shape = [256, 256, 144]
    layer_norm = nn.LayerNorm(normalized_shape).float().to(device)
    X = torch.rand(2, *normalized_shape, dtype=torch.float32,
                   device=device)

    Y = layer_norm(X)
    Y_ref = layer_norm_ref(
        X,
        layer_norm.weight.data,
        layer_norm.bias.data,
        normalized_shape,
        layer_norm.eps,
    )
    self.assertEqual(Y, Y_ref, rtol=0, atol=1e-5)

    if self.device_type != 'cuda':
        return
    # Move the module to CPU and check the CUDA output against it.
    layer_norm.cpu()
    Y_cpu = layer_norm(X.cpu())
    self.assertEqual(Y_cpu, Y, rtol=0, atol=1e-5)
| |
@onlyCPU
def test_glu_bfloat16(self, device):
    # GLU in bfloat16 must keep the dtype for output and gradient, and agree
    # with a float32 run on the same (already rounded) values.
    def test_dtype(fn, input, dtype):
        low = input.detach().clone().to(dtype=dtype).requires_grad_(True)
        # The float reference is derived from the low-precision tensor so both
        # runs start from identical values.
        ref = low.detach().clone().float().requires_grad_(True)

        out_low = fn(low)
        out_low.sum().backward()
        out_ref = fn(ref)
        out_ref.sum().backward()

        self.assertEqual(out_low.dtype, dtype)
        self.assertEqual(low.grad.dtype, dtype)
        self.assertEqual(out_low, out_ref, exact_dtype=False)
        self.assertEqual(low.grad, ref.grad, atol=1e-2, rtol=0, exact_dtype=False)

    for shape in ([1, 3, 1, 6], [1, 3, 1, 128], [1, 3, 256, 256]):
        x = torch.randn(shape, device=device)
        glu = torch.nn.GLU(dim=-1).to(device)
        test_dtype(glu, x, torch.bfloat16)
| |
@onlyNativeDeviceTypes
def test_GroupNorm_general(self, device):
    # Run the shared GroupNorm helper, then the device-specific one.
    self._test_GroupNorm_general(device)

    device_type = self.device_type
    if device_type == 'cuda':
        self._test_GroupNorm_cuda_half()
    elif device_type == 'cpu':
        self._test_GroupNorm_cpu_mixed_dtype()
| |
def test_GroupNorm_raises_error_if_one_value_per_group(self, device):
    # With 10 groups over 10 channels and an input of shape (1, 10, 1), each
    # group holds a single value, which GroupNorm must reject with ValueError.
    #
    # Both the input and the module are created on ``device``. Moving only the
    # result (which is never produced because the call raises) would leave the
    # whole check running on CPU regardless of the device under test.
    x = torch.rand(10, device=device)[None, :, None]
    norm = torch.nn.GroupNorm(10, 10).to(device)
    with self.assertRaises(ValueError):
        norm(x)
| |
def test_GroupNorm_empty(self, device):
    # GroupNorm must handle a zero-sized batch, with and without cuDNN.
    group_norm = torch.nn.GroupNorm(2, 4).to(device)
    empty_batch = torch.randn(0, 4, 2, 2, device=device)

    _test_module_empty_input(self, group_norm, empty_batch)

    if not (self.device_type == 'cuda' and self.has_cudnn()):
        return
    with torch.backends.cudnn.flags(enabled=False):
        _test_module_empty_input(self, group_norm, empty_batch)
| |
@onlyCPU
@dtypes(torch.float, torch.double, torch.bfloat16, torch.half)
def test_groupnorm_nhwc(self, device, dtype):
    # GroupNorm on channels-last inputs must match the contiguous-format
    # result for the output and for all gradients.
    def helper(self, size, groups, memory_format, is_mixed):
        channels = size[1]
        # In the mixed configuration bfloat16 inputs are paired with float32
        # parameters; every other combination uses ``dtype`` for both.
        param_dtype = torch.float if (dtype == torch.bfloat16 and is_mixed) else dtype

        x = torch.randn(size, dtype=dtype, device=device, requires_grad=True)
        x = x.contiguous(memory_format=memory_format)
        x.retain_grad()
        grad_out = torch.randn(size, dtype=dtype, device=device)
        grad_out = grad_out.contiguous(memory_format=memory_format)

        gn = nn.GroupNorm(groups, channels).to(device).to(param_dtype)
        gn.weight.data.uniform_()
        gn.bias.data.uniform_()

        # Reference path: same values and parameters, contiguous layout.
        ref_x = x.detach().clone().contiguous(memory_format=torch.contiguous_format).requires_grad_(True)
        ref_grad_out = grad_out.detach().clone().contiguous(memory_format=torch.contiguous_format)
        ref_gn = nn.GroupNorm(groups, channels).to(device).to(param_dtype)
        ref_gn.load_state_dict(gn.state_dict())

        out = gn(x)
        out.backward(grad_out)
        ref_out = ref_gn(ref_x)
        ref_out.backward(ref_grad_out)

        self.assertTrue(out.is_contiguous(memory_format=memory_format))
        self.assertTrue(ref_out.is_contiguous(memory_format=torch.contiguous_format))
        self.assertEqual(out, ref_out)

        # parameters in bfloat16/Half is not recommended
        tolerances = dict(atol=5e-4, rtol=8e-3)
        self.assertEqual(gn.weight.grad, ref_gn.weight.grad, **tolerances)
        self.assertEqual(gn.bias.grad, ref_gn.bias.grad, **tolerances)
        self.assertEqual(x.grad, ref_x.grad, **tolerances)

    configs = [
        ((4, 8, 10, 10), 4, torch.channels_last),
        ((2, 30, 9, 9), 3, torch.channels_last),
        ((4, 8, 40, 40), 4, torch.channels_last),
        ((4, 40, 40, 40), 2, torch.channels_last),
        ((2, 30, 50, 50), 3, torch.channels_last),
        ((2, 60, 50, 50), 3, torch.channels_last),
        ((2, 9, 7, 11, 15), 3, torch.channels_last_3d),
        ((2, 9, 7, 200, 15), 3, torch.channels_last_3d),
        ((2, 60, 7, 200, 15), 3, torch.channels_last_3d),
    ]
    for is_mixed in [True, False]:
        for size, groups, memory_format in configs:
            helper(self, size, groups, memory_format, is_mixed)
| |
@onlyNativeDeviceTypes
def test_GroupNorm_memory_format(self, device):
    # Tests for regression reported in https://github.com/pytorch/pytorch/issues/92166

    def helper(input_format, grad_format, B=2, C=4, W=4, H=4):
        import copy
        baseline_net = torch.nn.GroupNorm(B, C).to(device=device)
        net = copy.deepcopy(baseline_net)

        baseline_x = torch.rand(B, C, W, H, device=device, requires_grad=True)
        baseline_grad = torch.rand(B, C, W, H, device=device)
        # Same values, but laid out in the memory formats under test.
        x = baseline_x.clone().detach().to(memory_format=input_format).requires_grad_(True)
        grad = baseline_grad.detach().to(memory_format=grad_format)

        y = net(x)
        y.backward(grad)

        baseline_y = baseline_net(baseline_x)
        baseline_y.backward(baseline_grad)

        self.assertEqual(y, baseline_y)
        self.assertEqual(x.grad, baseline_x.grad)

    formats = [torch.contiguous_format, torch.channels_last]
    for input_format, grad_format in product(formats, formats):
        helper(input_format, grad_format)
| |
@onlyNativeDeviceTypes
def test_GroupNorm_numeric(self, device):
    # Compare nn.GroupNorm against a straightforward reference computation
    # on a large input, and (on CUDA) against the CPU result.
    def group_norm_ref(X, gamma, beta, groups, channels, eps):
        n = X.size()[0]
        grouped = X.view(n, groups, -1)
        mu = grouped.mean(dim=-1, keepdim=True)
        sigma_sq = grouped.var(dim=-1, unbiased=False, keepdim=True)
        normalized = (grouped - mu) / torch.sqrt(sigma_sq + eps)
        # Regroup per channel so the affine parameters broadcast correctly.
        per_channel = normalized.view(n, channels, -1)
        per_channel = per_channel * gamma.view(channels, 1) + beta.view(channels, 1)
        return per_channel.view(*X.size())

    batch_size, groups, channels = 1, 2, 8
    group_norm = nn.GroupNorm(groups, channels).float().to(device)
    X = torch.rand(batch_size, channels, 256, 256, 72,
                   dtype=torch.float32, device=device)

    Y = group_norm(X)
    Y_ref = group_norm_ref(
        X,
        group_norm.weight.data,
        group_norm.bias.data,
        groups,
        channels,
        group_norm.eps,
    )
    self.assertEqual(Y, Y_ref, rtol=0, atol=1e-5)

    if self.device_type != 'cuda':
        return
    # Move the module to CPU and check the CUDA output against it.
    group_norm.cpu()
    Y_cpu = group_norm(X.cpu())
    self.assertEqual(Y_cpu, Y, rtol=0, atol=1e-5)
| |
@onlyNativeDeviceTypes
@dtypes(torch.float64, torch.complex128)
def test_pad(self, device, dtype):
    # Invalid circular padding values must raise.
    inputs = torch.randn(1, 1, 4, device=device, dtype=dtype, requires_grad=True)
    invalid_circular = [
        (5, 4),    # wraps around more than once
        (3, 6),    # wraps around more than once
        (-3, -2),  # negative padding results in a negative output shape
    ]
    for padding in invalid_circular:
        with self.assertRaises(RuntimeError):
            F.pad(inputs, padding, mode='circular')

    # Reflection padding must raise when pad >= input size.
    expected_err_msg = r"Padding size should be less than the corresponding input dimension"
    inputs = torch.randn(1, 1, 2, 3, device=device, dtype=dtype)
    with self.assertRaisesRegex(RuntimeError, expected_err_msg):
        F.pad(inputs, (1, 1, 3, 0), mode='reflect')
    inputs = torch.randn(1, 1, 2, device=device, dtype=dtype)
    with self.assertRaisesRegex(RuntimeError, expected_err_msg):
        F.pad(inputs, (2, 1), mode='reflect')

    # pad must not return a view into the input tensor: overwriting the
    # output with 4 must leave the input (values in [0, 1)) untouched.
    inputs = torch.rand(1, 3, 4, 4, device=device, dtype=dtype)
    for mode in ('constant', 'reflect', 'replicate', 'circular'):
        for padding in ((0, 0, 0, 0), (0, 0, -1, -1)):
            out = F.pad(inputs, padding, mode=mode)
            out.fill_(4)
            self.assertTrue(torch.all(torch.abs(inputs) < 2))
| |
@onlyNativeDeviceTypes
@dtypes(torch.float64, torch.complex128)
def test_ReplicationPad_empty(self, device, dtype):
    # A zero-sized batch dimension is accepted by every ReplicationPad module.
    empty_batch_cases = [
        (torch.nn.ReplicationPad1d(3), torch.randn(0, 3, 10, device=device, dtype=dtype)),
        (torch.nn.ReplicationPad2d(3), torch.randn(0, 3, 10, 10, device=device, dtype=dtype)),
        (torch.nn.ReplicationPad3d(3), torch.randn(0, 3, 10, 10, 10, device=device, dtype=dtype)),
    ]
    for mod, inp in empty_batch_cases:
        _test_module_empty_input(self, mod, inp, check_size=False)

    # A zero-sized non-batch dimension is rejected.
    empty_channel_cases = [
        ('Expected 2D or 3D', torch.nn.ReplicationPad1d, 2, (3, 0, 10)),
        ('Expected 3D or 4D', torch.nn.ReplicationPad2d, (2, 2, 2, 2), (43, 0, 10, 10)),
        ('Expected 4D or 5D', torch.nn.ReplicationPad3d, (2, 2, 2, 2, 2, 2), (3, 0, 10, 10, 10)),
    ]
    for message, pad_cls, padding, shape in empty_channel_cases:
        with self.assertRaisesRegex(RuntimeError, message):
            mod = pad_cls(padding)
            inp = torch.randn(*shape, device=device, dtype=dtype)
            mod(inp)

    # The low-level ops reject a padding list of the wrong length.
    bad_padding_cases = [
        ('padding size is expected to be 2', torch._C._nn.replication_pad1d),
        ('padding size is expected to be 4', torch._C._nn.replication_pad2d),
        ('padding size is expected to be 6', torch._C._nn.replication_pad3d),
    ]
    for message, pad_fn in bad_padding_cases:
        with self.assertRaisesRegex(RuntimeError, message):
            pad_fn(torch.randn([2]), padding=[])
| |
@expectedFailureMPS  # TODO(hvaara): Investigate as possible bug.
def test_ReplicationPad1d_large(self, device):
    # Check ReplicationPad1d forward values and backward gradients on inputs
    # with a very large (65736) batch or channel dimension.
    left, right = 3, 4
    for shape in ([2, 65736, 4], [65736, 2, 4]):
        x = torch.randn(shape, device=device, requires_grad=True)
        pad = torch.nn.ReplicationPad1d((left, right))

        # forward: the centre is the input, the borders repeat its end values
        out = pad(x)
        self.assertEqual(out[:, :, left:-right], x)

        left_border = out[:, :, :left]
        self.assertEqual(left_border, x[:, :, :1].expand_as(left_border))
        right_border = out[:, :, -right:]
        self.assertEqual(right_border, x[:, :, -1:].expand_as(right_border))

        # backward: interior gradients pass through unchanged, while the two
        # end elements accumulate the gradient of everything replicated from them
        g = torch.randn_like(out)
        out.backward(g)
        grad = x.grad
        self.assertEqual(grad[:, :, 1:-1], g[:, :, left + 1:-right - 1])

        self.assertEqual(grad[:, :, 0], g[:, :, :left + 1].sum(-1))
        self.assertEqual(grad[:, :, -1], g[:, :, -right - 1:].sum(-1))
| |
@expectedFailureMPS  # TODO(hvaara): Investigate as possible bug.
def test_ReplicationPad2d_large(self, device):
    # Check ReplicationPad2d forward values and backward gradients on inputs
    # with a very large (65736) batch or channel dimension.
    left, right, top, bottom = 3, 4, 5, 6

    # Slices into the padded output / its gradient.
    centre_rows = slice(top, -bottom)
    centre_cols = slice(left, -right)
    inner_rows = slice(top + 1, -bottom - 1)
    inner_cols = slice(left + 1, -right - 1)
    first_rows = slice(None, top + 1)
    last_rows = slice(-bottom - 1, None)
    first_cols = slice(None, left + 1)
    last_cols = slice(-right - 1, None)

    for shape in ([2, 65736, 4, 4], [65736, 2, 4, 4]):
        x = torch.randn(shape, device=device, requires_grad=True)
        pad = torch.nn.ReplicationPad2d((left, right, top, bottom))

        # forward center
        out = pad(x)
        self.assertEqual(out[:, :, centre_rows, centre_cols], x)

        # forward edge, then forward corner: every padded region must equal
        # the nearest input column / row / corner element, broadcast.
        replicated = [
            (out[:, :, centre_rows, :left], x[:, :, :, :1]),
            (out[:, :, centre_rows, -right:], x[:, :, :, -1:]),
            (out[:, :, :top, centre_cols], x[:, :, :1, :]),
            (out[:, :, -bottom:, centre_cols], x[:, :, -1:, :]),
            (out[:, :, first_rows, first_cols], x[:, :, :1, :1]),
            (out[:, :, first_rows, last_cols], x[:, :, :1, -1:]),
            (out[:, :, last_rows, first_cols], x[:, :, -1:, :1]),
            (out[:, :, last_rows, last_cols], x[:, :, -1:, -1:]),
        ]
        for padded, source in replicated:
            self.assertEqual(padded, source.expand_as(padded))

        # backward center, edge
        g = torch.randn_like(out)
        out.backward(g)
        grad = x.grad
        self.assertEqual(grad[:, :, 1:-1, 1:-1], g[:, :, inner_rows, inner_cols])

        self.assertEqual(grad[:, :, 1:-1, 0], g[:, :, inner_rows, first_cols].sum(-1))
        self.assertEqual(grad[:, :, 1:-1, -1], g[:, :, inner_rows, last_cols].sum(-1))
        self.assertEqual(grad[:, :, 0, 1:-1], g[:, :, first_rows, inner_cols].sum(-2))
        self.assertEqual(grad[:, :, -1, 1:-1], g[:, :, last_rows, inner_cols].sum(-2))

        # backward corner
        self.assertEqual(grad[:, :, 0, 0], g[:, :, first_rows, first_cols].sum((-2, -1)))
        self.assertEqual(grad[:, :, 0, -1], g[:, :, first_rows, last_cols].sum((-2, -1)))
        self.assertEqual(grad[:, :, -1, 0], g[:, :, last_rows, first_cols].sum((-2, -1)))
        self.assertEqual(grad[:, :, -1, -1], g[:, :, last_rows, last_cols].sum((-2, -1)))
| |
@largeTensorTest("6GB")
def test_ReplicationPad3d_large(self, device):
    # Check the centre region of ReplicationPad3d (forward and backward) on
    # inputs with a very large (65736) batch or channel dimension.
    left, right, top, bottom, front, back = 3, 4, 5, 6, 7, 8

    for shape in ([1, 65736, 2, 2, 2], [65736, 1, 2, 2, 2]):
        x = torch.randn(shape, device=device, requires_grad=True)
        pad = torch.nn.ReplicationPad3d((left, right, top, bottom, front, back))

        # forward center
        out = pad(x)
        self.assertEqual(out[:, :, front:-back, top:-bottom, left:-right], x)

        # backward center
        g = torch.randn_like(out)
        out.backward(g)
        expected_inner = g[:, :, front + 1:-back - 1, top + 1:-bottom - 1, left + 1:-right - 1]
        self.assertEqual(x.grad[:, :, 1:-1, 1:-1, 1:-1], expected_inner)
| |
@onlyNativeDeviceTypes
def test_Bilinear_empty(self, device):
    # Bilinear must run forward and backward on inputs with a zero-sized
    # leading dimension, leaving empty inputs and empty gradients.
    bilinear = torch.nn.Bilinear(20, 30, 40).to(device)
    left = torch.randn(0, 10, 20, requires_grad=True, device=device)
    right = torch.randn(0, 10, 30, requires_grad=True, device=device)

    bilinear(left, right).sum().backward()

    for operand in (left, right):
        self.assertEqual(operand, torch.zeros_like(operand))

    for operand in (left, right):
        self.assertEqual(operand.grad, torch.zeros_like(operand))
| |
@expectedFailureMeta  # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1]
@onlyNativeDeviceTypes
def test_TransformerEncoderLayer_empty(self, device):
    # TransformerEncoderLayer must accept an empty batch in both layouts,
    # in training mode and in eval/no-grad (inference) mode.
    layouts = [(True, (0, 10, 512)),
               (False, (10, 0, 512))]
    for training in (True, False):
        for batch_first, input_shape in layouts:
            input = torch.rand(*input_shape, device=device, dtype=torch.double)
            encoder_layer = nn.TransformerEncoderLayer(
                d_model=512, nhead=8, batch_first=batch_first, dtype=torch.double).to(device)

            if training:
                _test_module_empty_input(self, encoder_layer, input, check_size=False)
                continue

            encoder_layer = encoder_layer.eval()
            with torch.no_grad():
                _test_module_empty_input(self, encoder_layer, input, check_size=False, inference=True)

            if not batch_first or TEST_WITH_CROSSREF:
                continue

            with torch.no_grad():
                # A NestedTensor with no tensors inside it doesn't have dim 3 (or dim
                # 2, for that matter) so it can't hit the fast path, nor can we give a
                # result.
                with self.assertRaisesRegex(
                        AssertionError, 'MultiheadAttention does not support NestedTensor outside'):
                    nt = torch.nested.nested_tensor([], device=device)
                    _test_module_empty_input(self, encoder_layer, nt, check_size=False, inference=True)

                nt = torch.nested.nested_tensor([torch.rand(0, 512, device=device, dtype=torch.double)], device=device)
                _test_module_empty_input(self, encoder_layer, nt, check_size=False, inference=True)
| |
@expectedFailureMeta  # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1]
@onlyNativeDeviceTypes
def test_TransformerEncoder_empty(self, device):
    # A six-layer TransformerEncoder must accept an empty batch in both layouts.
    layouts = ((True, (0, 10, 512)), (False, (10, 0, 512)))
    for batch_first, input_shape in layouts:
        input = torch.rand(*input_shape, device=device, dtype=torch.double)
        layer = nn.TransformerEncoderLayer(
            d_model=512,
            nhead=8,
            batch_first=batch_first,
            dtype=torch.double,
        ).to(device)
        encoder = nn.TransformerEncoder(layer, num_layers=6).to(device)
        _test_module_empty_input(self, encoder, input, check_size=False)
| |
@expectedFailureMeta  # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1]
@onlyNativeDeviceTypes
def test_TransformerDecoderLayer_empty(self, device):
    # TransformerDecoderLayer must accept empty-batch (tgt, memory) pairs in
    # both layouts.
    layouts = (
        (True, (0, 10, 512), (0, 20, 512)),
        (False, (10, 0, 512), (20, 0, 512)),
    )
    for batch_first, memory_shape, tgt_shape in layouts:
        memory = torch.rand(*memory_shape, device=device, dtype=torch.double)
        tgt = torch.rand(*tgt_shape, requires_grad=True, device=device, dtype=torch.double)
        layer = nn.TransformerDecoderLayer(
            d_model=512,
            nhead=8,
            batch_first=batch_first,
            dtype=torch.double,
        ).to(device)
        self._test_module_empty_inputs(layer, [tgt, memory])
| |
@expectedFailureMeta  # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1]
@onlyNativeDeviceTypes
def test_TransformerDecoder_empty(self, device):
    # A six-layer TransformerDecoder must accept empty-batch (tgt, memory)
    # pairs in both layouts.
    layouts = (
        (True, (0, 10, 512), (0, 20, 512)),
        (False, (10, 0, 512), (20, 0, 512)),
    )
    for batch_first, memory_shape, tgt_shape in layouts:
        memory = torch.rand(*memory_shape, device=device, dtype=torch.double)
        tgt = torch.rand(*tgt_shape, requires_grad=True, device=device, dtype=torch.double)
        layer = nn.TransformerDecoderLayer(
            d_model=512,
            nhead=8,
            batch_first=batch_first,
            dtype=torch.double,
        ).to(device)
        decoder = nn.TransformerDecoder(layer, num_layers=6).to(device)
        self._test_module_empty_inputs(decoder, [tgt, memory])
| |
@expectedFailureMeta  # RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, -1]
@onlyNativeDeviceTypes
def test_Transformer_empty(self, device):
    # nn.Transformer must accept src/tgt inputs whose second dimension is empty.
    # NOTE(review): the first tuple element is unpacked but never passed to
    # nn.Transformer, so the model is built with its default batch_first.
    cases = [(True, (10, 0, 512), (20, 0, 512))]
    for _batch_first, src_shape, tgt_shape in cases:
        model = nn.Transformer(nhead=16, num_encoder_layers=12, dtype=torch.double).to(device)
        src = torch.rand(*src_shape, requires_grad=True, device=device, dtype=torch.double)
        tgt = torch.rand(*tgt_shape, requires_grad=True, device=device, dtype=torch.double)
        self._test_module_empty_inputs(model, [src, tgt])
| |
@onlyNativeDeviceTypes
@dtypes(torch.float32, torch.complex64)
def test_ReflectionPad_empty(self, device, dtype):
    # A zero-sized batch dimension is accepted by every ReflectionPad module.
    empty_batch_cases = [
        (torch.nn.ReflectionPad1d(2), torch.randn(0, 3, 10, device=device, dtype=dtype)),
        (torch.nn.ReflectionPad2d(2), torch.randn(0, 3, 10, 10, device=device, dtype=dtype)),
        (torch.nn.ReflectionPad3d(3), torch.randn(0, 3, 10, 10, 10, device=device, dtype=dtype)),
    ]
    for mod, inp in empty_batch_cases:
        _test_module_empty_input(self, mod, inp, check_size=False)

    # A zero-sized non-batch dimension is rejected.
    empty_channel_cases = [
        ('2D or 3D', torch.nn.ReflectionPad1d, 2, (3, 0, 10)),
        ('3D or 4D', torch.nn.ReflectionPad2d, 2, (3, 0, 10, 10)),
        ('4D or 5D', torch.nn.ReflectionPad3d, 3, (3, 0, 10, 10, 10)),
    ]
    for message, pad_cls, padding, shape in empty_channel_cases:
        with self.assertRaisesRegex(RuntimeError, message):
            mod = pad_cls(padding)
            inp = torch.randn(*shape, device=device, dtype=dtype)
            mod(inp)
| |
@onlyCUDA  # Test if CPU and GPU results match
def test_ReflectionPad2d_large(self, device):
    padding = (1, 2, 3, 4)
    for shape in ([2, 65736, 6, 6], [65736, 2, 6, 6]):
        x = torch.randn(shape, device=device, requires_grad=True)
        x_cpu = x.detach().cpu().requires_grad_()

        # Forward results must agree between the device and the CPU.
        out = F.pad(x, padding, mode='reflect')
        out_cpu = F.pad(x_cpu, padding, mode='reflect')
        self.assertEqual(out, out_cpu)

        # Back-propagate the same upstream gradient through both paths.
        g = torch.randn_like(out)
        g_cpu = g.cpu()
        out.backward(g)
        out_cpu.backward(g_cpu)

        self.assertEqual(x.grad, x_cpu.grad)
| |
@onlyNativeDeviceTypes
def test_LocalResponseNorm_empty(self, device):
    # LocalResponseNorm must accept an input with a zero-sized batch dimension.
    lrn = torch.nn.LocalResponseNorm(2).to(device)
    empty_batch = torch.ones(0, 5, 24, 24, device=device)
    _test_module_empty_input(self, lrn, empty_batch, check_size=False)
| |
@onlyCUDA  # Test if CPU and GPU results match
def test_ReflectionPad3d_large(self, device):
    padding = (1, 2, 3, 4, 5, 6)
    for shape in ([2, 1000, 7, 7, 7], [1000, 2, 7, 7, 7]):
        x = torch.randn(shape, device=device, requires_grad=True)
        x_cpu = x.detach().cpu().requires_grad_()

        # Forward results must agree between the device and the CPU.
        out = F.pad(x, padding, mode='reflect')
        out_cpu = F.pad(x_cpu, padding, mode='reflect')
        self.assertEqual(out, out_cpu)

        # Back-propagate the same upstream gradient through both paths.
        g = torch.randn_like(out)
        g_cpu = g.cpu()
        out.backward(g)
        out_cpu.backward(g_cpu)

        self.assertEqual(x.grad, x_cpu.grad)
| |
@onlyNativeDeviceTypes
@dtypes(torch.float, torch.double)
def test_MarginLoss_empty(self, device, dtype):
    # Both margin losses must handle a zero-sized batch, and must reject
    # inputs that are empty along a non-batch dimension.
    def long_ones(*size):
        return torch.ones(*size, device=device).type(torch.long)

    cases = [
        (torch.nn.MultiMarginLoss().to(device),
         torch.randn(0, 10, requires_grad=True, device=device, dtype=dtype),
         long_ones(0)),
        (torch.nn.MultiLabelMarginLoss().to(device),
         torch.randn(0, 10, requires_grad=True, device=device, dtype=dtype),
         long_ones(0, 10)),
    ]
    for mod, x, y in cases:
        mod(x, y).sum().backward()

        self.assertEqual(x, torch.zeros_like(x))
        self.assertEqual(x.grad, torch.zeros_like(x))

        with self.assertRaisesRegex(RuntimeError, 'Expected'):
            x = torch.randn(0, requires_grad=True, device=device, dtype=dtype)
            y = long_ones(10)
            mod(x, y)

        with self.assertRaisesRegex(RuntimeError, 'Expected'):
            x = torch.randn(10, 0, requires_grad=True, device=device, dtype=dtype)
            y = long_ones(10, 0)
            mod(x, y)
| |
@onlyCUDA
def test_MarginLoss_warnings(self, device):
    # A Linear -> MultiMarginLoss forward/backward pass must write nothing
    # to stderr.
    linear = torch.nn.Linear(128, 22, device=device)
    criterion = torch.nn.MultiMarginLoss()
    features = torch.rand((56, 128), device=device)
    targets = torch.randint(22, (56,), device=device)

    captured = io.StringIO()
    with contextlib.redirect_stderr(captured):
        logits = linear(features)
        value = criterion(logits, targets)
        value.backward()

    self.assertTrue(len(captured.getvalue()) == 0)
| |
@onlyNativeDeviceTypes
def test_Unfold_empty(self, device):
    # Unfold accepts a zero-sized batch dimension...
    empty_batch = torch.randn(0, 3, 3, 4, device=device)
    unfold = torch.nn.Unfold(kernel_size=(2, 3)).to(device)
    _test_module_empty_input(self, unfold, empty_batch, check_size=False)

    # ...but rejects a zero-sized channel dimension.
    with self.assertRaisesRegex(RuntimeError, 'Expected 3D or 4D'):
        empty_channels = torch.randn(3, 0, 3, 4, device=device)
        unfold = torch.nn.Unfold(kernel_size=(2, 3)).to(device)
        unfold(empty_channels)
| |
@onlyCUDA
@dtypes(torch.float, torch.double)
@tf32_on_and_off(0.005)
def test_rnn_fused(self, device, dtype):
    # With cuDNN disabled, GRU and LSTM on ``device`` must match the CPU
    # modules in outputs, final states and all gradients.

    def copy_rnn(rnn1, rnn2):
        # Overwrite rnn1's weights with rnn2's, layer by layer.
        for dst_layer, src_layer in zip(rnn1.all_weights, rnn2.all_weights):
            for dst, src in zip(dst_layer, src_layer):
                dst.data.copy_(src.data)

    def check_rnn_grads(rnn1, rnn2):
        for layer1, layer2 in zip(rnn1.all_weights, rnn2.all_weights):
            for w1, w2 in zip(layer1, layer2):
                self.assertEqual(w1.grad, w2.grad, atol=5e-5, rtol=0)

    input_size = 10
    hidden_size = 6
    num_layers = 2
    seq_length = 7
    batch = 6
    input_val = torch.randn(seq_length, batch, input_size, dtype=dtype)
    grad_output = torch.randn(seq_length, batch, hidden_size, dtype=dtype)
    hx_val = torch.randn(num_layers, batch, hidden_size, dtype=dtype)
    grad_hy = torch.randn(num_layers, batch, hidden_size, dtype=dtype)

    with torch.backends.cudnn.flags(enabled=False, allow_tf32=None):
        for module, bias in product((nn.GRU, nn.LSTM), (True, False)):
            rnn = module(input_size, hidden_size, num_layers, bias=bias).to(dtype)
            rnn_device = module(input_size, hidden_size, num_layers, bias=bias).to(device, dtype)
            copy_rnn(rnn, rnn_device)

            is_lstm = isinstance(rnn, nn.LSTM)

            # Initial states: LSTM takes an (h, c) pair, with c offset by 1.
            h0 = hx_val.clone().requires_grad_(True)
            h0_device = hx_val.clone().to(device).requires_grad_(True)
            if is_lstm:
                c0 = hx_val.clone().add(1).requires_grad_(True)
                c0_device = hx_val.clone().to(device).add(1).requires_grad_(True)
                hx, hx_device = (h0, c0), (h0_device, c0_device)
            else:
                hx, hx_device = h0, h0_device

            inp = input_val.clone().requires_grad_(True)
            inp_cu = input_val.clone().to(device).requires_grad_(True)
            output1, hy1 = rnn(inp, hx)
            output2, hy2 = rnn_device(inp_cu, hx_device)

            # Back-propagate identical upstream gradients through both paths.
            if is_lstm:
                roots1 = [output1, hy1[0], hy1[1]]
                roots2 = [output2, hy2[0], hy2[1]]
                upstream = [grad_output, grad_hy, grad_hy + 1]
            else:
                roots1 = [output1, hy1]
                roots2 = [output2, hy2]
                upstream = [grad_output, grad_hy]
            torch.autograd.backward(roots1, upstream)
            torch.autograd.backward(roots2, [g.to(device) for g in upstream])

            self.assertEqual(output1, output2)
            self.assertEqual(hy1, hy2)

            check_rnn_grads(rnn, rnn_device)
            self.assertEqual(inp.grad, inp_cu.grad)
            if is_lstm:
                for state, state_device in zip(hx, hx_device):
                    self.assertEqual(state.grad, state_device.grad)
            else:
                self.assertEqual(hx.grad, hx_device.grad)
| |
@dtypesIfMPS(torch.float)
@dtypes(torch.double)
def test_BatchNorm_empty(self, device, dtype):
    # BatchNorm2d on a zero-sized batch must leave the running statistics at
    # their initial values and produce zero parameter gradients.
    batch_norm = torch.nn.BatchNorm2d(3).to(device)
    empty_batch = torch.randn(0, 3, 2, 2, device=device, dtype=dtype)

    _test_module_empty_input(self, batch_norm, empty_batch)
    if self.device_type == 'cuda' and self.has_cudnn():
        with torch.backends.cudnn.flags(enabled=False):
            _test_module_empty_input(self, batch_norm, empty_batch)

    zeros = torch.tensor([0., 0, 0], device=device)
    ones = torch.tensor([1., 1, 1], device=device)
    self.assertEqual(batch_norm.running_mean, zeros)
    self.assertEqual(batch_norm.running_var, ones)
    self.assertEqual(batch_norm.weight.grad, zeros)
    self.assertEqual(batch_norm.bias.grad, zeros)
| |
@onlyCUDA
@largeTensorTest('16GB')
def test_prelu_backward_32bit_indexing(self, device):
    # Smoke test: PReLU forward and backward on a half tensor with 2**31
    # elements must complete without error.
    #
    # The module is placed on ``device`` (not the current CUDA device via
    # ``.cuda()``) so it is always on the same GPU as the input.
    m = torch.nn.PReLU().to(device).half()
    input_ = torch.ones((1024, 1024, 1024, 2), dtype=torch.half, device=device)
    output = m(input_)
    output.backward(input_)
| |
def test_linear_empty(self, device):
    # Linear must accept an input with a zero-sized batch dimension.
    linear = torch.nn.Linear(7, 7).to(device)
    empty_batch = torch.randn(0, 7, device=device)
    _test_module_empty_input(self, linear, empty_batch)
| |
def test_one_hot(self, device):
    indices = torch.tensor([3, 4, 1, 0], device=device)

    # cuda throws device assert for invalid data
    # xla ignores out of bound indices
    if self.device_type not in ('cuda', 'mps', 'xla'):
        # negative class index
        with self.assertRaises(RuntimeError):
            F.one_hot(torch.tensor([3, 4, -1, 0], device=device), -1)

        # class index >= num_classes
        with self.assertRaises(RuntimeError):
            F.one_hot(indices, 3)

    # Omitting num_classes and passing -1 both yield 5 columns here.
    five_wide = torch.tensor([[0, 0, 0, 1, 0],
                              [0, 0, 0, 0, 1],
                              [0, 1, 0, 0, 0],
                              [1, 0, 0, 0, 0]], device=device)
    self.assertEqual(F.one_hot(indices), five_wide)
    self.assertEqual(F.one_hot(indices, -1), five_wide)

    # An explicit num_classes larger than needed adds zero columns.
    six_wide = torch.tensor([[0, 0, 0, 1, 0, 0],
                             [0, 0, 0, 0, 1, 0],
                             [0, 1, 0, 0, 0, 0],
                             [1, 0, 0, 0, 0, 0]], device=device)
    self.assertEqual(F.one_hot(indices, 6), six_wide)

    # 2-D indices gain a trailing class dimension.
    nested = F.one_hot(torch.tensor([[3, 4], [1, 0]], device=device))
    nested_expected = torch.tensor([[[0, 0, 0, 1, 0],
                                     [0, 0, 0, 0, 1]],
                                    [[0, 1, 0, 0, 0],
                                     [1, 0, 0, 0, 0]]], device=device)
    self.assertEqual(nested, nested_expected)

    # 0-D index
    scalar = F.one_hot(torch.tensor(4, device=device))
    self.assertEqual(scalar, torch.tensor([0, 0, 0, 0, 1], device=device))

    # Empty indices work when num_classes is given explicitly...
    empty_indices = torch.empty([4, 0], dtype=torch.long, device=device)
    empty_result = F.one_hot(empty_indices, 100)
    self.assertEqual(empty_result, torch.empty([4, 0, 100], dtype=torch.long))

    # ...but num_classes cannot be inferred from them.
    with self.assertRaises(RuntimeError):
        F.one_hot(torch.empty([4, 0], dtype=torch.long, device=device))

    # num_classes below -1 is invalid.
    with self.assertRaises(RuntimeError):
        F.one_hot(indices, -2)
| |
@expectedFailureMPS  # NotImplementedError: aten::rrelu_with_noise https://github.com/pytorch/pytorch/issues/77764
def test_nn_empty(self, device):
    # One off tests to ensure scalars from nn.yaml are properly applied:
    # each activation must map an empty input to an empty output of the
    # same shape.
    def verify_scalars(input, output):
        self.assertEqual(input.shape, output.shape)
        self.assertEqual(0, output.numel())

    # ``(0,)`` is spelled as a real one-element tuple; a bare ``(0)`` is just
    # the int 0 (torch.randn produces the same shape for both).
    for input_shape in [(0,), (0, 2)]:
        for module in [torch.nn.ELU, torch.nn.Hardtanh, torch.nn.LeakyReLU, torch.nn.LogSigmoid,
                       torch.nn.RReLU, torch.nn.Softshrink, torch.nn.Softplus, torch.nn.Sigmoid,
                       torch.nn.Tanh]:
            input = torch.randn(input_shape, device=device, requires_grad=True)
            m = module()
            output = m(input)
            verify_scalars(input, output)
| |
@expectedFailureMPS  # NotImplementedError: aten::rrelu_with_noise https://github.com/pytorch/pytorch/issues/77764
def test_nn_scalars(self, device):
    # One off tests to ensure scalars from nn.yaml are properly applied
    def verify_scalars(input, output):
        # A 0-dim input must give a 0-dim output, and vice versa.
        is_scalar = input.dim() == 0
        if is_scalar:
            self.assertEqual((), output.shape)
        else:
            self.assertNotEqual((), output.shape)
        output.sum().backward()
        self.assertEqual(input.shape, input.grad.shape)

    activations = [
        torch.nn.ELU, torch.nn.Hardtanh, torch.nn.LeakyReLU, torch.nn.LogSigmoid,
        torch.nn.RReLU, torch.nn.Softshrink, torch.nn.Softplus, torch.nn.Sigmoid,
        torch.nn.Tanh,
    ]
    for input_shape in [(5, 6), ()]:
        for activation_cls in activations:
            input = torch.randn(input_shape, device=device, requires_grad=True)
            output = activation_cls()(input)
            verify_scalars(input, output)
| |
def test_nn_scalars_reductions(self, device):
    # One off tests to ensure scalars from nn.yaml are properly applied to
    # loss modules: any real reduction ('mean'/'sum') or a 0-dim input must
    # give a 0-dim loss, while reduction='none' on a 2-D input must not.
    def verify_reduction_scalars(input, reduction, output):
        if reduction != 'none' or input.dim() == 0:
            self.assertEqual((), output.shape)
        else:
            self.assertNotEqual((), output.shape)
        # Reduce to a scalar so backward() needs no explicit gradient.
        output.sum().backward()
        self.assertEqual(input.shape, input.grad.shape)

    # The sigmoid module is stateless, so one instance serves every iteration.
    sigmoid = nn.Sigmoid()
    losses = [torch.nn.BCELoss, torch.nn.L1Loss, torch.nn.MSELoss,
              torch.nn.SmoothL1Loss, torch.nn.SoftMarginLoss]

    for input_shape in [(5, 6), ()]:
        for reduction in ['none', 'mean', 'sum']:
            for module in losses:
                # Create the leaf input exactly once; an earlier duplicate
                # allocation was overwritten before ever being used.
                input = torch.randn(input_shape, device=device, requires_grad=True)
                # random_(2) fills the target with values drawn from {0, 1}.
                target = torch.empty(input_shape, device=device).random_(2)

                m = module(reduction=reduction)
                # The prediction is squashed through a sigmoid before the loss
                # (presumably so BCELoss receives probabilities).
                output = m(sigmoid(input), target)
                verify_reduction_scalars(input, reduction, output)
| |
| # verify that bogus reduction strings are errors |
| @onlyNativeDeviceTypes |
def test_invalid_reduction_strings(self, device):
    # Each functional loss is called twice: once with reduction='none', which
    # only has to run, and once with a bogus reduction string, which has to
    # raise ValueError.
    real_in = torch.randn(3, 5, requires_grad=True, device=device)
    complex_in = torch.randn(3, 5, requires_grad=True, device=device, dtype=torch.cfloat)
    class_target = torch.tensor([1, 0, 4], device=device)
    variance = torch.ones(size=real_in.size(), requires_grad=True, device=device)

    for reduction in ['none', 'invalid']:
        def run(fn):
            # The callables close over ``reduction`` and are invoked right
            # away, so they always see the current loop value.
            if reduction != 'invalid':
                fn()
                return
            with self.assertRaises(ValueError):
                fn()

        # classification losses
        run(lambda: F.nll_loss(real_in, class_target, reduction=reduction))
        run(lambda: F.cross_entropy(real_in, class_target, reduction=reduction))

        # elementwise / regression losses (input is reused as its own target)
        run(lambda: F.kl_div(real_in, real_in, reduction=reduction))
        run(lambda: F.huber_loss(real_in, real_in, reduction=reduction))
        run(lambda: F.smooth_l1_loss(real_in, real_in, reduction=reduction))
        run(lambda: F.l1_loss(real_in, real_in, reduction=reduction))
        run(lambda: F.l1_loss(complex_in, complex_in, reduction=reduction))
        run(lambda: F.mse_loss(real_in, real_in, reduction=reduction))
        run(lambda: F.hinge_embedding_loss(real_in, real_in, reduction=reduction))
        run(lambda: F.poisson_nll_loss(real_in, real_in, reduction=reduction))
        run(lambda: F.gaussian_nll_loss(real_in, real_in, variance, reduction=reduction))
        run(lambda: F.binary_cross_entropy(
            torch.sigmoid(real_in), real_in.gt(0).to(torch.get_default_dtype()), reduction=reduction))
        run(lambda: F.binary_cross_entropy_with_logits(real_in, real_in, reduction=reduction))

        int_zeros = torch.zeros_like(real_in).to(torch.int64)
        run(lambda: F.multilabel_soft_margin_loss(real_in, int_zeros, reduction=reduction))

        # margin / embedding losses
        run(lambda: F.triplet_margin_loss(real_in, real_in, real_in, reduction=reduction))
        run(lambda: F.triplet_margin_with_distance_loss(real_in, real_in, real_in, reduction=reduction))
        run(lambda: F.margin_ranking_loss(real_in, real_in, real_in.sign(), reduction=reduction))
        run(lambda: F.cosine_embedding_loss(real_in, real_in, real_in[:, 0].sign(), reduction=reduction))

        # CTC needs its own sequence-shaped inputs
        ctc_log_probs = torch.randn(50, 16, 20, requires_grad=True, device=device).log_softmax(2)
        ctc_targets = torch.randint(1, 20, (16, 30), dtype=torch.long, device=device)
        ctc_input_lengths = torch.full((16,), 50, dtype=torch.long, device=device)
        ctc_target_lengths = torch.randint(10, 30, (16,), dtype=torch.long, device=device)
        run(lambda: F.ctc_loss(
            ctc_log_probs, ctc_targets, ctc_input_lengths, ctc_target_lengths, reduction=reduction))

        # FIXME: should we allow derivatives on these?
        run(lambda: F.soft_margin_loss(real_in, real_in.sign().detach(), reduction=reduction))
| |
| @onlyNativeDeviceTypes |
def test_smooth_l1_loss_vs_huber_loss(self, device):
    # Compares SmoothL1Loss with HuberLoss for beta equal to, below and above
    # one, across small, large and non-contiguous inputs and all reductions.
    def make_tensor(shape, contiguous=True):
        if contiguous:
            return torch.randn(shape, device=device)
        # Allocate twice the innermost extent and keep every other element:
        # the result has the requested shape but is non-contiguous.
        widened = list(shape)
        widened[-1] *= 2
        return torch.randn(widened, device=device)[..., ::2]

    def compare(input, target, beta, require_equal):
        for reduction in ['mean', 'sum', 'none']:
            smooth_l1_loss = torch.nn.SmoothL1Loss(beta=beta, reduction=reduction)(input, target)
            # beta hyper-parameter is called delta for Huber
            huber_loss = torch.nn.HuberLoss(delta=beta, reduction=reduction)(input, target)

            # When the losses are not required to match, Huber loss should be
            # larger than smooth L1 loss by a factor of beta.
            lhs = smooth_l1_loss if require_equal else smooth_l1_loss * beta
            self.assertEqual(lhs, huber_loss)

    def compare_all_layouts(beta, require_equal):
        layouts = [
            ((2, 2), True),      # the non-vectorized case
            ((64, 64), True),    # the vectorized case (innermost dim > 32)
            ((64, 64), False),   # the non-contiguous case
        ]
        for shape, contiguous in layouts:
            inp = make_tensor(shape, contiguous=contiguous)
            tgt = make_tensor(shape, contiguous=contiguous)
            compare(input=inp, target=tgt, beta=beta, require_equal=require_equal)

    # Equal only when beta is one; unequal for beta below and above one.
    for beta, require_equal in [(1.0, True), (0.5, False), (1.5, False)]:
        compare_all_layouts(beta=beta, require_equal=require_equal)
| |
| @onlyCPU |
def test_smooth_l1_loss_bfloat16(self, device):
    # Runs SmoothL1Loss in bfloat16 and in float32 on the same (bfloat16
    # rounded) data and checks that outputs and input gradients agree and
    # that the low-precision results keep the bfloat16 dtype.
    def check_against_float(fn, input, target, dtype):
        low_in = input.detach().clone().to(dtype=dtype).requires_grad_(True)
        # The float reference starts from the already-rounded values.
        ref_in = low_in.detach().clone().float().requires_grad_(True)
        low_tgt = target.detach().clone().to(dtype=dtype)
        ref_tgt = low_tgt.detach().clone().float()

        low_out = fn(low_in, low_tgt)
        low_out.sum().backward()
        ref_out = fn(ref_in, ref_tgt)
        ref_out.sum().backward()

        self.assertEqual(low_out.dtype, dtype)
        self.assertEqual(low_in.grad.dtype, dtype)
        self.assertEqual(low_out, ref_out, exact_dtype=False)
        self.assertEqual(low_in.grad, ref_in.grad, exact_dtype=False)

    for shape in ([1, 3, 1, 6], [1, 3, 1, 128], [1, 3, 128, 128]):
        x = torch.randn(shape, device=device, requires_grad=True)
        t = torch.randn(shape, device=device)
        criterion = nn.SmoothL1Loss().to(device=device)
        check_against_float(criterion, x, t, torch.bfloat16)
| |
| # We don't want to make propagating NaN a hard requirement on ops, but for |
| # these easy ones, we should make them do so. |
| # MPS: NotImplementedError: aten::rrelu_with_noise_ https://github.com/pytorch/pytorch/issues/77764 |
| # MPS: NotImplementedError: aten::hardshrink.out https://github.com/pytorch/pytorch/issues/77764 |
| @expectedFailureMPS |
def test_nonlinearity_propagate_nan(self, device):
    # Feeds a single NaN through each listed functional nonlinearity and
    # expects NaN back. Errors whose message contains 'not implemented' are
    # tolerated; anything else is re-raised.
    def check(nonlinearity, *args, **kwargs):
        nan_input = torch.tensor([nan], device=device)
        fn = getattr(F, nonlinearity)
        try:
            result = fn(nan_input, *args, **kwargs).item()
            self.assertTrue(math.isnan(result))
        except Exception as e:
            if 'not implemented' not in str(e):
                raise

    in_place = {'inplace': True}
    # (function name, extra positional args, extra keyword args)
    cases = [
        ('relu', (), {}),
        ('relu', (), in_place),
        ('relu6', (), {}),
        ('elu', (), {}),
        ('selu', (), {}),
        ('celu', (), {}),
        ('rrelu', (), {}),
        ('rrelu', (), in_place),
        ('hardtanh', (), {}),
        ('tanh', (), {}),
        ('sigmoid', (), {}),
        ('logsigmoid', (), {}),
        ('hardshrink', (), {}),
        ('tanhshrink', (), {}),
        ('softsign', (), {}),
        ('softmin', (0,), {}),
        ('softmax', (0,), {}),
        ('log_softmax', (0,), {}),
        ('leaky_relu', (0.2,), {}),
        ('threshold', (3, 2), {}),
        ('threshold', (3, 2), in_place),
    ]
    for name, extra_args, extra_kwargs in cases:
        check(name, *extra_args, **extra_kwargs)
| |
| @expectedFailureMPS # TypeError: float64 the MPS framework doesn't support float64 |
| @parametrize_test("mode", ["nearest-exact", "nearest"]) |
def test_upsamplingNearest1d(self, device, mode):
    # Covers 1-D nearest-style resizing: constant inputs in double and uint8,
    # numerical gradient checks for up- and downsampling, and CUDA-vs-CPU
    # agreement when running on CUDA.
    device_type = torch.device(device).type
    # Forward AD does not support XLA because XLA tensors don't have storage
    check_forward_ad = device_type != 'xla'

    def resize_to(size):
        # Builds the single-argument function handed to the gradient checkers.
        return lambda x: F.interpolate(x, size, mode=mode)

    upsample = nn.Upsample(size=4, mode=mode)
    ones_f64 = torch.ones(1, 1, 2, device=device, dtype=torch.double)
    ones_u8 = torch.ones(1, 1, 2, dtype=torch.uint8, device=device)
    with warnings.catch_warnings(record=True) as caught:
        res_f64 = upsample(ones_f64)
        res_u8 = upsample(ones_u8)
    self.assertEqual(torch.ones(1, 1, 4, device=device, dtype=torch.double), res_f64.data)
    self.assertEqual(torch.ones(1, 1, 4, dtype=torch.uint8, device=device), res_u8.data)

    # Checks upsampling
    x = torch.randn(1, 1, 2, requires_grad=True, device=device, dtype=torch.double)
    gradcheck(resize_to(4), [x], check_forward_ad=check_forward_ad)
    gradgradcheck(resize_to(4), [x], check_fwd_over_rev=check_forward_ad)

    # Checks downsampling
    x = torch.randn(1, 1, 20, requires_grad=True, device=device, dtype=torch.double)
    gradcheck(resize_to(11), [x], check_forward_ad=check_forward_ad)
    # NOTE(review): the second-order check resizes to 4 while the first-order
    # check above resizes to 11 -- confirm whether that is intentional.
    gradgradcheck(resize_to(4), [x], check_fwd_over_rev=check_forward_ad)

    # consistency CUDA/CPU check
    if device_type == 'cuda':
        x_cuda = torch.randn(1, 1, 20, device=device, dtype=torch.double)
        x_cpu = x_cuda.cpu()
        # one output size below the input length, one above it
        for size in (4, 24):
            res_cuda = F.interpolate(x_cuda, size, mode=mode)
            res_cpu = F.interpolate(x_cpu, size, mode=mode)
            self.assertEqual(res_cuda.cpu(), res_cpu)
| |
| @parametrize_test("isize, osize", [(20, 11), (10, 15)]) |
def test_upsamplingNearest1d_correctness(self, device, isize, osize):
    # Here we check if output matches OpenCV's INTER_NEAREST-like result
    in_t = torch.arange(isize, dtype=torch.float, device=device)[None, None, :]
    out_t = F.interpolate(in_t, size=(osize, ), recompute_scale_factor=False, mode="nearest")

    # compute expected output as OpenCV: output position o reads the input at
    # int(o * scale); int() truncates, i.e. floors, since the product is >= 0
    scale = 1.0 * isize / osize
    source_index = [int(o * scale) for o in range(osize)]
    expected_out = torch.zeros(1, 1, osize, dtype=torch.float)
    for o, i in enumerate(source_index):
        expected_out[0, 0, o] = in_t[0, 0, i]

    self.assertEqual(out_t, expected_out.to(device=device))
| |
def test_upsamplingNearestExact1d_rescale(self, device):
    # Checks https://github.com/pytorch/pytorch/issues/62237
    isize = 20
    in_t = torch.arange(isize, dtype=torch.float, device=device).unsqueeze(0).unsqueeze(0)

    def rescale(factor):
        return F.interpolate(
            in_t, scale_factor=factor, recompute_scale_factor=False, mode="nearest-exact"
        )

    # A scale factor marginally above 1 must leave the input unchanged.
    # The matching case just below 1 (0.99999) is broken and therefore not
    # exercised. See issue: https://github.com/pytorch/pytorch/issues/62396
    for s in (1.00001, ):
        self.assertEqual(rescale(s), in_t, msg=f"scale: {s}")

    # checks data duplication if output_size == 2 * input_size
    # The matching case just below 2 (1.99999) is broken and therefore not
    # exercised. See issue: https://github.com/pytorch/pytorch/issues/62396
    for s in (2.00001, ):
        # input is [[[0, 1, 2, 3, ..., 19]]]
        # expected out is [[[0, 0, 1, 1, 2, 2, ..., 19, 19]]]
        self.assertEqual(rescale(s), in_t.repeat_interleave(2, dim=-1))
| |
| @skipIfMps # Partially passes https://github.com/pytorch/pytorch/issues/134430 |
| @parametrize_test("isize, osize", [(20, 11), (10, 15)]) |
def test_upsamplingNearestExact1d_correctness(self, device, isize, osize):
    # Here we check if output matches Scikit-Image/Scipy-like result
    # Checks https://github.com/pytorch/pytorch/issues/34808
    in_t = torch.arange(isize, dtype=torch.float, device=device)[None, None, :]
    out_t = F.interpolate(in_t, size=(osize, ), recompute_scale_factor=False, mode="nearest-exact")

    # compute expected output as scikit-image/scipy: output position o reads
    # the input at int((o + 0.5) * scale), i.e. it samples at the centre of
    # each output cell
    scale = 1.0 * isize / osize
    source_index = [int((o + 0.5) * scale) for o in range(osize)]
    expected_out = torch.zeros(1, 1, osize, dtype=torch.float)
    for o, i in enumerate(source_index):
        expected_out[0, 0, o] = in_t[0, 0, i]

    self.assertEqual(out_t, expected_out.to(device=device))
| |
| @expectedFailureMPS # TypeError: the MPS framework doesn't support float64 |
| @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) |
| @parametrize_test("mode", ["nearest", "nearest-exact"]) |
def test_upsamplingNearest2d(self, device, memory_format, mode):
    # Exercises 2-D F.interpolate with the parametrized nearest-style `mode`
    # and `memory_format`: constant-input forward results for double and
    # uint8, memory-format propagation through forward and backward,
    # numerical gradient checks, and (on CUDA only) agreement with CPU.

    # Forward AD does not support XLA because XLA tensors don't have storage
    check_forward_ad = torch.device(device).type != 'xla'

    in_t = torch.ones(1, 2, 2, 2, device=device, dtype=torch.double).contiguous(memory_format=memory_format)
    in_uint8_t = torch.ones(1, 2, 2, 2, dtype=torch.uint8, device=device).contiguous(memory_format=memory_format)
    # Record warnings so the test can assert that interpolation emits none.
    with warnings.catch_warnings(record=True) as w:
        out_t = F.interpolate(in_t, size=4, mode=mode)
        out_uint8_t = F.interpolate(in_uint8_t, size=4, mode=mode)
        self.assertEqual(len(w), 0)
    # Resizing an all-ones input must give an all-ones output of the new size.
    self.assertEqual(torch.ones(1, 2, 4, 4, device=device, dtype=torch.double), out_t)
    self.assertEqual(torch.ones(1, 2, 4, 4, dtype=torch.uint8, device=device), out_uint8_t)
    # Assert that memory format is carried through to the output
    self.assertTrue(out_t.is_contiguous(memory_format=memory_format))

    # test forward when input's height is not same as width
    in_t = torch.ones(1, 2, 2, 1, device=device, dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_()
    out_t = F.interpolate(in_t, size=(4, 2), mode=mode)
    self.assertEqual(torch.ones(1, 2, 4, 2, device=device, dtype=torch.double), out_t)
    self.assertTrue(out_t.is_contiguous(memory_format=memory_format))

    # The gradient w.r.t. the input must keep the requested memory format too.
    out_t.backward(torch.randn_like(out_t))
    self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format))

    # test backward when input's height is not same as width
    input = torch.ones(
        1, 2, 2, 1, requires_grad=True, device=device,
        dtype=torch.double).contiguous(memory_format=memory_format)
    gradcheck(lambda x: F.interpolate(x, size=(4, 2), mode=mode), [input], check_forward_ad=check_forward_ad)
    gradgradcheck(lambda x: F.interpolate(x, size=(4, 2), mode=mode), [input], check_fwd_over_rev=check_forward_ad)

    input = torch.randn(
        1, 2, 2, 2, requires_grad=True, device=device,
        dtype=torch.double).contiguous(memory_format=memory_format)
    # An explicit output size of 4 and scale_factor=2 describe the same resize
    # for a 2x2 input, so both call styles must agree.
    self.assertEqual(
        F.interpolate(input, 4, mode=mode),
        F.interpolate(input, scale_factor=2, mode=mode))
    gradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_forward_ad=check_forward_ad)
    gradgradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_fwd_over_rev=check_forward_ad)

    # Assert that cpu and cuda handle channels_last memory format in the same way
    # https://github.com/pytorch/pytorch/issues/54590
    if torch.device(device).type == 'cuda':
        # Every input shape is combined with every scale factor.
        for shapes, scale_factor in product([
            (2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2)
        ], [0.5, 1.5, 2]):
            a_cuda = torch.randn(
                *shapes, device=device,
                dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_()
            # Independent CPU leaf holding the same values.
            a_cpu = a_cuda.detach().cpu().requires_grad_()

            out_cuda = F.interpolate(a_cuda, scale_factor=scale_factor, mode=mode)
            out_cpu = F.interpolate(a_cpu, scale_factor=scale_factor, mode=mode)

            self.assertEqual(out_cpu.cuda(), out_cuda)

            # Same upstream gradient on both devices so the input grads are comparable.
            g_cuda = torch.randn_like(out_cuda)
            g_cpu = g_cuda.cpu()

            out_cuda.backward(g_cuda)
            out_cpu.backward(g_cpu)

            self.assertEqual(a_cuda.grad, a_cpu.grad)
| |
| @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) |
| @parametrize_test("isize, osize", [(20, 11), (10, 15)]) |
def test_upsamplingNearest2d_correctness(self, device, memory_format, isize, osize):
    # Here we check if output matches OpenCV's INTER_NEAREST-like result
    in_t = torch.arange(isize * isize, dtype=torch.float, device=device)
    in_t = in_t.reshape(1, 1, isize, isize).contiguous(memory_format=memory_format)
    out_t = F.interpolate(in_t, size=(osize, osize), recompute_scale_factor=False, mode="nearest")

    # compute expected output as OpenCV: input and output are square, so one
    # output-to-input index table serves both spatial axes
    scale = 1.0 * isize / osize
    source_index = [int(o * scale) for o in range(osize)]
    expected_out = torch.zeros(1, 1, osize, osize, dtype=torch.float)
    for o1, i1 in enumerate(source_index):
        for o2, i2 in enumerate(source_index):
            expected_out[0, 0, o1, o2] = in_t[0, 0, i1, i2]

    self.assertEqual(out_t, expected_out.to(device=device))
| |
| @skipIfMps # Partially passes https://github.com/pytorch/pytorch/issues/134430 |
| @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) |
| @parametrize_test("isize, osize", [(20, 11), (10, 15)]) |
def test_upsamplingNearestExact2d_correctness(self, device, memory_format, isize, osize):
    # Here we check if output matches Scikit-Image/Scipy-like result
    # Checks https://github.com/pytorch/pytorch/issues/34808
    in_t = torch.arange(isize * isize, dtype=torch.float, device=device)
    in_t = in_t.reshape(1, 1, isize, isize).contiguous(memory_format=memory_format)
    out_t = F.interpolate(in_t, size=(osize, osize), recompute_scale_factor=False, mode="nearest-exact")

    # compute expected output as Scikit-Image/Scipy: sample at the centre of
    # each output cell; one index table serves both (equal-sized) axes
    scale = 1.0 * isize / osize
    source_index = [int((o + 0.5) * scale) for o in range(osize)]
    expected_out = torch.zeros(1, 1, osize, osize, dtype=torch.float)
    for o1, i1 in enumerate(source_index):
        for o2, i2 in enumerate(source_index):
            expected_out[0, 0, o1, o2] = in_t[0, 0, i1, i2]

    self.assertEqual(out_t, expected_out.to(device=device))
| |
| @expectedFailureMPS # TypeError: the MPS framework doesn't support float64 |
| @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last_3d]) |
| @parametrize_test("mode", ["nearest", "nearest-exact"]) |
def test_upsamplingNearest3d(self, device, memory_format, mode):
    # Exercises 3-D nearest-style resizing with the parametrized `mode` and
    # `memory_format`: constant-input forward results for double and uint8,
    # memory-format propagation through forward and backward, numerical
    # gradient checks, and (on CUDA only) agreement with CPU.

    # Forward AD does not support XLA because XLA tensors don't have storage
    check_forward_ad = torch.device(device).type != 'xla'

    m = nn.Upsample(size=4, mode=mode)
    in_t = torch.ones(1, 2, 2, 2, 2, device=device, dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_()
    in_uint8_t = torch.ones(
        1, 2, 2, 2, 2, dtype=torch.uint8, device=device
    ).contiguous(memory_format=memory_format)
    # Warnings are captured here but, unlike outputs, never inspected (`w` is unused).
    with warnings.catch_warnings(record=True) as w:
        out_t = m(in_t)
        out_uint8_t = m(in_uint8_t)
    # Resizing an all-ones input must give an all-ones output of the new size.
    expected_output = torch.ones(1, 2, 4, 4, 4, device=device, dtype=torch.double)
    self.assertEqual(expected_output, out_t)
    self.assertEqual(expected_output.to(torch.uint8), out_uint8_t)
    # Assert that memory format is carried through to the output
    self.assertTrue(out_t.is_contiguous(memory_format=memory_format))
    # ... and through backward to the input gradient.
    out_t.backward(torch.randn_like(out_t))
    self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format))

    input = torch.randn(
        1, 2, 2, 2, 2, requires_grad=True, device=device, dtype=torch.double
    ).contiguous(memory_format=memory_format)
    gradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_forward_ad=check_forward_ad)
    gradgradcheck(lambda x: F.interpolate(x, 4, mode=mode), [input], check_fwd_over_rev=check_forward_ad)

    # Assert that cpu and cuda handle channels_last memory format in the same way
    # https://github.com/pytorch/pytorch/issues/54590
    if torch.device(device).type == 'cuda':
        # Note: this branch always uses channels_last_3d, independent of the
        # `memory_format` parameter.
        a = torch.ones(
            2, 2, 2, 3, 4, device=device, requires_grad=True, dtype=torch.double
        ).contiguous(memory_format=torch.channels_last_3d)
        # make the data asymmetric; ensure that cuda/cpu handle channels_last appropriately.
        a[1][1][1][2][2] = a[1][1][1][2][3] = 0

        out_cuda = torch.nn.functional.interpolate(a, scale_factor=2, mode=mode)
        out_cpu = torch.nn.functional.interpolate(a.to('cpu'), scale_factor=2, mode=mode)
        self.assertEqual(out_cpu, out_cuda.to('cpu'))

        gradcheck(lambda x: F.interpolate(x, 4, mode=mode), [a], check_forward_ad=check_forward_ad)
        gradgradcheck(lambda x: F.interpolate(x, 4, mode=mode), [a], check_fwd_over_rev=check_forward_ad)

        # NOTE(review): `a` was created on `device`, which is a CUDA device in
        # this branch, so `a.to('cuda')` may be the same tensor as `a` -- confirm
        # whether these two checks add coverage beyond the pair above.
        gradcheck(lambda x: F.interpolate(x, 4, mode=mode), [a.to('cuda')], check_forward_ad=check_forward_ad)
        gradgradcheck(lambda x: F.interpolate(x, 4, mode=mode), [a.to('cuda')], check_fwd_over_rev=check_forward_ad)
| |
| @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last_3d]) |
| @parametrize_test("isize, osize", [(20, 11), (10, 15)]) |
def test_upsamplingNearest3d_correctness(self, device, memory_format, isize, osize):
    # Here we check if output matches OpenCV's INTER_NEAREST-like result
    in_t = torch.arange(isize * isize * isize, dtype=torch.float, device=device)
    in_t = in_t.reshape(1, 1, isize, isize, isize).contiguous(memory_format=memory_format)
    out_t = F.interpolate(
        in_t, size=(osize, osize, osize), recompute_scale_factor=False, mode="nearest"
    )

    # compute expected output as OpenCV: the volume is a cube, so one
    # output-to-input index table serves all three spatial axes
    scale = 1.0 * isize / osize
    index_pairs = [(o, int(o * scale)) for o in range(osize)]
    expected_out = torch.zeros(1, 1, osize, osize, osize, dtype=torch.float)
    for (o1, i1), (o2, i2), (o3, i3) in product(index_pairs, repeat=3):
        expected_out[0, 0, o1, o2, o3] = in_t[0, 0, i1, i2, i3]

    self.assertEqual(out_t, expected_out.to(device=device))
| |
| @expectedFailureMPS # NotImplementedError: aten::_upsample_nearest_exact3d.out https://github.com/pytorch/pytorch/issues/77764 |
| @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last_3d]) |
| @parametrize_test("isize, osize", [(20, 11), (10, 15)]) |
def test_upsamplingNearestExact3d_correctness(self, device, memory_format, isize, osize):
    # Here we check if output matches Scikit-Image/Scipy-like result
    # Checks https://github.com/pytorch/pytorch/issues/34808
    in_t = torch.arange(isize * isize * isize, dtype=torch.float, device=device)
    in_t = in_t.reshape(1, 1, isize, isize, isize).contiguous(memory_format=memory_format)
    out_t = F.interpolate(
        in_t, size=(osize, osize, osize), recompute_scale_factor=False, mode="nearest-exact"
    )

    # compute expected output as Scikit-Image/Scipy: sample at the centre of
    # each output cell; one index table serves all three (equal-sized) axes
    scale = 1.0 * isize / osize
    index_pairs = [(o, int((o + 0.5) * scale)) for o in range(osize)]
    expected_out = torch.zeros(1, 1, osize, osize, osize, dtype=torch.float)
    for (o1, i1), (o2, i2), (o3, i3) in product(index_pairs, repeat=3):
        expected_out[0, 0, o1, o2, o3] = in_t[0, 0, i1, i2, i3]

    self.assertEqual(out_t, expected_out.to(device=device))
| |
| @parametrize_test("antialias", [True, False]) |
| @parametrize_test("align_corners", [True, False]) |
| @parametrize_test("mode", ["bilinear", "bicubic"]) |
| @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) |
| @onlyNativeDeviceTypes |
def test_upsamplingBiMode2d(self, device, antialias, align_corners, mode, memory_format):
    # For each scale factor, exercises bilinear/bicubic F.interpolate with the
    # parametrized `antialias`, `align_corners` and `memory_format`:
    # constant-input forward result, memory-format propagation through forward
    # and backward, numerical gradient checks, and (on CUDA only) agreement
    # with CPU for forward and backward.

    # Forward AD does not support XLA because XLA tensors don't have storage
    check_forward_ad = torch.device(device).type != 'xla'

    # Interpolation options shared by every F.interpolate call below.
    kwargs = dict(mode=mode, align_corners=align_corners, antialias=antialias)
    # test float scale factor up & downsampling
    for scale_factor in [0.5, 1.5, 2]:
        in_t = torch.ones(
            2, 3, 8, 8, device=device,
            dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_()
        # Expected spatial size: input extent times the scale factor, floored.
        out_size = int(math.floor(in_t.shape[-1] * scale_factor))
        # Warnings are captured here but never inspected (`w` is unused).
        with warnings.catch_warnings(record=True) as w:
            out_t = F.interpolate(in_t, scale_factor=scale_factor, **kwargs)
        # Resizing an all-ones input must give an all-ones output.
        expected_out = torch.ones(2, 3, out_size, out_size, device=device, dtype=torch.double)
        self.assertEqual(expected_out, out_t)
        # Assert that memory format is carried through to the output
        self.assertTrue(out_t.is_contiguous(memory_format=memory_format))
        out_t.backward(torch.randn_like(out_t))
        self.assertTrue(in_t.grad.is_contiguous(memory_format=memory_format))

        if torch.device(device).type == 'cuda':
            # Bilinear backward is nondeterministic because of atomicAdd usage
            nondet_tol = 1e-5
        else:
            nondet_tol = 0.0

        input = torch.randn(
            2, 3, 8, 8, device=device,
            dtype=torch.double).contiguous(memory_format=memory_format).requires_grad_()
        # The gradient checks pass the explicit output size rather than the scale factor.
        gradcheck(
            lambda x: F.interpolate(x, out_size, **kwargs),
            [input],
            check_forward_ad=check_forward_ad, nondet_tol=nondet_tol
        )
        gradgradcheck(
            lambda x: F.interpolate(x, out_size, **kwargs),
            [input],
            check_fwd_over_rev=check_forward_ad, nondet_tol=nondet_tol
        )

        # Assert that cpu and cuda give same results
        if torch.device(device).type == 'cuda':
            for shapes in [
                (2, 2, 3, 4), (2, 3, 4, 5), (3, 1, 2, 2), (1, 5, 3, 2)
            ]:
                a_cuda = torch.randn(
                    *shapes, device=device, dtype=torch.double
                ).contiguous(memory_format=memory_format).requires_grad_()
                # Independent CPU leaf holding the same values.
                a_cpu = a_cuda.detach().cpu().requires_grad_()

                # Any warnings raised by these calls are captured and discarded.
                with warnings.catch_warnings(record=True):
                    out_cuda = F.interpolate(a_cuda, scale_factor=scale_factor, **kwargs)
                    out_cpu = F.interpolate(a_cpu, scale_factor=scale_factor, **kwargs)

                self.assertEqual(out_cpu, out_cuda.cpu())

                # Same upstream gradient on both devices so the input grads are comparable.
                g_cuda = torch.randn_like(out_cuda)
                g_cpu = g_cuda.cpu()

                out_cuda.backward(g_cuda)
                out_cpu.backward(g_cpu)

                self.assertEqual(a_cuda.grad, a_cpu.grad)
| |
| @parametrize_test("antialias", [True, False]) |
| @parametrize_test("num_channels", [3, 5]) |
| @parametrize_test("mode", ["nearest", "nearest-exact", "bilinear", "bicubic"]) |
| @parametrize_test("dtype", integral_types() + floating_types()) |
| @onlyNativeDeviceTypes |
def test_upsamplingBiMode2d_nonsupported_dtypes(self, device, antialias, num_channels, mode, dtype):
    # Dtypes this test treats as supported must interpolate without error;
    # every other dtype must fail with a "not implemented for" RuntimeError.
    x = torch.ones(1, num_channels, 32, 32, dtype=dtype, device=device)

    if "nearest" in mode:
        if antialias:
            raise SkipTest("Nearest mode does not have antialiasing")
        # nearest modes: uint8 plus all floating types
        supported = dtype in (torch.uint8, ) + floating_types()
    elif mode in ("bilinear", "bicubic"):
        # bilinear/bicubic: floating types everywhere, uint8 only on CPU
        supported = dtype in floating_types() or (device == "cpu" and dtype == torch.uint8)
    else:
        supported = False

    if supported:
        _ = F.interpolate(x, (12, 12), mode=mode, antialias=antialias)
        return

    with self.assertRaisesRegex(RuntimeError, "not implemented for"):
        F.interpolate(x, (12, 12), mode=mode, antialias=antialias)
| |
| @expectedFailureMPS # NotImplementedError: aten::_upsample_bilinear2d_aa.out https://github.com/pytorch/pytorch/issues/77764 |
| @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) |
def test_upsamplingBilinear2d_aa_correctness(self, device, memory_format):
    # NOTE: We expand the batch dim such that `b*c` is above the maximum
    # size of CUDA grid z-dimension (2**16)
    shape = [23000, 3, 8, 8]
    single_image = torch.arange(3 * 8 * 8, dtype=torch.float, device=device).reshape(1, *shape[1:])
    # Every batch entry is the same image, materialized in the requested layout.
    t_in = single_image.expand(shape).contiguous(memory_format=memory_format)

    # This expected result is obtained using PIL.Image.resize
    # for c in range(3):
    #   a_in = t_in.numpy()[0, c, ...]
    #   pil_in = Image.fromarray(a_in)
    #   pil_out = pil_in.resize((2, 2), resample=Image.LINEAR)
    reference_values = [
        17.035713, 20.25, 42.75, 45.964287, 81.03572, 84.25,
        106.75, 109.96428, 145.0357, 148.25, 170.75, 173.9643
    ]
    expected_out = torch.tensor(reference_values, device=device, dtype=t_in.dtype).reshape(1, 3, 2, 2)

    t_out = F.interpolate(t_in, size=(2, 2), mode="bilinear", align_corners=False, antialias=True)
    # Broadcast the single reference image across the whole batch.
    batch_expected = expected_out.expand([*shape[:2], 2, 2])
    self.assertEqual(batch_expected, t_out)
| |
| # Partially passes. NotImplementedError: aten::upsample_bicubic2d.out https://github.com/pytorch/pytorch/issues/77764 |
| @skipIfMps |
| @parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last]) |
| @parametrize_test("mode", ["bilinear", "bicubic"]) |
| @parametrize_test("antialias", [True, False]) |
| @parametrize_test("align_corners", [True, False]) |
| @parametrize_test("num_channels", [3, 5]) |
| @parametrize_test("output_size", [32, 600]) |
| @parametrize_test("check_as_unsqueezed_3d_tensor", [True, False]) |
| @parametrize_test("non_contig", [False, "sliced", "restrided"]) |
| @parametrize_test("batch_size", [1, 5]) |
def test_upsamplingBiMode2d_consistency(
    self,
    device,
    memory_format,
    mode,
    antialias,
    align_corners,
    num_channels,
    output_size,
    check_as_unsqueezed_3d_tensor,
    non_contig,
    batch_size,
):
    # Check output value consistency between resized_input_uint8 and resized input_float
    # The uint8 result is compared against the float32 result after rounding
    # and clipping to [0, 255]; the memory layout of both outputs is checked too.
    if torch.device(device).type == "cuda":
        raise SkipTest("CUDA implementation is not yet supporting uint8")

    # Fixed seed so the random input image is reproducible.
    torch.manual_seed(0)

    # - input range is set to [30, 220] for bicubic mode, because the bicubic kernel may create
    #   [intermediate] values outside of the [0, 255] range, which need
    #   to be clipped in uint8 path, but not in float path. This isn't
    #   an issue with bilinear kernel.
    input_range = (30, 220) if mode == "bicubic" else (0, 256)
    input_ui8 = torch.randint(*input_range, size=(batch_size, num_channels, 400, 400), dtype=torch.uint8, device=device)
    input_ui8 = input_ui8.contiguous(memory_format=memory_format)

    # Optionally turn the input into a non-contiguous view.
    if non_contig == "sliced":
        # crop a 10-pixel border on each spatial side
        input_ui8 = input_ui8[:, :, 10:-10, 10:-10]
    elif non_contig == "restrided":
        # keep every other row and column
        input_ui8 = input_ui8[:, :, ::2, ::2]

    if batch_size == 1 and check_as_unsqueezed_3d_tensor:
        # Drop the batch dimension and add it back, i.e. present the input as
        # an unsqueezed 3-D tensor.
        input_ui8 = input_ui8[0, ...]
        input_ui8 = input_ui8[None, ...]

    input_f32 = input_ui8.float()

    # Float reference: interpolate, then round and clip into the uint8 value range.
    output_f32 = F.interpolate(
        input_f32, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias
    ).round().clip(0, 255)
    output_ui8 = F.interpolate(
        input_ui8, size=(output_size, output_size), mode=mode, align_corners=align_corners, antialias=antialias
    )

    if non_contig is False:
        self.assertTrue(input_ui8.is_contiguous(memory_format=memory_format))

    # FIXME if-clause shows the current behaviour which is definitely unexpected.
    # Ideally we want to fix it such that both the ui8 and f32 outputs are also channels_last
    # See for more details: https://github.com/pytorch/pytorch/pull/100373
    if batch_size == 1 and check_as_unsqueezed_3d_tensor and memory_format == torch.channels_last:
        self.assertTrue(output_ui8.is_contiguous())
        self.assertTrue(output_f32.is_contiguous())
    else:
        self.assertTrue(output_ui8.is_contiguous(memory_format=memory_format))
        self.assertTrue(output_f32.is_contiguous(memory_format=memory_format))

    if mode == "bilinear":
        # Bilinear: every value may differ by at most 1.
        torch.testing.assert_close(output_f32, output_ui8.float(), rtol=0, atol=1)
    else:
        # Bicubic: looser, statistical bounds on the absolute difference.
        diff = (output_f32 - output_ui8.float()).abs()
        self.assertLess(diff.max(), 15)

        # fewer than 3% of the values may differ by more than 2
        threshold = 2
        percent = 3
        self.assertLess((diff > threshold).float().mean(), percent / 100)

        # fewer than 1% of the values may differ by more than 5
        threshold = 5
        percent = 1
        self.assertLess((diff > threshold).float().mean(), percent / 100)

        self.assertLess(diff.mean(), 0.4)
| |
@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
@parametrize_test("align_corners", [True, False])
@parametrize_test("input_size, output_size", [(399, 437), (403, 377)])
def test_upsamplingBiLinear2d_consistency_interp_size_bug(self, device, memory_format, align_corners, input_size, output_size):
    # Non-regression test for https://github.com/pytorch/pytorch/pull/101403:
    # bilinear resizing of a uint8 image must agree (within one intensity
    # level) with resizing its float copy and rounding back to uint8.
    if torch.device(device).type == "cuda":
        raise SkipTest("CUDA implementation is not yet supporting uint8")

    interp_kwargs = dict(
        size=(output_size, output_size), mode="bilinear", align_corners=align_corners, antialias=False
    )
    src_u8 = torch.randint(0, 256, size=(1, 3, input_size, input_size), dtype=torch.uint8, device=device)
    src_u8 = src_u8.contiguous(memory_format=memory_format)
    src_f32 = src_u8.float()

    # Float path is the reference; it is rounded and cast so both sides are uint8.
    via_float = F.interpolate(src_f32, **interp_kwargs).round().to(torch.uint8)
    via_uint8 = F.interpolate(src_u8, **interp_kwargs)
    torch.testing.assert_close(via_float, via_uint8, atol=1, rtol=0)
| |
@expectedFailureMPS  # NotImplementedError: aten::upsample_bicubic2d.out https://github.com/pytorch/pytorch/issues/77764
def test_upsamplingBicubic2d_correctness(self, device):
    # Upsample a known 1x2x2x2 input by a factor of 2 with bicubic
    # interpolation and compare against hard-coded reference values.
    # The align_corners=False result must match opencv.
    in_t = torch.arange(8., device=device).view(1, 2, 2, 2)
    expected_out_t = torch.tensor(
        [[[[-0.31641, 0.01562, 0.56250, 0.89453],
           [0.34766, 0.67969, 1.22656, 1.55859],
           [1.44141, 1.77344, 2.32031, 2.65234],
           [2.10547, 2.43750, 2.98438, 3.31641]],

          [[3.68359, 4.01562, 4.56250, 4.89453],
           [4.34766, 4.67969, 5.22656, 5.55859],
           [5.44141, 5.77344, 6.32031, 6.65234],
           [6.10547, 6.43750, 6.98438, 7.31641]]]], device=device)
    out_t = F.interpolate(in_t, scale_factor=2, mode='bicubic', align_corners=False)
    # Deliberately no torch.set_printoptions() call here: it changes
    # process-wide print settings and would leak into tests that run later.
    # The reference values are given to 5 decimals, hence atol=1e-5.
    self.assertEqual(out_t, expected_out_t, atol=1e-5, rtol=0)
| |
@expectedFailureMPS  # NotImplementedError: aten::_upsample_bicubic2d_aa.out https://github.com/pytorch/pytorch/issues/77764
@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last])
def test_upsamplingBicubic2d_aa_correctness(self, device, memory_format):
    # Anti-aliased bicubic downsampling of a 1x3x8x8 ramp to 2x2 is compared
    # against hard-coded reference values.
    src = torch.arange(3 * 8 * 8, dtype=torch.float, device=device).reshape(1, 3, 8, 8)
    src = src.contiguous(memory_format=memory_format)
    # The expected result was obtained using PIL.Image.resize:
    # for c in range(3):
    #     a_in = t_in.numpy()[0, c, ...]
    #     pil_in = Image.fromarray(a_in)
    #     pil_out = pil_in.resize((2, 2), resample=Image.BICUBIC)
    pil_values = [
        15.1205635, 18.760439, 44.23956, 47.879436, 79.12056, 82.76044,
        108.23956, 111.87944, 143.12057, 146.76044, 172.23956, 175.87943
    ]
    reference = torch.tensor(pil_values, device=device, dtype=src.dtype).reshape(1, 3, 2, 2)
    resized = F.interpolate(src, size=(2, 2), mode="bicubic", align_corners=False, antialias=True)
    self.assertEqual(reference, resized)
| |
@expectedFailureMPS  # NotImplementedError: aten::upsample_trilinear3d.out https://github.com/pytorch/pytorch/issues/77764
@parametrize_test("align_corners", [True, False])
@parametrize_test("memory_format", [torch.contiguous_format, torch.channels_last_3d])
def test_upsamplingTrilinear3d(self, device, align_corners, memory_format):
    # Trilinear resampling of an all-ones volume must stay all ones, must keep
    # the requested memory format in forward and backward, and must pass
    # gradcheck / gradgradcheck.
    interp_kwargs = {'mode': 'trilinear', 'align_corners': align_corners}

    # float scale factors, covering both down- and up-sampling
    for factor in (0.5, 1.5, 2):
        upsample = nn.Upsample(scale_factor=factor, **interp_kwargs)
        src = torch.ones(1, 2, 4, 4, 4, device=device, dtype=torch.double)
        src = src.contiguous(memory_format=memory_format).requires_grad_()
        side = int(math.floor(src.shape[-1] * factor))
        with warnings.catch_warnings(record=True):
            result = upsample(src)
        all_ones = torch.ones(1, 2, side, side, side, device=device, dtype=torch.double)
        self.assertEqual(all_ones, result)
        # The output must carry the input's memory format
        self.assertTrue(result.is_contiguous(memory_format=memory_format))

        upstream_grad = torch.randn_like(result).contiguous(memory_format=memory_format)
        src.grad = None
        result.backward(upstream_grad)
        src_grad = src.grad
        self.assertTrue(src_grad.is_contiguous(memory_format=memory_format))

        if memory_format == torch.channels_last_3d:
            # a contiguous upstream gradient must give the same input gradient
            src.grad = None
            result.backward(upstream_grad.contiguous())
            self.assertEqual(src.grad, src_grad)

        probe = torch.randn(1, 2, 4, 4, 4, requires_grad=True, dtype=torch.double)

        def resize_to_side(x):
            return F.interpolate(x, side, **interp_kwargs)

        # an explicit output size and the scale factor must agree
        self.assertEqual(
            F.interpolate(probe, (side, side, side), **interp_kwargs),
            F.interpolate(probe, scale_factor=factor, **interp_kwargs))
        gradcheck(resize_to_side, [probe])
        gradgradcheck(resize_to_side, [probe])
| |
@onlyCUDA
@dtypes(torch.half)
@largeTensorTest('40GB')
def test_upsampling_64bit_indexing_channels_last(self, device, dtype):
    # Nearest 2x upsampling of a 32x64x512x512 tensor (the output holds 2**31
    # elements) must give the same values for channels_last and contiguous input.
    src = torch.rand((32, 64, 512, 512), dtype=dtype, device=device)
    nearest_2x = partial(F.interpolate, scale_factor=2, mode='nearest')
    out_channels_last = nearest_2x(src.to(memory_format=torch.channels_last))
    out_contiguous = nearest_2x(src)
    # release the input before the comparison
    del src
    self.assertTrue(torch.allclose(out_channels_last, out_contiguous))
| |
@onlyCUDA
@dtypes(torch.half)
@largeTensorTest('40GB')
def test_replicatepad_64bit_indexing(self, device, dtype):
    # Smoke test: forward and backward through a Conv1d with replicate padding
    # on a very large batch must complete; no values are checked.
    layer = torch.nn.Conv1d(128, 128, 3, 1, 1, padding_mode="replicate", device=device, dtype=dtype)
    big_batch = torch.randn(size=(256 * 448 * 2, 128, 96), dtype=dtype, device=device)
    out = layer(big_batch)
    out.mean().backward()
| |
@onlyCUDA
@dtypes(torch.half)
@largeTensorTest('40GB')
def test_upsamplingnearest2d_backward_64bit_indexing(self, device, dtype):
    # Smoke test: backward through nearest 2x upsampling of a large tensor
    # must complete; no values are checked.
    src = torch.randn(size=(36, 128, 512, 512), device=device, dtype=dtype).requires_grad_()
    upsampled = F.interpolate(src, scale_factor=2, mode="nearest")
    upstream_grad = torch.randn_like(upsampled)
    upsampled.backward(upstream_grad)
| |
def _slow_masked_softmax(self, input, mask):
    # Reference masked softmax along dim 3: exponentiate, multiply by `mask`
    # (entries multiplied by 0 drop out), then normalise by the sum over dim 3.
    weighted = input.exp() * mask
    denom = weighted.sum(dim=3, keepdim=True).expand(weighted.size())
    return weighted / denom
| |
def test_masked_softmax_mask_types(self, device):
    # Test that mask type 0 (LxL attention mask), mask type 1 (BxL padding mask),
    # and mask type 2 (generic BxHxLxL mask) are processed correctly on the
    # fast path and the results match an explicit slow calculation.
    on_cuda = self.device_type == "cuda"

    for batch, heads, seq_len in [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)]:
        full_shape = (batch, heads, seq_len, seq_len)

        # mask_type == 0 => attention mask of shape LxL
        attn_mask = torch.randint(0, 2, (seq_len, seq_len)).bool()
        attn_mask_full = attn_mask.reshape(1, 1, seq_len, seq_len).expand(*full_shape).bool()

        # mask_type == 1 => padding mask of shape BxL
        padding_mask = torch.randint(0, 2, (batch, seq_len)).bool()
        padding_mask_full = padding_mask.reshape(batch, 1, 1, seq_len).expand(*full_shape).bool()

        # mask_type == 2 => shape BxHxLxL
        generic_mask = torch.randint(0, 2, full_shape).bool()

        # (mask handed to the native op, same mask expanded to BxHxLxL, mask type)
        cases = [
            (attn_mask, attn_mask_full, 0),
            (padding_mask, padding_mask_full, 1),
            (generic_mask, generic_mask, 2),
        ]
        for dim, (raw_mask, full_mask, mask_type) in product([0, 3], cases):
            if on_cuda and heads % 2 and mask_type == 1:
                # CUDA path doesn't support padding mask when the number of heads is odd
                continue
            logits = torch.randn(full_shape)
            if on_cuda:
                logits = logits.cuda()
                full_mask = full_mask.cuda()
                raw_mask = raw_mask.cuda()
            native_res = torch._masked_softmax(logits, raw_mask, dim, mask_type)

            # Slow reference: True in the mask means "masked out", so the
            # exponentials are multiplied by the inverted mask before normalising.
            keep = ~full_mask
            weighted = torch.exp(logits) * keep
            reference = weighted / weighted.sum(dim=dim, keepdim=True).expand(weighted.size())
            reference = torch.nan_to_num(reference)

            # Slices that are masked out entirely along `dim` are non-deterministic
            # (*may* be 0), so they are zeroed on both sides before comparing.
            fully_masked = keep.logical_not().all(dim, keepdim=True).expand(keep.shape)
            self.assertEqual(
                reference.masked_fill(fully_masked, 0),
                native_res.masked_fill(fully_masked, 0),
                exact_dtype=True
            )
| |
@onlyCUDA
@gcIfJetson
def test_masked_softmax_devices_parity(self):
    # Test that softmax with mask type 0 (LxL attention mask), mask type 1 (BxL padding mask),
    # and mask type 2 (BxHxLxL generic mask) gives the same result on CPU and on CUDA.
    for batch, heads, seq_len in [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)]:
        full_shape = (batch, heads, seq_len, seq_len)
        # mask_type == 0 => attention mask of shape LxL
        attn_mask = torch.randint(0, 2, (seq_len, seq_len)).bool()
        # mask_type == 1 => padding mask of shape BxL
        padding_mask = torch.randint(0, 2, (batch, seq_len)).bool()
        # mask_type == 2 => generic mask of shape BxHxLxL
        generic_mask = torch.randint(0, 2, full_shape).bool()
        typed_masks = [(attn_mask, 0), (padding_mask, 1), (generic_mask, 2)]
        logits = torch.randn(full_shape)

        for dim, (mask, mask_type) in product([0, 3], typed_masks):
            if heads % 2 and mask_type == 1:
                # CUDA path doesn't support padding mask when the number of heads is odd
                continue

            per_device = []
            for target in ("cpu", "cuda"):
                logits_dev = logits.to(target)
                mask_dev = mask.to(target)
                res = torch._masked_softmax(logits_dev, mask_dev, dim, mask_type)
                if mask_type == 0:
                    mask_full = mask_dev.reshape(1, 1, seq_len, seq_len).expand(*full_shape).bool()
                elif mask_type == 1:
                    mask_full = mask_dev.reshape(batch, 1, 1, seq_len).expand(*full_shape).bool()
                else:
                    mask_full = mask_dev
                # Slices that are masked out entirely along `dim` are non-deterministic
                # (*may* be 0), so they are zeroed before the devices are compared.
                fully_masked = mask_full.all(dim, keepdim=True).expand(mask_full.shape)
                per_device.append(res.masked_fill(fully_masked, 0))

            cpu_res, cuda_res = per_device
            self.assertEqual(cpu_res, cuda_res, exact_dtype=True)
| |
def test_masked_softmax(self, device):
    # Compare torch._masked_softmax with a padding mask (mask_type 1) against
    # an explicit exp / mask / normalise computation.
    on_cuda = self.device_type == "cuda"
    mask_type = 1  # BxL => src_key_padding_mask
    sizes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)]

    for (batch, heads, seq_len), dim in product(sizes, [0, 3]):
        full_shape = (batch, heads, seq_len, seq_len)
        logits = torch.randn(full_shape)
        padding = torch.randint(0, 2, (batch, seq_len))
        padding = padding.reshape(batch, 1, 1, seq_len).expand(*full_shape).bool()
        if on_cuda:
            logits = logits.cuda()
            padding = padding.cuda()
        native_res = torch._masked_softmax(logits, padding, dim, mask_type)

        # Slow reference: True in the mask means "masked out", so the
        # exponentials are multiplied by the inverted mask before normalising.
        keep = ~padding
        weighted = torch.exp(logits) * keep
        reference = weighted / weighted.sum(dim=dim, keepdim=True).expand(weighted.size())
        reference = torch.nan_to_num(reference)

        # Slices that are masked out entirely along `dim` are non-deterministic
        # (*may* be 0), so they are zeroed on both sides before comparing.
        fully_masked = keep.logical_not().all(dim, keepdim=True).expand(keep.shape)
        self.assertEqual(
            reference.masked_fill(fully_masked, 0),
            native_res.masked_fill(fully_masked, 0),
            exact_dtype=True
        )
| |
@dtypes(torch.bfloat16, torch.half)
@precisionOverride({torch.bfloat16: 2e-2, torch.half: 3e-3})
def test_masked_softmax_lowp(self, dtype):
    # Forward and backward of torch._masked_softmax in a low-precision dtype
    # are compared against the float32 computation on the same values.
    sizes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)]
    for (batch, heads, seq_len), dim in product(sizes, [0, 3]):
        full_shape = (batch, heads, seq_len, seq_len)
        lowp_in = torch.randn(full_shape, dtype=dtype).requires_grad_()
        ref_in = lowp_in.float().detach().requires_grad_()
        padding = torch.randint(0, 2, (batch, seq_len))
        padding = padding.reshape(batch, 1, 1, seq_len).expand(*full_shape).bool()

        # Gradients are not reset between mask types; they accumulate on both
        # inputs in lockstep, so the comparison stays like-for-like.
        for mask_type in [1, 2]:
            ref_out = torch._masked_softmax(ref_in, padding, dim, mask_type)
            lowp_out = torch._masked_softmax(lowp_in, padding, dim, mask_type)
            self.assertEqual(ref_out.to(dtype), lowp_out)

            lowp_grad = torch.randn_like(ref_out).to(dtype=dtype)
            ref_grad = lowp_grad.float()

            ref_out.backward(ref_grad)
            lowp_out.backward(lowp_grad)
            self.assertEqual(ref_in.grad.to(dtype), lowp_in.grad)
| |
def _test_masked_softmax_helper(self, input, dim, mask, mask_type):
    # Check forward and backward of torch._masked_softmax against a plain
    # softmax applied after filling the masked positions with -inf.
    # `input` must require grad; its .grad is populated by this helper.
    ref_input = input.detach().clone().requires_grad_()
    native_out = torch._masked_softmax(input, mask, dim, mask_type)
    ref_out = torch._softmax(ref_input.masked_fill(mask, float('-inf')), dim, False)

    upstream_grad = torch.randn_like(ref_out).to(dtype=ref_out.dtype)
    native_out.backward(upstream_grad)
    ref_out.backward(upstream_grad)

    # Passing dim=None must behave like passing the last dimension
    if dim == input.dim() - 1:
        default_input = input.detach().clone().requires_grad_()
        default_out = torch._masked_softmax(default_input, mask, None, mask_type)
        default_out.backward(upstream_grad)
        self.assertEqual(native_out, default_out)
        self.assertEqual(input.grad, default_input.grad)

    # Slices that are masked out entirely along `dim` are non-deterministic
    # (*may* be 0), so they are zeroed on both sides before comparing.
    fully_masked = mask.all(dim, keepdim=True).expand(mask.shape)
    self.assertEqual(native_out.masked_fill(fully_masked, 0), ref_out.masked_fill(fully_masked, 0))

    # Gradients agree with the reference once its NaNs are replaced, and masked
    # positions receive exactly zero gradient.
    self.assertEqual(input.grad, torch.nan_to_num(ref_input.grad))
    self.assertEqual(input.grad, input.grad.masked_fill(mask, 0.0))
| |
| def test_masked_softmax_grad(self, device): |
| shapes = [(1, 1, 32), (3, 16, 310), (12, 4, 1024), (4, 2, 1200)] |
| for shape in shapes: |
| dims = [0, len(shape) - 1] if len(shape) > 0 else [0] |
| for dim in dims: |
| for mask_type in [1, 2]: # 1 = BxL => src_key_padding_mask |
| input = torch.randn(shape, requires_grad=True) |
| mask = torch.randint(0, 2, shape).bool() |
| if (self.device_type == "cuda"): |
| input = input.cuda().detach().requires_grad_() |
| mask = mask.cuda() |
| self._test_masked_softmax_helper(input, dim, mask, mask_type) |
| |
| # In this test, the forward pass is expected to produce nan's because when dim=0, we only have unspecified values |
| def test_masked_softmax_forward_with_nans(self, device): |
| dim = 0 |
| shapes = [(4, 5), (50, 100), (1500, 1200)] |
| for (x, y) in shapes: |
| for mask_type in [1, 2]: # 1 = BxL => src_key_padding_mask |
| input = torch.randn((x, y), requires_grad=True) |
| mask = torch.tensor([i % 2 for i in range(y)]).expand((x, y)).bool() |
| if (self.device_type == "cuda"): |
| input = input.cuda().detach().requires_grad_() |
| mask = mask.cuda() |
| self._test_masked_softmax_helper(input, dim, mask, mask_type) |
| |
@onlyCUDA
def test_masked_softmax_transformer_layout(self, device):
    # A BxL padding mask (mask_type 1) applied to a BxHxLxL input over the last
    # dimension must match the slow reference on the expanded, inverted mask.
    batch, heads, seq_len = 211, 16, 42
    logits = torch.randn((batch, heads, seq_len, seq_len))
    last_dim = logits.dim() - 1
    padding = torch.randint(0, 2, (batch, seq_len))
    mask_type = 1  # BxL => src_key_padding_mask
    if self.device_type == "cuda":
        logits = logits.cuda()
        padding = padding.cuda()
    padding = padding.bool()
    native_res = torch._masked_softmax(logits, padding, last_dim, mask_type)

    # The reference multiplies by the mask, so it needs the inverted mask
    # (1.0 = keep) expanded to the full input shape.
    expanded = padding.reshape(batch, 1, 1, seq_len).expand(batch, heads, seq_len, seq_len)
    keep = (~expanded).float()

    reference = self._slow_masked_softmax(logits, keep)
    self.assertEqual(reference, native_res, exact_dtype=True)
| |
@onlyCUDA
def test_masked_softmax_TxT_layout(self, device):
    # An LxL attention mask (mask_type 0) applied to a BxHxLxL input over the
    # last dimension must match the slow reference on the expanded, inverted mask.
    batch, heads, seq_len = 211, 16, 42
    logits = torch.randn((batch, heads, seq_len, seq_len))
    last_dim = logits.dim() - 1
    attn_mask = torch.randint(0, 2, (seq_len, seq_len))
    mask_type = 0  # LxL => src_mask
    if self.device_type == "cuda":
        logits = logits.cuda()
        attn_mask = attn_mask.cuda()
    attn_mask = attn_mask.bool()
    native_res = torch._masked_softmax(logits, attn_mask, last_dim, mask_type)

    # The reference multiplies by the mask, so it needs the inverted mask
    # (1.0 = keep) expanded to the full input shape.
    expanded = attn_mask.expand(batch, heads, seq_len, seq_len)
    keep = (~expanded).float()

    reference = self._slow_masked_softmax(logits, keep)
    self.assertEqual(reference, native_res, exact_dtype=True)
| |
@onlyCPU
@dtypes(torch.bfloat16, torch.half)
def test_log_softmax_cpu(self, device, dtype):
    # Low-precision log_softmax (forward and backward) must stay within 0.1
    # absolute error of the float32 computation.
    for dim in (0, 1):
        ref_in = torch.rand(200, 200, device=device, dtype=torch.float, requires_grad=True)
        lowp_in = ref_in.to(dtype).detach().requires_grad_(True)
        ref_out = F.log_softmax(ref_in, dim=dim)
        lowp_out = F.log_softmax(lowp_in, dim=dim)
        self.assertEqual(lowp_out, ref_out.to(dtype=dtype), atol=0.1, rtol=0)

        lowp_out.sum().backward()
        ref_out.sum().backward()
        self.assertEqual(lowp_in.grad, ref_in.grad.to(dtype), atol=0.1, rtol=0)
| |
@onlyCPU
@dtypes(torch.bfloat16, torch.half)
def test_softmax_cpu(self, device, dtype):
    # Low-precision softmax (forward and backward) must stay within 1e-3
    # absolute error of the float32 computation.
    for dim in (0, 1):
        ref_in = torch.rand(200, 200, device=device, dtype=torch.float, requires_grad=True)
        lowp_in = ref_in.to(dtype).detach().requires_grad_(True)
        ref_out = F.softmax(ref_in, dim=dim)
        lowp_out = F.softmax(lowp_in, dim=dim)
        self.assertEqual(lowp_out, ref_out.to(dtype), atol=1e-3, rtol=0)

        lowp_out.sum().backward()
        ref_out.sum().backward()
        self.assertEqual(lowp_in.grad, ref_in.grad.to(dtype), atol=1e-3, rtol=0)
| |
@dtypesIfCUDA(torch.half, torch.float)
@dtypes(torch.float)
def test_softmax_results(self, device, dtype):
    # softmax / log_softmax outputs, first-order gradients and the gradients
    # of those gradients are compared with a CPU copy of the same input.
    # Non-even sizes and non-zero shifts test fallback paths in vectorized kernel
    # Note: dim1 > 1024 is needed to exercise the vectorized (non-persistent) path, (16, 30576) is BERT-esque
    sizes = [(0, 10), (32, 20), (10, 0), (31, 20), (32, 21), (31, 23), (32, 1536), (31, 2048), (33, 2049), (16, 30576)]
    shifts = [(0, 0), (1, 0), (0, 1), (1, 1)]
    for fn, size, (row_shift, col_shift) in product([F.softmax, F.log_softmax], sizes, shifts):
        probe = torch.rand(size, device=device, dtype=dtype)
        # Note: With the largest tests we can hit upper limit of fp16 when we
        # sum, so scale the input down to stay in a nicer range.
        if dtype == torch.float16:
            probe = probe / 100.
        probe = probe[row_shift:, col_shift:]
        # Note: we don't want to backprop through the slice op
        probe = probe.detach().requires_grad_(True)
        ref_probe = probe.clone().cpu().detach().requires_grad_(True)
        for dim in (0, 1):
            ref_output = fn(ref_probe, dtype=torch.float, dim=dim)
            output = fn(probe, dtype=torch.float, dim=dim)
            grad_output = torch.rand(size, device=device, dtype=dtype)
            grad_output = grad_output[row_shift:, col_shift:]
            ref_grad_output = grad_output.clone().cpu().detach()
            grad_input, = torch.autograd.grad(
                output, probe, grad_outputs=grad_output, create_graph=True)
            ref_grad_input, = torch.autograd.grad(
                ref_output, ref_probe, grad_outputs=ref_grad_output, create_graph=True)
            grad_input.sum().backward()
            ref_grad_input.sum().backward()

            self.assertEqual(output, ref_output)
            self.assertEqual(grad_input, ref_grad_input)
            self.assertEqual(probe.grad, ref_probe.grad)
| |
@onlyCUDA
@dtypes(torch.float, torch.half)
@largeTensorTest("20GB")
@largeTensorTest("64GB", "cpu")
def test_warp_softmax_64bit_indexing(self, device, dtype):
    # log_softmax over the last dim of very large half inputs on CUDA is
    # compared (forward and backward) with a float32 CPU computation.
    def check_shape(*shape):
        gpu_in = torch.randn(shape, device="cuda", dtype=torch.float16, requires_grad=True)
        gpu_out = F.log_softmax(gpu_in, dim=-1, dtype=dtype)
        gpu_out.backward(gpu_out)
        with torch.no_grad():
            cpu_in = gpu_in.cpu().requires_grad_()
        cpu_out = F.log_softmax(cpu_in.float(), dim=-1).to(dtype)
        cpu_out.backward(cpu_out)
        # workaround to reduce memory usage vs. self.assertEqual, see #84944
        out_rtol, out_atol = torch.testing._comparison.get_tolerances(dtype, rtol=None, atol=None)
        self.assertTrue(torch.allclose(gpu_out.cpu(), cpu_out, rtol=out_rtol, atol=out_atol))
        # the input (and therefore its gradient) is half
        grad_rtol, _ = torch.testing._comparison.get_tolerances(torch.half, rtol=None, atol=None)
        self.assertTrue(torch.allclose(gpu_in.grad.cpu(), cpu_in.grad, rtol=grad_rtol, atol=1e-3))

    check_shape(1100000000, 2)  # Illegal memory access https://github.com/pytorch/pytorch/issues/52715
    check_shape(2200000000, 1)  # invalid configuration argument https://github.com/pytorch/pytorch/issues/52716
| |
@onlyCUDA
@dtypes(torch.half)
@largeTensorTest("20GB")
@largeTensorTest("2GB", "cpu")
@precisionOverride({torch.half: 0.001})
def test_softmax_64bit_indexing(self, device, dtype):
    # With an all-ones input every row is identical, so the first and last
    # rows of the log_softmax output and of the input gradient must match.
    def check_shape(*shape):
        ones_in = torch.ones(shape, device=device, dtype=dtype, requires_grad=True)
        out = F.log_softmax(ones_in, dim=-1, dtype=dtype)
        out.backward(out)
        self.assertEqual(out[0], out[-1])
        self.assertEqual(ones_in.grad[0], ones_in.grad[-1])

    check_shape(1024 * 256 + 1, 8192)  # https://github.com/pytorch/pytorch/issues/84144
| |
| |
@dtypes(torch.float)
@dtypesIfCUDA(torch.float, torch.half)
def test_log_softmax_big(self, device, dtype):
    # log_softmax must be unchanged when a large constant is added to the input.
    def check_shift_invariance(shape):
        # generate a tensor with big numbers that are exactly representable in dtype
        # and are at a constant offset from a tensor with small numbers;
        # the log_softmax of the small and big tensors should be equal
        small = torch.randint(100, shape, dtype=dtype, device=device)
        if dtype == torch.half:
            offset = 1.5e3
        else:
            offset = 1e7
        big = small + offset
        self.assertEqual(F.log_softmax(small, -1), F.log_softmax(big, -1))

    shapes = [(16, 4)]
    if self.device_type == 'cuda':
        # test non-persistent softmax kernel
        shapes.append((4, 1536))
    for shape in shapes:
        check_shift_invariance(shape)
| |
| def test_save_lstm_compatibility(self, device): |
| # Test that saving an LSTM in PyTorch 1.7 and older can still be |
| # loaded in newer versions of PyTorch. |
| model = nn.LSTM(2, 3) |
| x = torch.randn(32, 5, 2) |
| expected = model(x) |
| |
| # Get a state dict for PyTorch 1.7 LSTM. Before PyTorch 1.8, proj_size |
| # didn't exist. |
| assert model.proj_size == 0 |
| state_dict = model.__dict__ |
| del state_dict['proj_size'] |
| |
| # load a model |
| loaded_model = nn.LSTM(2, 3) |
| loaded_model.__setstate__(state_dict) |
| result = loaded_model(x) |
| self.assertEqual(result, expected) |
| |
@onlyCUDA
@tf32_on_and_off(0.005)
def test_grid_sample_large(self, device):
    # Regression scenarios for grid_sample with huge or infinite grid
    # coordinates; each nested check is named after the issue it covers.
    def check_issue_35202():
        image = torch.rand(1, 1, 480, 640, dtype=torch.float, device=device, requires_grad=True)
        far_coords = torch.tensor([[-10059144, 67680944], [67680944, 67680944]], dtype=torch.float, device=device)
        far_coords = far_coords.unsqueeze(0).unsqueeze(0).repeat(1, 1, 1, 1)
        sampled = F.grid_sample(image, far_coords)
        self.assertEqual(sampled, torch.tensor([[[[0., 0.]]]], dtype=torch.float, device=device))
        sampled.backward(torch.ones_like(sampled))
        torch.cuda.synchronize()

    check_issue_35202()

    def check_issue_24823_inf_coord(dtype):
        volume = torch.arange(27, 0, -1, dtype=dtype, device=device).view(1, 1, 3, 3, 3)
        volume.requires_grad_()
        identity = torch.tensor([[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0]]], dtype=dtype, device=device)
        grid = F.affine_grid(identity, (1, 1, 3, 3, 3))
        # push the x coordinate of the centre sample to infinity
        grid[:, 1, 1, 1, 0] = float('inf')
        sampled = F.grid_sample(volume, grid, padding_mode='zeros')
        tol_override = {'atol': 0.005, 'rtol': 0} if dtype == torch.half else {}
        # identical to the input except for the centre sample, which becomes 0
        expected = torch.tensor([[[[[27., 26., 25.], [24., 23., 22.], [21., 20., 19.]],
                                   [[18., 17., 16.], [15., 0., 13.], [12., 11., 10.]],
                                   [[9., 8., 7.], [6., 5., 4.], [3., 2., 1.]]]]],
                                device=device, dtype=dtype)
        self.assertEqual(sampled, expected, **tol_override)
        sampled.backward(torch.ones_like(sampled))
        expected_grad = torch.ones_like(volume)
        expected_grad[0, 0, 1, 1, 1] = 0
        self.assertEqual(volume.grad, expected_grad, atol=0.005, rtol=0)

    for dt in (torch.half, torch.float, torch.double):
        check_issue_24823_inf_coord(dt)

    def check_issue_24823_huge_affine():
        theta = torch.tensor([[[-1.0e+20, 0.0, 0.0], [0.0, -1.0e+20, 0.0]]], dtype=torch.float, device=device)
        image = torch.zeros((1, 1, 4, 4), dtype=torch.float, device=device, requires_grad=True)
        grid = F.affine_grid(theta, image.size())
        sampled = F.grid_sample(image, grid)
        self.assertEqual(sampled, torch.zeros(1, 1, 4, 4, device=device, dtype=torch.float))
        sampled.backward(torch.ones_like(sampled))
        torch.cuda.synchronize()

    check_issue_24823_huge_affine()
| |
@dtypes(torch.float, torch.double)
@largeTensorTest(lambda self, device, dtype:
                 # Compute sum of the large tensor sizes:
                 # (im.numel() + small_image.numel() + small_image.grad.numel() +
                 #   large_view.grad.numel()) * sizeof(dtype)
                 32769 * (65536 + 3 * 65536 / 128) *
                 torch.tensor([], dtype=dtype).element_size())
def test_grid_sample_large_index_2d(self, device, dtype):
    # Test 64-bit indexing with grid_sample (gh-41656)
    # Try accessing the corners, there should be no segfault
    coords = torch.tensor([[[-1., -1.],
                            [+1., -1.]],

                           [[-1., +1.],
                            [+1., +1.]]], device=device, dtype=dtype)
    coords = coords.expand(1, 2, 2, 2)
    im = torch.zeros([1, 1, 32769, 65536], device=device, dtype=dtype)

    # Actually sample the four corners of the all-zero image (mirrors the 3D
    # variant of this test); without this call the corner grid is never used.
    result = F.grid_sample(im, coords, align_corners=False)
    self.assertEqual(result, torch.zeros((1, 1, 2, 2), device=device, dtype=dtype))

    # Compare sampling with large strides to the same op on a contiguous tensor
    coords = torch.rand(1, 4, 4, 2, device=device, dtype=dtype)
    # every 128th column of the big image: small in size, but with large strides
    large_view = im[..., 127::128]
    small_image = torch.rand_like(large_view)
    large_view[...] = small_image
    large_view.requires_grad, small_image.requires_grad = True, True
    self.assertTrue(
        sum(i * s for i, s in zip(large_view.size(), large_view.stride())) >= 2 ** 31,
        msg="View must use 64-bit indexing")
    for mode, padding_mode, align_corners in itertools.product(
            ('nearest', 'bilinear', 'bicubic'), ('zeros', 'border', 'reflection'), (True, False)):
        a = F.grid_sample(
            small_image, coords, mode=mode,
            padding_mode=padding_mode, align_corners=align_corners)
        a.sum().backward()

        b = F.grid_sample(
            large_view, coords, mode=mode,
            padding_mode=padding_mode, align_corners=align_corners)
        b.sum().backward()

        self.assertEqual(a, b)
        self.assertEqual(small_image.grad, large_view.grad)

        # reset the accumulated gradients before the next configuration
        small_image.grad.zero_()
        large_view.grad.zero_()
| |
@dtypes(torch.float, torch.double)
@largeTensorTest(lambda self, device, dtype:
                 # Compute sum of the large tensor sizes:
                 # (im.numel() + small_image.numel() + small_image.grad.numel() +
                 #   large_view.grad.numel()) * sizeof(dtype)
                 2 * 32769 * (32768 + 3 * 32768 / 128) *
                 torch.tensor([], dtype=dtype).element_size())
def test_grid_sample_large_index_3d(self, device, dtype):
    # Test 64-bit indexing with grid_sample (gh-41656)
    # Try accessing the corners, there should be no segfault
    corner_grid = torch.full((1, 2, 2, 2, 3), 1., device=device, dtype=dtype)
    big_volume = torch.zeros([1, 1, 2, 32769, 32768], device=device, dtype=dtype)

    corner_samples = F.grid_sample(big_volume, corner_grid, align_corners=False)
    self.assertEqual(corner_samples, torch.zeros((1, 1, 2, 2, 2), device=device, dtype=dtype))

    # Compare sampling with large strides to the same op on a contiguous tensor
    random_grid = torch.rand(1, 1, 4, 4, 3, device=device, dtype=dtype)
    # every 128th column of the big volume: small in size, but with large strides
    strided_view = big_volume[..., 127::128]
    dense_copy = torch.rand_like(strided_view)
    strided_view[...] = dense_copy
    dense_copy.requires_grad, strided_view.requires_grad = True, True
    extent = sum(size * stride for size, stride in zip(strided_view.size(), strided_view.stride()))
    self.assertTrue(extent >= 2 ** 31, msg="View must use 64-bit indexing")

    configurations = product(('nearest', 'bilinear'), ('zeros', 'border', 'reflection'), (True, False))
    for mode, padding_mode, align_corners in configurations:
        sample_kwargs = dict(mode=mode, padding_mode=padding_mode, align_corners=align_corners)

        from_dense = F.grid_sample(dense_copy, random_grid, **sample_kwargs)
        from_dense.sum().backward()

        from_strided = F.grid_sample(strided_view, random_grid, **sample_kwargs)
        from_strided.sum().backward()

        self.assertEqual(from_dense, from_strided)
        self.assertEqual(dense_copy.grad, strided_view.grad)

        # reset the accumulated gradients before the next configuration
        dense_copy.grad.zero_()
        strided_view.grad.zero_()
| |
@onlyCUDA
def test_grid_sample_half_precision(self):
    # grid_sample in half precision on CUDA must match the double-precision
    # result cast back to half, for 4-D and 5-D inputs.
    def compare_with_double(shape_in, shape_out, align_corners):
        # bicubic is only exercised for 4-D inputs
        modes = ('bilinear', 'nearest') + (('bicubic',) if len(shape_in) == 4 else ())
        for mode in modes:
            data = torch.randn(shape_in, device='cuda', dtype=torch.half)
            # grid coordinates drawn uniformly from [-1, 1)
            grid = torch.rand(shape_out, device='cuda', dtype=torch.half) * 2.0 - 1.0

            sample_kwargs = dict(mode=mode, padding_mode='zeros', align_corners=align_corners)
            out_half = F.grid_sample(data, grid, **sample_kwargs)
            out_double = F.grid_sample(data.double(), grid.double(), **sample_kwargs)

            self.assertEqual(out_half, out_double.half(), msg=f"grid_sample with mode = {mode} doesn't match")

    for align_corners in (True, False):
        compare_with_double((32, 64, 16, 16), (32, 8, 8, 2), align_corners)
        compare_with_double((32, 64, 16, 16, 16), (32, 8, 8, 8, 3), align_corners)
| |
@onlyCUDA
def test_grid_sample_bfloat16_precision(self):
    # grid_sample in bfloat16 on CUDA must match the double-precision result
    # cast back to bfloat16, for 4-D and 5-D inputs.
    def compare_with_double(shape_in, shape_out, align_corners):
        # bicubic is only exercised for 4-D inputs
        modes = ('bilinear', 'nearest') + (('bicubic',) if len(shape_in) == 4 else ())
        for mode in modes:
            data = torch.randn(shape_in, device='cuda', dtype=torch.bfloat16)
            # grid coordinates drawn uniformly from [-1, 1)
            grid = torch.rand(shape_out, device='cuda', dtype=torch.bfloat16) * 2.0 - 1.0

            sample_kwargs = dict(mode=mode, padding_mode='zeros', align_corners=align_corners)
            out_bf16 = F.grid_sample(data, grid, **sample_kwargs)
            out_double = F.grid_sample(data.double(), grid.double(), **sample_kwargs)

            self.assertEqual(out_bf16, out_double.bfloat16(), msg=f"grid_sample with mode = {mode} doesn't match")

    for align_corners in (True, False):
        compare_with_double((32, 64, 16, 16), (32, 8, 8, 2), align_corners)
        compare_with_double((32, 64, 16, 16, 16), (32, 8, 8, 8, 3), align_corners)
| |
def _test_gumbel_softmax_st_shapes(self, device, dtype, shape, dim, count_expected):
    # Draw one hard (straight-through) gumbel-softmax sample along `dim` and
    # check sign, shape and the total number of selected entries.
    logits = torch.randn(shape, dtype=torch.float, device=device).to(dtype)

    sample = F.gumbel_softmax(logits, hard=True, dim=dim)

    # No negative entries
    self.assertGreaterEqual(sample.min(), 0)
    # The shape is preserved
    self.assertTrue(sample.shape == logits.shape)
    # Exactly one choice per draw, so the total equals the number of draws
    eps = torch.finfo(sample.dtype).eps
    self.assertEqual(sample.sum(), count_expected, atol=eps, rtol=0)
| |
def _test_gumbel_softmax_straight_through(self, device, dtype):
    # Draw repeated hard gumbel-softmax samples from fixed logits and check
    # that the selection counts are consistent with softmax(logits).
    num_draws = 100

    logits = torch.tensor([[0.2, 0.8, 0.1]], device=device).reshape([1, 3])
    logits = logits.to(dtype).requires_grad_()
    probs = logits.softmax(dim=-1)

    counts = torch.zeros_like(logits)
    for _ in range(num_draws):
        sample = F.gumbel_softmax(logits, hard=True)
        counts = counts + sample

    # No negative entries (checked on the final draw)
    self.assertGreaterEqual(sample.min(), 0)
    # Each experiment should result in 1 draw.
    eps = torch.finfo(counts.dtype).eps
    self.assertEqual(counts.sum(), num_draws, atol=eps, rtol=0)

    # check that the result is asymptotically as expected.
    expected = probs * num_draws
    # ~z is approximately N(0,1) for unbiased count
    std = (expected * (1 - probs)).sqrt()
    z = (counts - expected) / std
    # A (lazy) approximate 99% two-sided test:
    # occurs with prob alpha~>=0.01 if unbiased
    worst_z = z.abs().max().item()
    self.assertLess(worst_z, 2.58)
| |
| def _test_gumbel_softmax_grad(self, device, dtype): |
| # "hard" and "not hard" should propagate same gradient. |
| logits_soft = torch.zeros(10, 10, dtype=dtype, device=device, requires_grad=True) |
| logits_hard = torch.zeros(10, 10, dtype=dtype, device=device, requires_grad=True) |
| |
| seed = torch.random.get_rng_state() |
| y_soft = F.gumbel_softmax(logits_soft, hard=False) |
| torch.random.set_rng_state(seed) |
| y_hard = F.gumbel_softmax(logits_hard, hard=True) |
| |
| y_soft.sum().backward() |
| y_hard.sum().backward() |
| |
| # 2eps = 1x addition + 1x subtraction. |
| tol = 2 * torch.finfo(dtype).eps |
| self.assertEqual(logits_soft.grad, logits_hard.grad, atol=tol, rtol=0) |
| |
@dtypesIfCUDA(torch.half, torch.float, torch.double)
@dtypesIfMPS(torch.float)
@dtypes(torch.float, torch.double)
def test_gumbel_softmax(self, device, dtype):
    """Run the Gumbel-softmax shape, sampling-frequency and gradient checks."""
    # (shape, dim, expected sum of a hard sample)
    shape_cases = (
        ([5], 0, 1),
        ([5], -1, 1),
        ([5, 4], 1, 5),
        ([5, 4, 3], 1, 5 * 3),
        ([5, 4, 3], -1, 5 * 4),
    )
    for shape, dim, count_expected in shape_cases:
        self._test_gumbel_softmax_st_shapes(device, dtype, shape=shape, dim=dim,
                                            count_expected=count_expected)
    self._test_gumbel_softmax_straight_through(device, dtype)
    self._test_gumbel_softmax_grad(device, dtype)
| |
| def _test_rnn_retain_variables(self, device, dtype): |
| rnns = [nn.LSTM(10, 20, num_layers=2).to(device, dtype), |
| nn.GRU(10, 20, num_layers=2).to(device, dtype), |
| nn.RNN(10, 20, num_layers=2).to(device, dtype)] |
| for rnn in rnns: |
| input = torch.randn(5, 6, 10, device=device, dtype=dtype, requires_grad=True) |
| output = rnn(input) |
| output[0].sum().backward(retain_graph=True) |
| grads = [input.grad.data.clone()] + [p.grad.data.clone() for p in rnn.parameters()] |
| for _ in range(4): |
| rnn.zero_grad() |
| input.grad.data.zero_() |
| output[0].sum().backward(retain_graph=True) |
| grads2 = [input.grad.data] + [p.grad.data for p in rnn.parameters()] |
| self.assertEqual(grads, grads2) |
| |
@dtypesIfCUDA(torch.half, torch.float, torch.double)
@dtypesIfMPS(torch.half, torch.float)
@dtypes(torch.double)
def test_rnn_retain_variables(self, device, dtype):
    """Run the retained-graph RNN check; on CUDA with cuDNN, repeat it with cuDNN disabled."""
    self._test_rnn_retain_variables(device, dtype)

    on_cudnn = self.device_type == 'cuda' and self.has_cudnn()
    if not on_cudnn:
        return
    with torch.backends.cudnn.flags(enabled=False):
        self._test_rnn_retain_variables(device, dtype)
| |
@onlyCUDA
@dtypes(torch.double)
def test_lstmcell_backward_only_one_output_grad(self, device, dtype):
    """Check that backpropagating through a single LSTMCell output still reaches the input.

    Undefined gradients for the unused output must not hamper the backward
    pass; see #11872.
    """
    l = torch.nn.LSTMCell(2, 3).to(device).to(dtype=dtype)
    s = torch.randn(1, 2, device=device, dtype=dtype, requires_grad=True)
    for i in range(2):
        # Clear the gradient left by the previous output. Without this the
        # accumulated value from iteration 0 would let the check for output 1
        # pass even if its own backward produced nothing.
        s.grad = None
        out = l(s)[i]
        out.sum().backward()
        self.assertFalse(s.grad is None or s.grad.abs().sum().item() == 0)
| |
def _test_rnn_mod(self, mod, inp):
    """Run gradcheck and gradgradcheck on ``mod`` applied to ``inp`` with cuDNN disabled.

    When ``inp`` is a CUDA tensor and ``TEST_WITH_ROCM`` is false, also check
    that a double backward with cuDNN enabled raises a ``RuntimeError`` asking
    to disable the CuDNN backend. Parameter and input gradients are cleared
    afterwards.
    """
    def flatten_out(mod, inp):
        out = mod(inp)
        # Flatten the module output into a flat tuple of tensors: a non-tensor
        # entry contributes each of its elements, while a tensor entry is
        # emitted once per element of its first dimension.
        return tuple([t if isinstance(t, torch.Tensor) else tt for t in out for tt in t])
    gradcheckfunc = partial(flatten_out, mod)
    with torch.backends.cudnn.flags(enabled=False):
        gradcheck(gradcheckfunc, inp, check_batched_grad=False)
        gradgradcheck(gradcheckfunc, inp, check_batched_grad=False)

    if inp.is_cuda and not TEST_WITH_ROCM:
        # Assert that we have good error message around unsupported CuDNN double backward
        # NB: we trigger double backward using .backward() instead of autograd.grad due to
        # https://github.com/pytorch/pytorch/issues/37874
        with torch.backends.cudnn.flags(enabled=True):
            result = gradcheckfunc(inp)
            result[0].sum().backward(create_graph=True)
            # Gradient of the first parameter, built with create_graph=True so it can be differentiated again.
            grad0 = next(mod.parameters()).grad
            with self.assertRaisesRegex(RuntimeError,
                                        "please disable the CuDNN backend temporarily"):
                grad0.sum().backward()

            # Here we avoid the backward(create_graph=True) memory leak
            # described in https://github.com/pytorch/pytorch/issues/7343
            for param in mod.parameters():
                param.grad = None
            inp.grad = None
| |
| # Merge into OpInfo? |
@skipMeta  # LSTM cell reuses output which was resized
@expectedFailureMPS  # TypeError: the MPS framework doesn't support float64
@dtypes(torch.double)
def test_LSTM_grad_and_gradgrad(self, device, dtype):
    """Run the first- and second-order gradient checks on a small LSTM, with and without bias."""
    hsize = 4
    # The same input tensor is reused for both bias settings.
    inp = torch.rand(1, 3, hsize, device=device, dtype=dtype, requires_grad=True)
    for use_bias in (True, False):
        lstm = torch.nn.LSTM(hsize, hsize, bias=use_bias).to(device).to(dtype)
        self._test_rnn_mod(lstm, inp)
| |
@skipMeta  # GRU cell reuses output which was resized
@expectedFailureMPS  # TypeError: the MPS framework doesn't support float64
@dtypes(torch.double)
def test_GRU_grad_and_gradgrad(self, device, dtype):
    """Run the first- and second-order gradient checks on a small GRU, with and without bias."""
    hsize = 4
    # The same input tensor is reused for both bias settings.
    inp = torch.rand(1, 3, hsize, device=device, dtype=dtype, requires_grad=True)
    for use_bias in (True, False):
        gru = torch.nn.GRU(hsize, hsize, bias=use_bias).to(device).to(dtype)
        self._test_rnn_mod(gru, inp)
| |
@skipMeta
@dtypes(torch.float32, torch.bfloat16)
@onlyCPU
def test_LSTM_differentiable_backward_using_oneDNN(self, dtype):
    """Check that differentiating through the LSTM backward gives the same result with mkldnn on and off.

    The same LSTM is run on two copies of one input, first with mkldnn
    disabled and then enabled; the input derivative and the gradient of its
    sum with respect to the input must agree between the two runs.
    """
    # NOTE(review): ``dtype`` is not used below; module and inputs are created
    # with the default dtype for both parametrizations - confirm whether
    # bfloat16 coverage was intended.
    batch = 10
    seq_len = 12
    input_size = 3
    net = nn.LSTM(input_size, 3, 20, batch_first=True)
    x = torch.rand(batch, seq_len, input_size)
    x1 = x.clone().requires_grad_(True)
    x2 = x.clone().requires_grad_(True)

    def input_derivative(x_leaf, mkldnn_enabled):
        # Forward + differentiable backward under the requested mkldnn setting,
        # then backpropagate the summed derivative into ``x_leaf.grad``.
        torch._C._set_mkldnn_enabled(mkldnn_enabled)
        out, _ = net(x_leaf)
        der_out = torch.autograd.grad(out, x_leaf,
                                      grad_outputs=torch.ones_like(out),
                                      retain_graph=True,
                                      create_graph=True)[0]
        der_out.sum().backward(retain_graph=True)
        return der_out

    # The mkldnn switch is process-global: restore it even if a run raises so
    # a failure here cannot change the backend used by later tests.
    mkldnn_was_enabled = torch._C._get_mkldnn_enabled()
    try:
        der_out1 = input_derivative(x1, False)
        der_out2 = input_derivative(x2, True)
    finally:
        torch._C._set_mkldnn_enabled(mkldnn_was_enabled)

    # assertTrue instead of a bare ``assert`` so the checks survive ``python -O``.
    self.assertTrue(torch.allclose(der_out1, der_out2))
    self.assertTrue(torch.allclose(x1.grad, x2.grad))
| |
@onlyCUDA
def test_upsamplingNearest1d_launch_config(self, device):
    """Compare device and CPU results of ``nn.Upsample(scale_factor=2)`` on a (2**25, 1, 1) input."""
    upsample = nn.Upsample(scale_factor=2)
    device_input = torch.rand(2**25, 1, 1, device=device)
    device_output = upsample(device_input)
    cpu_output = upsample(device_input.cpu())
    self.assertEqual(cpu_output, device_output)
| |
@onlyCUDA
def test_upsamplingNearest2d_launch_config(self, device):
    """Compare device and CPU results of ``nn.Upsample(scale_factor=2)`` on a (2**25, 1, 1, 1) input."""
    upsample = nn.Upsample(scale_factor=2)
    device_input = torch.rand(2**25, 1, 1, 1, device=device)
    device_output = upsample(device_input)
    cpu_output = upsample(device_input.cpu())
    self.assertEqual(cpu_output, device_output)
| |
@onlyCUDA
@gcIfJetson
def test_upsamplingNearest3d_launch_config(self, device):
    """Compare device and CPU results of ``nn.Upsample(scale_factor=2)`` on a (2**25, 1, 1, 1, 1) input."""
    upsample = nn.Upsample(scale_factor=2)
    device_input = torch.rand(2**25, 1, 1, 1, 1, device=device)
    device_output = upsample(device_input)
    cpu_output = upsample(device_input.cpu())
    self.assertEqual(cpu_output, device_output)
| |
@unittest.expectedFailure
@skipIfRocm
@onlyCUDA
def test_upsamplingNearest2d_launch_fail(self, device):
    """Upsample an input whose launch grid exceeds the y-dimension limit; marked as an expected failure."""
    upsample = nn.Upsample(scale_factor=2)
    # launch grid_y == 2**16 (larger than maximum y-dimension limit 65535)
    oversized = torch.rand(1, 1, 2**15, 2**8, device=device)
    upsample(oversized)
| |
@onlyCUDA
@skipCUDAIfNotRocm
def test_upsamplingNearest2d_launch_rocm(self, device):
    """Run the same upsampling as test_upsamplingNearest2d_launch_fail, which should run OK on ROCm."""
    upsample = nn.Upsample(scale_factor=2)
    large_input = torch.rand(1, 1, 2**15, 2**8, device=device)
    upsample(large_input)
| |
@onlyCUDA
@skipCUDAIfCudnnVersionLessThan(7600)
def test_CTCLoss_cudnn(self, device):
    """Compare CTC loss with cuDNN enabled against the cuDNN-disabled path and the Python reference."""
    def _helper(zero_infinity):
        target_lengths = [30, 25, 20]
        input_lengths = [50, 50, 50]
        # Integer targets are created without a device argument (i.e. on the CPU).
        targets = torch.randint(1, 15, (sum(target_lengths),), dtype=torch.int)
        log_probs = torch.randn(50, 3, 15, dtype=torch.float, device=device).log_softmax(2).requires_grad_()
        log_probs_ref = log_probs.detach().clone().requires_grad_()

        # Loss and gradient with cuDNN enabled.
        with torch.backends.cudnn.flags(enabled=True):
            loss_cudnn = F.ctc_loss(log_probs, targets, input_lengths, target_lengths,
                                    zero_infinity=zero_infinity)
            loss_cudnn.backward()

        expected = ctcloss_reference(log_probs, targets.cuda(), input_lengths, target_lengths).float()

        # Same computation with cuDNN disabled, on an independent leaf tensor.
        with torch.backends.cudnn.flags(enabled=False):
            loss_native = F.ctc_loss(log_probs_ref, targets.cuda().long(), input_lengths, target_lengths,
                                     zero_infinity=zero_infinity)
            loss_native.backward()

        self.assertEqual(loss_cudnn, expected)
        self.assertEqual(loss_native, loss_cudnn)
        self.assertEqual(log_probs.grad, log_probs_ref.grad)

    for zero_infinity in (True, False):
        _helper(zero_infinity=zero_infinity)
| |
| def _CTCLoss_gen_losses(self, device, input_length, vocab_size, target_length, reduction, use_module_form): |
| batch_size = 1 |
| log_probs = torch.randn(input_length, batch_size, vocab_size, dtype=torch.float, device=device) \ |
| .log_softmax(2).requires_grad_() |
| targets = torch.randint(low=1, high=vocab_size - 1, size=(batch_size, target_length), |
| dtype=torch.int, device=device) |
| input_lengths = batch_size * [input_length] |
| target_lengths = batch_size * [target_length] |
| |
| log_probs_no_bd = log_probs.squeeze(1).detach().clone().requires_grad_() |
| targets_no_bd = targets.squeeze(0).detach().clone() |
| input_lengths_no_bd = torch.tensor(input_length) |
| target_lengths_no_bd = torch.tensor(target_length) |
| |
| # currently only length 2 and 1 right now, but left flexible for additional potential cases |
| log_probs_refs = [log_probs.detach().clone().requires_grad_() for _ in range(2)] |
| log_probs_no_bd_refs = [log_probs_no_bd.detach().clone().requires_grad_() for _ in range(1)] |
| |
| losses = [] |
| losses_no_bd = [] |
| |
| has_cuda = torch.cuda.is_available() |
| has_cudnn = has_cuda and 'cuda' in device and self.has_cudnn() |
| # cudnn requires a cpu target |
| if has_cuda and has_cudnn: |
| targets = targets.cpu() |
| targets_no_bd = targets_no_bd.cpu() |
| |
| ctc_loss = ( |
| nn.CTCLoss(reduction=reduction, zero_infinity=True) |
| if use_module_form |
| else partial(torch.nn.functional.ctc_loss, reduction=reduction, zero_infinity=True) |
| ) |
| |
| with torch.backends.cudnn.flags(enabled=has_cudnn): |
| # batched case. log_probs.shape = (T, N, C), targets = (N, S), input_lengths/target_lengths = (N,) |
| losses.append(ctc_loss(log_probs_refs[0], targets, input_lengths, target_lengths)) |
| # batched case. input.shape = (T, N, C), targets = (S,), input_lengths/target_lengths = (N,) |
| losses.append(ctc_loss(log_probs_refs[1], targets_no_bd, input_lengths, target_lengths)) |
| # unbatched case. input.shape = (T, C), targets = (S,), input_lengths/target_lengths = (N,) |
| losses_no_bd.append(ctc_loss(log_probs_no_bd_refs[0], targets_no_bd, |
| input_lengths_no_bd, target_lengths_no_bd)) |
| |
| for loss in losses + losses_no_bd: |
| loss.backward() |
| |
| return losses, losses_no_bd, log_probs_refs, log_probs_no_bd_refs |
| |
| def _assertEqual_list(self, expected, list_to_compare, atol=None, rtol=None): |
| for ele in list_to_compare: |
| self.assertEqual(expected, ele, atol=atol, rtol=rtol) |
| |
@expectedFailureMPS  # NotImplementedError: aten::_ctc_loss https://github.com/pytorch/pytorch/issues/77764
@parametrize_test("reduction", ['none', 'mean', 'sum'])
@parametrize_test("use_module_form", [True, False])
def test_CTCLoss_no_batch_dim(self, device, reduction, use_module_form):
    """Check that CTC loss values, gradients and shapes agree between batched and unbatched inputs."""
    input_length, vocab_size, target_length = 40, 3, 12

    losses, losses_no_bd, log_probs_refs, log_probs_no_bd_refs = self._CTCLoss_gen_losses(
        device, input_length, vocab_size, target_length, reduction, use_module_form)
    first_loss = losses[0]
    first_grad = log_probs_refs[0].grad

    # test output values
    self._assertEqual_list(first_loss, losses[1:], atol=1e-4, rtol=0)
    self._assertEqual_list(first_loss.squeeze(0), losses_no_bd, atol=1e-4, rtol=0)

    # test gradient values
    other_grads = [ref.grad for ref in log_probs_refs[1:]]
    self._assertEqual_list(first_grad, other_grads, atol=1e-4, rtol=0)
    no_bd_grads = [ref.grad for ref in log_probs_no_bd_refs]
    self._assertEqual_list(first_grad.squeeze(1), no_bd_grads, atol=1e-4, rtol=0)

    # checking the output's shape
    # batch dim case should be (N,). no batch dim case should be ()
    batched_loss_shape = () if reduction != 'none' else (1,)
    self._assertEqual_list(batched_loss_shape, [loss.shape for loss in losses])
    self._assertEqual_list((), [loss.shape for loss in losses_no_bd])

    # checking the gradient's shape
    # batch dim case should have shape (T, N, C). no batch dim case should have shape (T, C)
    self._assertEqual_list((input_length, 1, vocab_size), [ref.grad.shape for ref in log_probs_refs])
    self._assertEqual_list((input_length, vocab_size), [ref.grad.shape for ref in log_probs_no_bd_refs])
| |
| def _ordered_sequence(self, device, dtype): |
| """Create ordered list of random sequences""" |
| seqs = [torch.empty(random.randint(1, 6), device=device, dtype=dtype) |
| for _ in range(5)] |
| seqs = [s.random_(-128, 128) for s in seqs] |
| ordered = sorted(seqs, key=len, reverse=True) |
| return ordered |
| |
def _padded_sequence(self, device, dtype):
    """Return a padded tensor built from random ordered sequences, together with their lengths."""
    sequences = self._ordered_sequence(device, dtype)
    lengths = list(map(len, sequences))
    return rnn_utils.pad_sequence(sequences), lengths
| |
@onlyCUDA
def test_device_mask(self, device):
    """Check that a packed sequence moved to ``device`` stays on CUDA (and float) after unpacking."""
    for enforce_sorted in (True, False):
        padded, lengths = self._padded_sequence('cpu', torch.float)
        cpu_packed = rnn_utils.pack_padded_sequence(padded, lengths, enforce_sorted=enforce_sorted)
        self.assertFalse(cpu_packed.is_cuda)

        device_packed = cpu_packed.to(device)
        self.assertTrue(device_packed.is_cuda)

        unpacked, _ = rnn_utils.pad_packed_sequence(device_packed)
        self.assertTrue(unpacked.is_cuda)
        self.assertEqual(unpacked.dtype, torch.float)
| |
@onlyCUDA
def test_overwrite_module_params_on_conversion_cpu_device(self, device):
    """Check how ``module.to(device)`` treats existing parameter references under both settings of
    ``torch.__future__.set_overwrite_module_params_on_conversion``.
    """
    # Test that under the current default settings
    # (`torch.__future__.get_overwrite_module_params_on_conversion() == False`),
    # a view to a module's parameters is not pointing to the same storage as
    # its base variable after converting the module to a different device.
    m = nn.Linear(20, 10)
    mw = m.weight[:]
    m.to(device)
    with torch.no_grad():
        # Without using `torch.no_grad()`, this will leak CUDA memory.
        # (Issue is filed at https://github.com/pytorch/pytorch/issues/21875)
        mw[0][0] = 5
        self.assertTrue(mw[0][0].device.type == "cpu")
        self.assertTrue(mw._base[0][0].device.type == "cuda")

    # Remember the global flag so it is restored to whatever it was rather
    # than being forced to False when this test finishes.
    previous_setting = torch.__future__.get_overwrite_module_params_on_conversion()
    try:
        torch.__future__.set_overwrite_module_params_on_conversion(True)

        # Test that if `torch.__future__.get_overwrite_module_params_on_conversion() == True`,
        # a view to a module's parameters is still pointing to the same storage as
        # its base variable after converting the module to a different device.
        m = nn.Linear(20, 10)
        mw = m.weight[:]
        m.to(device)
        with torch.no_grad():
            mw[0][0] = 5
        self.assertTrue(mw[0][0] == mw._base[0][0])

        # Test that if `torch.__future__.get_overwrite_module_params_on_conversion() == True`,
        # `cpu_module.to("cuda")` doesn't preserve previous references to
        # `cpu_module`'s parameters or gradients.
        m = nn.Linear(20, 10)
        m.weight.grad = torch.randn(10, 20)
        weight_ref = m.weight
        weight_grad_ref = m.weight.grad
        m.to(device)
        self.assertNotEqual(weight_ref.device, m.weight.device)
        self.assertNotEqual(weight_grad_ref.device, m.weight.grad.device)
    finally:
        torch.__future__.set_overwrite_module_params_on_conversion(previous_setting)
| |
@onlyCUDA
@dtypes(torch.half, torch.float)
def test_softmax(self, device, dtype):
    """Check that softmax with ``dtype=torch.float`` matches softmax of the float-converted input bit for bit."""
    typed_input = torch.rand(32, 100, device=device, dtype=dtype, requires_grad=True)
    float_input = typed_input.to(torch.float).detach().requires_grad_(True)

    typed_out = F.softmax(typed_input, dim=-1, dtype=torch.float)
    float_out = F.softmax(float_input, dim=-1)
    # should be bitwise equal
    self.assertEqual(typed_out, float_out, atol=0, rtol=0)

    grad_output = torch.empty_like(float_out).uniform_()
    typed_out.backward(grad_output)
    float_out.backward(grad_output)
    # should be bitwise equal
    self.assertEqual(typed_input.grad, float_input.grad.to(dtype), atol=0, rtol=0)
| |
def _test_batchnorm_grad(self, device, dtype=torch.double):
    """Run grad and gradgrad checks on ``F.batch_norm`` in eval and training mode on arange-based inputs."""
    bs, n_feat, size_feat = 4, 5, 6
    tensor_opts = dict(device=device, dtype=dtype)

    inp = torch.arange(bs * n_feat * size_feat, requires_grad=True, **tensor_opts).view(bs, n_feat, size_feat)
    weight = torch.arange(1, n_feat + 1, requires_grad=True, **tensor_opts)
    bias = torch.arange(n_feat, requires_grad=True, **tensor_opts)
    running_mean = 1 - torch.arange(n_feat, **tensor_opts)
    running_var = 2 * torch.arange(n_feat, **tensor_opts)

    momentum, eps = 0.1, 0.0001
    for training in (False, True):
        bn_args = (inp, running_mean, running_var, weight, bias, training, momentum, eps)
        _assertGradAndGradgradChecks(self, F.batch_norm, bn_args)
| |
@expectedFailureMPS  # TypeError: the MPS framework doesn't support float64
def test_batchnorm_grad(self, device):
    """Run the batch-norm gradient checks; on CUDA with cuDNN, repeat them with cuDNN disabled."""
    self._test_batchnorm_grad(device)

    on_cudnn = self.device_type == 'cuda' and self.has_cudnn()
    if not on_cudnn:
        return
    with torch.backends.cudnn.flags(enabled=False):
        self._test_batchnorm_grad(device)
| |
@onlyCUDA
def test_layernorm_half_precision(self):
    """Check that half-precision layer_norm equals the float32 result cast back to half, exactly."""
    width = 128
    half_opts = dict(device="cuda", dtype=torch.half)
    x = torch.rand(1, 5, width, **half_opts) * 0.1
    normalized_shape = (width,)
    weight = torch.ones(width, **half_opts)
    bias = torch.zeros(width, **half_opts)
    eps = 1e-5

    output_fp16 = torch.layer_norm(x, normalized_shape, weight, bias, eps)
    # Reference: same computation in float32, rounded to half at the end.
    output_fp32 = torch.layer_norm(x.float(), normalized_shape, weight.float(), bias.float(), eps)
    self.assertEqual(output_fp16, output_fp32.half(), atol=0, rtol=0)
| |
@onlyCUDA
def test_layernorm_weight_bias(self):
    """Check that a ``None`` weight/bias in layer_norm behaves like an all-ones weight / all-zeros bias."""
    width = 128
    float_opts = dict(device="cuda", dtype=torch.float32)
    x = torch.rand(1, 5, width, **float_opts) * 0.1
    normalized_shape = (width,)
    data = torch.randn(width, **float_opts)
    unit_weight = torch.ones(width, **float_opts)
    zero_bias = torch.zeros(width, **float_opts)
    eps = 1e-5

    # Missing weight vs. explicit ones, with ``data`` as the bias.
    out_none_weight = torch.layer_norm(x, normalized_shape, None, data, eps)
    out_one_weight = torch.layer_norm(x, normalized_shape, unit_weight, data, eps)
    self.assertEqual(out_none_weight, out_one_weight)

    # Missing bias vs. explicit zeros, with ``data`` as the weight.
    out_none_bias = torch.layer_norm(x, normalized_shape, data, None, eps)
    out_zero_bias = torch.layer_norm(x, normalized_shape, data, zero_bias, eps)
    self.assertEqual(out_none_bias, out_zero_bias)
| |
@expectedFailureMPS  # TypeError: the MPS framework doesn't support float64
def test_hardsigmoid_grad(self, device):
    """Run gradcheck on ``F.hardsigmoid`` with a double-precision random input."""
    noise = torch.randn(4, 16, 16, device=device, dtype=torch.double)
    inputs = (noise - 0.5) * 10
    inputs.requires_grad = True
    self.assertTrue(gradcheck(F.hardsigmoid, (inputs,)))
| |
| # currently fails on XLA |
@onlyNativeDeviceTypes
def test_hardswish_grad(self, device):
    """Run gradcheck on ``F.hardswish`` with a double-precision random input."""
    noise = torch.randn(4, 16, 16, device=device, dtype=torch.double)
    inputs = (noise - 0.5) * 10
    inputs.requires_grad = True
    self.assertTrue(gradcheck(F.hardswish, (inputs,)))
| |
| |
| def _test_batchnorm_eval(self, ndim, device, dtype, module_dtype=None): |
| module_dtype = module_dtype or dtype |
| module = nn.BatchNorm1d(3).to(device, module_dtype) |
| module.eval() |
| |
| data = torch.rand([3] * ndim, device=device, dtype=dtype, requires_grad=True) |
| grad = torch.rand([3] * ndim, device=device, dtype=dtype) |
| |
| # 1st pass |
| res1 = module(data) |
| res1.backward(grad) |
| grad1 = data.grad.clone() |
| |
| # 2nd pass |
| if data.grad is not None: |
| data.grad.data.zero_() |
| |
| res2 = module(data) |
| res2.backward(grad) |
| grad2 = data.grad.clone() |
| self.assertEqual(res1, res2) |
| self.assertEqual(grad1, grad2) |
| |
| # track_running_stats=False |
| module = nn.BatchNorm1d(3, track_running_stats=False).to(device, module_dtype) |
| |
| data = torch.rand(4, 3, device=device, dtype=dtype, requires_grad=True) |
| grad = torch.rand(4, 3, device=device, dtype=dtype) |
| |
| # 1st pass |
| res1 = module(data) |
| res1.backward(grad) |
| grad1 = data.grad.clone() |
| |
| # set eval |
| module.eval() |
| |
| # 2nd pass |
| if data.grad is not None: |
| data.grad.data.zero_() |
| |
| res2 = module(data) |
| res2.backward(grad) |
| grad2 = data.grad.clone() |
| self.assertEqual(res1, res2) |
| self.assertEqual(grad1, grad2) |
| |
@dtypes(torch.float)
@dtypesIfCUDA(torch.float, torch.bfloat16)
def test_batchnorm_eval(self, device, dtype):
    """Run the BatchNorm eval checks for 2-d and 3-d inputs; on CUDA with cuDNN, repeat with cuDNN disabled."""
    ndims = (2, 3)
    for ndim in ndims:
        self._test_batchnorm_eval(ndim, device, dtype)

    if self.device_type == 'cuda' and self.has_cudnn():
        with torch.backends.cudnn.flags(enabled=False):
            for ndim in ndims:
                self._test_batchnorm_eval(ndim, device, dtype)
| |
@onlyCUDA
@dtypes(torch.bfloat16, torch.half)
def test_batchnorm_eval_mixed(self, device, dtype):
    """Run the BatchNorm eval checks with bfloat16/half input and a float module."""
    ndims = (2, 3)
    # Test low-precision input with float module
    for ndim in ndims:
        self._test_batchnorm_eval(ndim, device, dtype, torch.float)

    if self.device_type == 'cuda' and self.has_cudnn():
        with torch.backends.cudnn.flags(enabled=False):
            for ndim in ndims:
                self._test_batchnorm_eval(ndim, device, dtype, torch.float)
| |
| def _test_batchnorm_affine(self, ndim, device, dtype, module_dtype=None): |
| # Compare affine against no-op weights and bias |
| module_dtype = module_dtype or dtype |
| module = nn.BatchNorm1d(3, affine=False).to(device, module_dtype) |
| module_affine = nn.BatchNorm1d(3, affine=True).to(device, module_dtype) |
| with torch.no_grad(): |
| module_affine.weight.fill_(1.0) |
| module_affine.bias.zero_() |
| |
| data = torch.rand([3] * ndim, device=device, dtype=dtype, requires_grad=True) |
| grad = torch.ones_like(data, requires_grad=False) |
| |
| # With weights all ones and bias all zeros |
| res1 = module_affine(data) |
| res1.backward(grad) |
| grad1 = data.grad.clone() |
| data.grad.zero_() |
| |
| # Without any weights or bias |
| res2 = module(data) |
| res2.backward(grad) |
| grad2 = data.grad |
| |
| self.assertEqual(res1, res2) |
| self.assertEqual(grad1, grad2) |
| |
@dtypes(torch.float)
@dtypesIfCUDA(torch.float, torch.bfloat16)
def test_batchnorm_affine(self, device, dtype):
    """Run the affine-vs-plain BatchNorm comparison for 2-d and 3-d inputs; on CUDA with cuDNN, repeat with cuDNN disabled."""
    ndims = (2, 3)
    for ndim in ndims:
        self._test_batchnorm_affine(ndim, device, dtype)

    if self.device_type == 'cuda' and self.has_cudnn():
        with torch.backends.cudnn.flags(enabled=False):
            for ndim in ndims:
                self._test_batchnorm_affine(ndim, device, dtype)
| |
@onlyCUDA
@dtypes(torch.bfloat16, torch.half)
def test_batchnorm_affine_mixed(self, device, dtype):
    """Run the affine-vs-plain BatchNorm comparison with bfloat16/half input and a float module (cuDNN disabled)."""
    cudnn_settings = [False]
    if self.device_type == 'cuda' and self.has_cudnn():
        # TODO: Test fails with cudnn, see gh-62034
        # cudnn_settings = [False, True]
        pass

    # Test low-precision input with float module
    for use_cudnn in cudnn_settings:
        with torch.backends.cudnn.flags(enabled=use_cudnn):
            for ndim in (2, 3):
                self._test_batchnorm_affine(ndim, device, dtype, torch.float)
| |
| def _test_batchnorm_simple_average(self, device, dtype, module_dtype=None): |
| module_dtype = module_dtype or dtype |
| module = nn.BatchNorm1d(3, momentum=None).to(dtype=module_dtype, device=device) |
| zeros = torch.zeros(3, dtype=module_dtype, device=device) |
| ones = torch.ones(3, dtype=module_dtype, device=device) |
| self.assertEqual(module.running_mean, zeros) |
| self.assertEqual(module.running_var, ones) |
| |
| data1 = torch.rand(4, 3, dtype=dtype, device=device) |
| data2 = torch.rand(4, 3, dtype=dtype, device=device) |
| |
| # 1st pass |
| res1 = module(data1) |
| running_mean1 = module.running_mean.clone() |
| running_var1 = module.running_var.clone() |
| self.assertNotEqual(running_mean1, zeros) |
| self.assertNotEqual(running_var1, ones) |
| |
| # reset stats |
| module.reset_running_stats() |
| self.assertEqual(module.running_mean, zeros) |
| self.assertEqual(module.running_var, ones) |
| |
| # 2nd pass |
| res2 = module(data2) |
| running_mean2 = module.running_mean.clone() |
| running_var2 = module.running_var.clone() |
| self.assertNotEqual(running_mean2, zeros) |
| self.assertNotEqual(running_var2, ones) |
| |
| # reset stats |
| module.reset_running_stats() |
| self.assertEqual(module.running_mean, zeros) |
| self.assertEqual(module.running_var, ones) |
| |
| # 3rd (combined) pass |
| res3 = module(data1) |
| res4 = module(data2) |
| self.assertEqual(res3, res1) |
| self.assertEqual(res4, res2) |
| self.assertEqual(module.running_mean, (running_mean1 + running_mean2) / 2) |
| self.assertEqual(module.running_var, (running_var1 + running_var2) / 2) |
| |
@dtypes(torch.float)
@dtypesIfCUDA(torch.float, torch.bfloat16)
def test_batchnorm_simple_average(self, device, dtype):
    """Run the simple-average running-stats check; on CUDA with cuDNN, repeat with cuDNN disabled."""
    self._test_batchnorm_simple_average(device, dtype)

    on_cudnn = self.device_type == 'cuda' and self.has_cudnn()
    if not on_cudnn:
        return
    with torch.backends.cudnn.flags(enabled=False):
        self._test_batchnorm_simple_average(device, dtype)
| |
@onlyCUDA
@dtypes(torch.bfloat16, torch.half)
def test_batchnorm_simple_average_mixed(self, device, dtype):
    """Run the simple-average running-stats check with bfloat16/half input and a float module."""
    self._test_batchnorm_simple_average(device, dtype, torch.float)

    on_cudnn = self.device_type == 'cuda' and self.has_cudnn()
    if not on_cudnn:
        return
    with torch.backends.cudnn.flags(enabled=False):
        self._test_batchnorm_simple_average(device, dtype, torch.float)
| |
@onlyNativeDeviceTypes
@dtypes(torch.float, torch.double)
def test_grid_sample_nan_inf(self, device, dtype):
    """Check that nearest-mode grid_sample of an all-zero input returns zeros for NaN/inf grid coordinates."""
    zero_input = torch.zeros([1, 1, 3, 3], device=device, dtype=dtype)
    grid = torch.tensor([[[[nan, 0], [0, inf]]]], device=device, dtype=dtype)
    for padding_mode in ('reflection', 'border', 'zeros'):
        sample = F.grid_sample(input=zero_input, grid=grid, mode='nearest',
                               padding_mode=padding_mode, align_corners=False)
        expected = torch.zeros([1, 1, 1, 2], device=device, dtype=dtype)
        self.assertEqual(sample, expected)
| |
@expectedFailureMPS  # NotImplementedError aten::_ctc_loss https://github.com/pytorch/pytorch/issues/77764
def test_CTCLoss_empty_target(self, device):
    """Check that for empty targets the CTC loss is non-negative and equals ``-sum_t log_probs[t, n, 0]``."""
    input_lengths = [50, 50, 50]

    def unreduced_loss(target_lengths):
        # Random targets/log-probs for the given target lengths; returns (log_probs, per-sample loss).
        targets = torch.randint(1, 15, (sum(target_lengths),), dtype=torch.long, device=device)
        log_probs = torch.randn(50, 3, 15, dtype=torch.double, device=device).log_softmax(2)
        loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths, reduction='none')
        return log_probs, loss

    # Every target is empty.
    log_probs, loss = unreduced_loss([0, 0, 0])
    self.assertTrue((loss >= 0).all().item())
    self.assertEqual(-log_probs.sum(0)[:, 0], loss)

    # Only the first and last targets are empty.
    log_probs, loss = unreduced_loss([0, 9, 0])
    self.assertTrue((loss >= 0).all().item())
    self.assertEqual(-log_probs.sum(0)[[0, 2], 0], loss[[0, 2]])
| |
| # Merge into OpInfo? |
@skipCUDAIf(True, """Test is flaky on Linux and Windows, typical error message:
https://github.com/pytorch/pytorch/issues/34870""")
@expectedFailureMPS  # NotImplementedError aten::_ctc_loss https://github.com/pytorch/pytorch/issues/77764
def test_ctc_loss(self, device):
    """Run gradcheck on ``log_softmax`` followed by ``ctc_loss`` for several length configurations.

    Each configuration is ``(input_length, vary_lengths, zero_mode)``; the
    ``zero_mode`` decides whether none, some, or all target lengths are zero.
    """
    batch_size = 64
    num_labels = 101
    target_length = 15
    # Number of independent inputs that gradcheck perturbs; the full logits are derived from them.
    gradcheck_input_size = 10

    # Modes controlling which target lengths are forced to zero.
    ZERO_NONE = 0
    ZERO_SOME = 1
    ZERO_ALL = 2

    # input_length, vary_lengths, zero_lengths
    tests = [(150, False, ZERO_NONE),
             (150, True, ZERO_NONE),
             (50, True, ZERO_SOME),
             (50, True, ZERO_ALL)]

    # Extra configurations when the device string names a CUDA device.
    if 'cuda' in device:
        tests += [(50, False, ZERO_NONE),
                  (50, True, ZERO_NONE),
                  (150, True, ZERO_SOME),
                  (150, True, ZERO_ALL)]

    for input_length, vary_lengths, zero_mode in tests:
        targets = torch.randint(1, num_labels, (batch_size, target_length),
                                device=device, dtype=torch.long)
        x = torch.randn(gradcheck_input_size, dtype=torch.double, device=device, requires_grad=True)
        # Random factors used to expand ``x`` to at least input_length * batch_size * num_labels values.
        tile_factors = torch.randn(input_length * batch_size * num_labels // gradcheck_input_size + 1,
                                   device=device)
        # The first sample always gets a random length; the others only when ``vary_lengths`` is set.
        input_lengths = [(torch.randint(input_length // 2, input_length + 1, ()).item()
                          if vary_lengths or i == 0 else input_length) for i in range(batch_size)]
        if zero_mode == ZERO_ALL:
            target_lengths = [0 for _ in range(batch_size)]
        else:
            target_lengths = [(torch.randint(target_length // 2, target_length + 1, ()).item()
                               if vary_lengths else target_length) for _ in range(batch_size)]
            if zero_mode == ZERO_SOME:
                # Zero the target length at 10 randomly drawn batch indices (repeats are possible).
                idxes = torch.randint(0, batch_size, (10,))
                for i in idxes:
                    target_lengths[i] = 0

        def ctc_after_softmax(x):
            # Outer product of ``x`` with the tile factors, flattened, truncated and
            # reshaped to (input_length, batch_size, num_labels).
            x_full = ((x[:, None] * tile_factors[None, :]).view(-1)[:input_length * batch_size * num_labels]
                      .view(input_length, batch_size, num_labels))
            log_probs = torch.log_softmax(x_full, 2)
            return torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths)

        # The closure reads this iteration's loop variables and is used right away.
        gradcheck(ctc_after_softmax, [x])
| |
@onlyCUDA
@skipCUDAIfRocm(msg="skipped Cudnn test on ROCm")
@skipCUDAIfCudnnVersionLessThan(7600)
def test_ctc_loss_cudnn(self, device):
    """Compare the CTC-loss gradient computed with cuDNN disabled against the cuDNN-backed one."""
    batch_size = 16
    input_length = 30
    num_labels = 101
    target_length = 15

    targets = torch.randint(1, num_labels, (batch_size * target_length,),
                            device='cuda', dtype=torch.long)
    logits = torch.randn(input_length, batch_size, num_labels, device='cuda', dtype=torch.float)
    log_probs = torch.log_softmax(logits, 2)
    log_probs.requires_grad_()

    input_lengths = [input_length] * batch_size
    target_lengths = [target_length] * batch_size
    grad_out = torch.randn(batch_size, device='cuda', dtype=torch.float)

    # Loss and gradient with cuDNN disabled.
    with torch.backends.cudnn.flags(enabled=False):
        loss_native = F.ctc_loss(log_probs, targets, input_lengths, target_lengths, reduction='none')
        grad_native, = torch.autograd.grad(loss_native, log_probs, grad_out)

    # Same loss with CPU int32 targets; the grad_fn name must show that the Cudnn path was taken.
    cpu_int_targets = targets.to('cpu', torch.int32)
    loss_cudnn = F.ctc_loss(log_probs, cpu_int_targets,
                            input_lengths, target_lengths, reduction='none')
    self.assertTrue("Cudnn" in str(loss_cudnn.grad_fn))
    grad_cudnn, = torch.autograd.grad(loss_cudnn, log_probs, grad_out)
    self.assertEqual(grad_cudnn, grad_native, atol=1e-4, rtol=0)
| |
@onlyCUDA
@skipCUDAIfRocm(msg="skipped Cudnn test on ROCm")
@skipCUDAIfCudnnVersionLessThan(8000)
def test_ctc_loss_cudnn_tensor(self, device):
    """Compare the CTC-loss gradient with cuDNN disabled against the cuDNN-backed one, using tensor-valued lengths."""
    batch_size = 16
    input_length = 30
    num_labels = 101
    target_length = 15
    targets = torch.randint(1, num_labels, (batch_size * target_length,),
                            device='cuda', dtype=torch.long)
    log_probs = torch.log_softmax(torch.randn(input_length, batch_size, num_labels, device='cuda', dtype=torch.float), 2)
    log_probs.requires_grad_()

    # Per-sample input lengths from 15 up to ``input_length``, as a CUDA tensor.
    # (The list-valued assignment that used to precede this line was dead code:
    # it was overwritten before being read.)
    input_lengths = torch.linspace(start=15, end=input_length, steps=batch_size, dtype=torch.long, device='cuda')
    target_lengths = torch.tensor(batch_size * [target_length], dtype=torch.long, device='cuda')
    grad_out = torch.randn(batch_size, device='cuda', dtype=torch.float)

    # Loss and gradient with cuDNN disabled.
    with torch.backends.cudnn.flags(enabled=False):
        loss_native = torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths, reduction='none')
        grad_native, = torch.autograd.grad(loss_native, log_probs, grad_out)

    # Same loss with int32 CUDA tensors for targets and lengths; the grad_fn
    # name must show that the Cudnn path was taken.
    loss_cudnn = torch.nn.functional.ctc_loss(log_probs,
                                              targets.to('cuda', torch.int32),
                                              input_lengths.to('cuda', torch.int32),
                                              target_lengths.to('cuda', torch.int32),
                                              reduction='none')
    self.assertTrue("Cudnn" in str(loss_cudnn.grad_fn))
    grad_cudnn, = torch.autograd.grad(loss_cudnn, log_probs, grad_out)
    self.assertEqual(grad_cudnn, grad_native, atol=1e-4, rtol=0)
| |
@expectedFailureMPS  # RuntimeError: LSTM with projections is not currently supported with MPS.
@dtypesIfCUDA(torch.half, torch.float, torch.double)
@dtypes(torch.float)
@tf32_on_and_off(0.005)
@skipIfTorchDynamo("TorchDynamo fails here for unknown reasons")
def test_variable_sequence(self, device, dtype):
    # Runs a 2-layer bidirectional LSTM on a packed batch of variable-length sequences and
    # checks outputs, final states and gradients against running every sequence on its own.
    fwd_tol = dtype2prec_DONTUSE[dtype]
    # Parameter gradients are compared with a looser bound in half precision.
    param_grad_tol = 4e-2 if dtype == torch.float16 else dtype2prec_DONTUSE[dtype]

    def pad(seq, total_len):
        # Zero-pad `seq` along dim 0 until it has `total_len` steps.
        missing = total_len - seq.size(0)
        if missing == 0:
            return seq
        filler = seq.new_zeros(missing, *seq.size()[1:])
        return torch.cat([seq, filler])

    def slice_batch(states, b):
        # Select batch entry `b` from an (h, c) pair, keeping the batch dimension.
        if states is None:
            return None
        h, c = states[0], states[1]
        return (h[:, b:b + 1, :].contiguous(), c[:, b:b + 1, :].contiguous())

    def check_lengths(lengths, enforce_sorted, use_default_hiddens, proj_size):
        input_size, hidden_size, num_layers = 3, 4, 2
        bidirectional = True
        num_directions = 2 if bidirectional else 1
        batch = len(lengths)
        max_length = max(lengths)

        x_leaf = torch.randn(max_length, batch, input_size, device=device,
                             dtype=dtype, requires_grad=True)
        lstm = nn.LSTM(input_size, hidden_size, bidirectional=bidirectional,
                       num_layers=num_layers, proj_size=proj_size).to(device, dtype)
        # Independent copy with identical weights, used for the per-sequence reference run.
        lstm2 = deepcopy(lstm).to(device, dtype)
        x = x_leaf

        if use_default_hiddens:
            hidden0 = None
        else:
            # With projections the hidden state has proj_size features; the cell state does not.
            h_width = proj_size if proj_size != 0 else hidden_size
            n_states = num_directions * num_layers
            hidden0 = (torch.randn(n_states, batch, h_width, device=device, dtype=dtype),
                       torch.randn(n_states, batch, hidden_size, device=device, dtype=dtype))

        # Reference: feed each sequence separately, truncated to its true length.
        seq_outs, seq_hiddens = [], []
        for b, length in enumerate(lengths):
            out, hid = lstm2(x[:length, b:b + 1], slice_batch(hidden0, b))
            seq_outs.append(pad(out, max_length))
            seq_hiddens.append(hid)
        seq_out = torch.cat(seq_outs, 1)
        seq_hidden = tuple(torch.cat(parts, 1) for parts in zip(*seq_hiddens))

        # Packed path under test.
        packed = rnn_utils.pack_padded_sequence(x, lengths, enforce_sorted=enforce_sorted)
        packed_out, packed_hidden = lstm(packed, hidden0)
        unpacked, unpacked_len = rnn_utils.pad_packed_sequence(packed_out)

        # Forward results must agree.
        self.assertEqual(packed_hidden, seq_hidden, atol=fwd_tol, rtol=0)
        self.assertEqual(unpacked, seq_out, atol=fwd_tol, rtol=0)
        self.assertEqual(unpacked_len, lengths, atol=fwd_tol, rtol=0)

        # Backward: capture the reference input gradient, reset it, then run the packed path.
        seq_out.sum().backward()
        grad_x = x_leaf.grad.data.clone()
        x_leaf.grad.data.zero_()
        unpacked.sum().backward()

        self.assertEqual(x_leaf.grad, grad_x, atol=fwd_tol, rtol=0)
        for p_packed, p_seq in zip(lstm.parameters(), lstm2.parameters()):
            self.assertEqual(p_packed.grad, p_seq.grad, atol=param_grad_tol, rtol=0)

    # (enforce_sorted, lengths)
    cases = [
        (True, [5]),
        (False, [5]),
        (True, [10, 10, 6, 2, 2, 1, 1]),
        (False, [10, 10, 6, 2, 2, 1, 1]),
        (False, [2, 1, 3, 2, 10, 5, 3]),
    ]

    for (enforce_sorted, seq_lens), use_default_hiddens, proj_size in product(cases, (True, False), (0, 2)):
        check_lengths(seq_lens, enforce_sorted, use_default_hiddens, proj_size)
| |
def _test_batchnorm_update_stats(self, device, dtype=torch.float):
    # Running statistics of BatchNorm1d must change on a training-mode forward pass
    # and stay untouched on an eval-mode forward pass.
    bn = nn.BatchNorm1d(3).to(device, dtype)
    batch = torch.rand(4, 3, device=device, dtype=dtype)

    def snapshot():
        # Copies of the three tracked buffers at this moment.
        return (bn.running_mean.clone(),
                bn.running_var.clone(),
                bn.num_batches_tracked.clone())

    # Training mode: mean/var move and the batch counter advances by one.
    mean_before, var_before, count_before = snapshot()
    bn(batch)
    self.assertNotEqual(mean_before, bn.running_mean)
    self.assertNotEqual(var_before, bn.running_var)
    self.assertEqual(count_before + 1, bn.num_batches_tracked)

    # Eval mode: nothing is updated.
    bn.eval()
    mean_before, var_before, count_before = snapshot()
    bn(batch)
    self.assertEqual(mean_before, bn.running_mean)
    self.assertEqual(var_before, bn.running_var)
    self.assertEqual(count_before, bn.num_batches_tracked)
| |
def test_batchnorm_update_stats(self, device):
    # Run the shared check once with the default backend settings ...
    self._test_batchnorm_update_stats(device)

    # ... and, on CUDA builds that have cuDNN, once more with cuDNN disabled.
    cudnn_available = self.device_type == 'cuda' and self.has_cudnn()
    if not cudnn_available:
        return
    with torch.backends.cudnn.flags(enabled=False):
        self._test_batchnorm_update_stats(device)
| |
@onlyCPU
@dtypes(torch.bfloat16, torch.float16)
def test_activations_bfloat16_half_cpu(self, device, dtype):
    # For each activation, compares forward output and input gradient computed in the
    # reduced-precision `dtype` against the same computation carried out in float32.
    def test_helper(fn, device, inp_dims, prec=None):
        # Reseed so every activation sees the same random input for a given shape.
        torch.manual_seed(37)
        # bfloat16/half compute
        fn = fn.to(dtype=dtype)
        input = torch.randn(inp_dims, dtype=dtype, device=device, requires_grad=True)
        out = fn(input)
        grad_input = torch.randn_like(out, dtype=dtype, device=device)
        out.backward(grad_input)

        # fp32 compute on the identical (upcast) input and upstream gradient
        input2 = input.detach().clone().float().requires_grad_(True)
        out2 = fn.float()(input2)
        grad_input2 = grad_input.detach().clone().float()
        out2.backward(grad_input2)

        self.assertEqual(out.dtype, dtype)
        self.assertEqual(input.grad.dtype, dtype)
        self.assertEqual(out, out2.to(dtype=dtype), atol=prec, rtol=prec)
        self.assertEqual(input.grad.data, input2.grad.data.to(dtype=dtype), atol=prec, rtol=prec)

    # (module factory, tolerance override). A factory is used so that every call gets a
    # fresh module: the helper converts the module's dtype in place.
    activations = [
        (torch.nn.LogSigmoid, None),
        (torch.nn.Hardsigmoid, None),
        (torch.nn.Hardshrink, None),
        (torch.nn.Softshrink, None),
        (torch.nn.Hardswish, None),
        (torch.nn.Softplus, None),
        (torch.nn.SiLU, None),
        (torch.nn.Hardtanh, None),
        (torch.nn.Mish, None),
        (torch.nn.ELU, None),
        (torch.nn.PReLU, None),
        (torch.nn.GLU, 1e-2),
        (lambda: torch.nn.Threshold(0.1, 20), None),
        (torch.nn.GELU, None),
        (torch.nn.LeakyReLU, None),
    ]

    shapes = [[1, 3, 1, 6], [1, 3, 1, 128], [1, 3, 256, 256]]
    for shape in shapes:
        for make_module, prec in activations:
            test_helper(make_module(), device, shape, prec=prec)
| |
@onlyCUDA
def test_activations_bfloat16(self, device):
    # Runs the shared bfloat16 check on a 5-element input for each listed activation.
    modules = (
        torch.nn.ReLU(),
        torch.nn.Threshold(0.1, 20),
        torch.nn.ELU(),
        torch.nn.Softplus(),
        torch.nn.Hardshrink(),
        torch.nn.Softshrink(),
        torch.nn.LeakyReLU(),
    )
    for module in modules:
        _test_bfloat16_ops(self, module, device, inp_dims=5, prec=1e-2)
| |
@onlyNativeDeviceTypes
def test_softmax_bfloat16(self, device):
    # Runs the shared bfloat16 check for Softmax along every dimension of a 4-D input.
    shape = (16, 33, 15, 16)
    for dim in range(4):
        _test_bfloat16_ops(self, torch.nn.Softmax(dim=dim), device, inp_dims=shape, prec=1e-2)
        # Same check with the input scaled up, which causes exp() to overflow.
        _test_bfloat16_ops(self, torch.nn.Softmax(dim=dim), device, inp_dims=shape, prec=0.05, scale_factor=1000.0)
| |
def test_nll_loss_mismatched_batch(self, device):
    # A target whose length differs from the input's batch size must be rejected.
    logits = torch.randn((10, 3), requires_grad=True, device=device)
    # A matching target would have size (10,); this one deliberately has 3 entries.
    short_target = torch.zeros((3,), dtype=torch.int64, device=device)
    with self.assertRaisesRegex(ValueError, 'Expected.*batch_size'):
        F.nll_loss(logits, short_target)
| |
def test_nll_loss_out_of_bounds_ignore_index(self, device):
    # An ignore_index outside the class range (255 with 3 classes) must be usable:
    # forward and backward are expected to complete without raising.
    logits = torch.randn(6, 3, requires_grad=True, device=device)
    target = torch.tensor([0, 1, 255, 0, 1, 2], dtype=torch.int64, device=device)
    for reduction in ('mean', 'none'):
        loss = F.nll_loss(logits, target, ignore_index=255, reduction=reduction)
        loss.sum().backward()
| |
def test_nll_loss_invalid_target_dim(self, device):
    # A 2-D target paired with a 2-D (N, C) input must raise.
    logits = torch.randn((10, 3), device=device)
    target_2d = torch.zeros((10, 2), dtype=torch.int64, device=device)
    with self.assertRaisesRegex(RuntimeError, "1D target tensor expected"):
        F.nll_loss(logits, target_2d)
| |
def test_nll_loss_invalid_weights(self, device):
    # Weight tensors whose shape is not (C,) must be rejected with a descriptive error.
    logits = torch.randn((10, 3), device=device)
    target = torch.empty(10, dtype=torch.int64, device=device).random_(0, 3)
    expected_msg = "weight tensor should be defined either for all 3 classes or no classes"
    # Wrong element count, then wrong dimensionality.
    for bad_shape in ((4,), (1, 3)):
        bad_weight = torch.randn(*bad_shape, device=device)
        with self.assertRaisesRegex(RuntimeError, expected_msg):
            F.nll_loss(logits, target, weight=bad_weight)
| |
# Ref: https://github.com/pytorch/pytorch/issues/85005
@onlyCUDA
@largeTensorTest("120GB", "cpu")
@largeTensorTest("45GB", "cuda")
@parametrize_test("reduction", ("none", "mean", "sum"))
def test_nll_loss_large_tensor(self, device, reduction):
    # Compares nll_loss (and, for reduced losses, its input gradient) between `device`
    # and CPU on a 2**16 x (2**16 + 1) float32 input.
    shape = [int(2 ** 16), int(2 ** 16) + 1]

    input = torch.randn(shape, device=device, dtype=torch.float32, requires_grad=True)
    # Labels are drawn from [0, shape[0]), which lies inside the shape[1] classes.
    labels = torch.randint(shape[0], (shape[0],), dtype=torch.long, device=device)

    out = F.nll_loss(input, labels, reduction=reduction)

    # Build CPU leaf copies without recording the device-to-host copy in autograd.
    with torch.no_grad():
        input_cpu = input.cpu().float().requires_grad_()
        labels_cpu = labels.cpu()
    out_cpu = F.nll_loss(input_cpu, labels_cpu, reduction=reduction)
    # workaround to reduce memory usage vs. self.assertEqual, see #84944
    rtol, atol = torch.testing._comparison.get_tolerances(torch.float32, rtol=None, atol=None)
    if reduction == "sum":
        # The summed loss is compared with widened tolerances; the originals are
        # restored below before the gradient comparison.
        orig_rtol, orig_atol = rtol, atol
        rtol, atol = 7 * rtol, 3 * atol
    with torch.no_grad():
        self.assertTrue(torch.allclose(out.cpu(), out_cpu, rtol=rtol, atol=atol))
    if reduction == "sum":
        rtol, atol = orig_rtol, orig_atol

    # backward() without an explicit gradient needs a scalar, so only reduced losses
    # are differentiated.
    if reduction != "none":
        out.backward()
        out_cpu.backward()
        with torch.no_grad():
            self.assertTrue(torch.allclose(input.grad.cpu(), input_cpu.grad, rtol=rtol, atol=atol))
| |
# Ref: https://github.com/pytorch/pytorch/issues/108345
@onlyCUDA
@largeTensorTest("20GB", "cpu")
@largeTensorTest("20GB", "cuda")
@parametrize_test("reduction", ("none", "mean", "sum"))
def test_cross_entropy_64bit(self, device, reduction):
    # Compares cross_entropy between `device` and CPU for logits with
    # 190 * 229000 * 50 = 2,175,500,000 elements, i.e. more than 2**31.
    labels = torch.zeros(190, 50, dtype=torch.long, device=device)
    logits = torch.ones(190, 229000, 50, dtype=torch.float, device=device)
    # Forward the parametrized reduction so each variant exercises a different mode.
    loss = torch.nn.functional.cross_entropy(logits, labels, reduction=reduction)
    loss_cpu = torch.nn.functional.cross_entropy(logits.cpu(), labels.cpu(), reduction=reduction)
    self.assertTrue(torch.allclose(loss_cpu, loss.cpu(), rtol=1e-4, atol=1e-4))
| |
def _nll_loss_helper(self, input_size, reduction, expected, device):
    # Evaluates nll_loss on a random input of shape `input_size` (N, C, d1, ...) with a
    # random class-index target of shape (N, d1, ...), compares the result with
    # `expected`, and checks that backward yields a gradient shaped like the input.
    batch, num_channels, *spatial = input_size
    input = torch.rand(input_size, requires_grad=True, device=device)
    target = torch.randint(num_channels, (batch, *spatial), device=device)

    output = F.nll_loss(input, target, reduction=reduction)
    self.assertEqual(output, expected, exact_dtype=False)

    output.sum().backward()
    self.assertEqual(input.grad.size(), input.size())
| |
def test_nll_loss_empty_tensor_reduction_none(self, device):
    # With reduction="none", an input with a zero-sized dimension yields an empty loss
    # whose shape is the input shape without the class dimension.
    # (input shape, expected output shape)
    cases = [
        ([0, 3], [0]),
        ([0, 3, 5, 7], [0, 5, 7]),
        ([2, 3, 0, 7], [2, 0, 7]),
        ([2, 3, 5, 0], [2, 5, 0]),
        ([2, 3, 5, 7, 0], [2, 5, 7, 0]),
    ]
    for input_size, output_size in cases:
        expected = torch.empty(output_size, device=device)
        self._nll_loss_helper(input_size, "none", expected, device)
| |
@expectedFailureMPS  # RuntimeError: [srcBuf length] > 0 INTERNAL ASSERT FAILED https://github.com/pytorch/pytorch/issues/134431
def test_nll_loss_empty_tensor_reduction_mean(self, device):
    # With reduction="mean", an input with a zero-sized dimension is expected to give NaN.
    expected = torch.tensor(float('nan'), device=device)
    empty_shapes = ([0, 3], [0, 3, 5, 7], [2, 3, 0, 7], [2, 3, 5, 0], [2, 3, 5, 7, 0])
    for input_size in empty_shapes:
        self._nll_loss_helper(input_size, "mean", expected, device)
| |
@expectedFailureMPS  # RuntimeError: [srcBuf length] > 0 INTERNAL ASSERT FAILED https://github.com/pytorch/pytorch/issues/134431
def test_nll_loss_empty_tensor_reduction_sum(self, device):
    # With reduction="sum", an input with a zero-sized dimension is expected to give 0.
    expected = torch.tensor(0, device=device)
    empty_shapes = ([0, 3], [0, 3, 5, 7], [2, 3, 0, 7], [2, 3, 5, 0], [2, 3, 5, 7, 0])
    for input_size in empty_shapes:
        self._nll_loss_helper(input_size, "sum", expected, device)
| |
@expectedFailureMPS  # AssertionError: Expected nan but got 0.0.
def test_nll_loss_total_weight_is_zero(self, device):
    # When every class weight is zero: "sum" gives 0, "mean" gives NaN and "none"
    # gives an all-zero tensor shaped like the target.
    def check(input_size):
        batch, num_channels, *spatial = input_size
        input = torch.ones(input_size, requires_grad=True, device=device)
        target = torch.zeros((batch, *spatial), dtype=torch.long, device=device)
        zero_weight = torch.zeros([num_channels], device=device)

        summed = F.nll_loss(input, target, zero_weight, reduction="sum")
        self.assertEqual(summed.item(), 0.)
        averaged = F.nll_loss(input, target, zero_weight, reduction="mean")
        self.assertEqual(averaged.item(), float("nan"))
        per_element = F.nll_loss(input, target, zero_weight, reduction="none")
        self.assertEqual(per_element, torch.zeros(target.shape, device=device))

    for input_size in ([2, 3], [2, 3, 5, 7], [2, 3, 5, 7, 9]):
        check(input_size)
| |
@expectedFailureMPS  # AssertionError: Expected nan but got 0.0.
def test_nll_loss_all_ignored(self, device):
    # When every target equals ignore_index: "sum" gives 0, "mean" gives NaN and
    # "none" gives an all-zero tensor shaped like the target.
    def check(input_size):
        batch, _, *spatial = input_size
        input = torch.ones(input_size, device=device)
        # All-zero target combined with ignore_index=0 means nothing contributes.
        target = torch.zeros((batch, *spatial), dtype=torch.long, device=device)

        summed = F.nll_loss(input, target, ignore_index=0, reduction="sum")
        self.assertEqual(summed.item(), 0)
        averaged = F.nll_loss(input, target, ignore_index=0, reduction="mean")
        self.assertEqual(averaged.item(), float("nan"))
        per_element = F.nll_loss(input, target, ignore_index=0, reduction="none")
        self.assertEqual(per_element, torch.zeros(target.shape, device=device))

    for input_size in ([2, 3], [2, 3, 5, 7], [2, 3, 5, 7, 9]):
        check(input_size)
| |
def test_nll_loss_byte_target_matches_long(self, device):
    # NLLLoss must give the same loss and input gradient for uint8 and int64 targets.
    N, C = 10, 4
    input = torch.randn(N, C, device=device, requires_grad=True)
    target = torch.empty(N, dtype=torch.long, device=device).random_(0, C)

    def loss_and_grad(reduction, target_dtype):
        # Fresh leaf sharing the input's data, so gradients do not accumulate across runs.
        leaf = input.detach()
        leaf.requires_grad_()

        log_probs = F.log_softmax(leaf, dim=-1)
        criterion = nn.NLLLoss(reduction=reduction)
        result = criterion(log_probs, target.to(target_dtype))
        result.sum().backward()
        return result, leaf.grad

    for reduction in ("none", "mean", "sum"):
        result_long, grad_long = loss_and_grad(reduction, torch.long)
        result_byte, grad_byte = loss_and_grad(reduction, torch.uint8)
        self.assertEqual(result_long, result_byte)
        self.assertEqual(grad_long, grad_byte)
| |
@onlyCUDA
@skipIfRocm
@dtypes(torch.float16, torch.float32)
def test_cross_entropy_loss_2d_out_of_bounds_class_index(self, device, dtype):
    # Test for issue #117532
    # Run in a different process to prevent the device-side assert from affecting other tests
    #
    # The generated script builds a (10, 3, 768, 1024) prediction and an all-zero label
    # map, sets one label to 254 (not a valid class for 3 classes and not equal to
    # ignore_index=255), and calls F.cross_entropy followed by a CUDA synchronize.
    # `device` and `dtype` are interpolated into the script text via the f-string.
    stderr = TestCase.runWithPytorchAPIUsageStderr(f"""\
#!/usr/bin/env python3

import torch
import torch.nn.functional as F
from torch.testing._internal.common_utils import (run_tests, TestCase)

class TestThatContainsCUDAAssert(TestCase):
    def test_cross_entropy_loss_2d_out_of_bounds_class_index(self):
        device = '{str(device)}'
        dtype = {str(dtype).strip("'")}
        ignore_index = 255
        b = 10
        n_classes = 3
        w = 768
        h = 1024
        pred = torch.randn(b, n_classes, w, h, dtype=dtype, device=device)
        labels = torch.zeros(b, w, h, dtype=torch.int64, device=device)
        labels[5, 200, 200] = ignore_index
        # Set invalid class index
        labels[5, 200, 200] = 254

        x = F.cross_entropy(
            pred, labels, reduction="none", ignore_index=ignore_index
        )
        torch.cuda.synchronize()


if __name__ == '__main__':
    run_tests()
    """)
    # The child process is expected to report the device-side assert on stderr.
    self.assertIn('CUDA error: device-side assert triggered', stderr)
| |
| |
| |
def test_cross_entropy_loss_prob_target_all_reductions(self, device):
    # Checks CrossEntropyLoss with class-probability-style targets against the reference
    # implementation, for 0 to 4 extra dimensions, every reduction, with/without weights.
    N, C = 5, 4
    for k in range(5):
        other_dims = [torch.randint(2, 5, size=(1,)).item() for _ in range(k)]
        shape = (N, C, *other_dims)
        input = torch.randn(shape, device=device, requires_grad=True)
        target = torch.randn(shape, device=device, requires_grad=True)
        weight = torch.randn(C, device=device).abs()

        reference = loss_reference_fns['CrossEntropyLoss']
        for reduction, w in product(['none', 'mean', 'sum'], [None, weight]):
            criterion = torch.nn.CrossEntropyLoss(weight=w, reduction=reduction)
            actual = criterion(input, target)
            expected = reference(input, target, reduction=reduction, weight=w)
            self.assertEqual(actual, expected)
| |
def test_cross_entropy_loss_prob_target_unit_weights(self, device):
    # With class-probability-style targets, an all-ones weight vector must give the same
    # loss as passing no weight, for 0 to 4 extra dimensions and every reduction.
    N, C = 5, 4
    for k in range(5):
        other_dims = [torch.randint(2, 5, size=(1,)).item() for _ in range(k)]
        shape = (N, C, *other_dims)
        input = torch.randn(shape, device=device, requires_grad=True)
        target = torch.randn(shape, device=device, requires_grad=True)

        for reduction in ('none', 'mean', 'sum'):
            unweighted = torch.nn.CrossEntropyLoss(reduction=reduction)
            ones = torch.ones(C, device=device, dtype=target.dtype)
            unit_weighted = torch.nn.CrossEntropyLoss(weight=ones, reduction=reduction)
            self.assertEqual(unweighted(input, target), unit_weighted(input, target))
| |
@parametrize_test('reduction', ['none', 'mean', 'sum'])
@parametrize_test('weighted', [False, True])
def test_cross_entropy_loss_prob_target_no_batch_dim(self, device, reduction, weighted):
    # An unbatched (C,) input/target pair must give the same loss as the same pair
    # with a leading batch dimension of size 1.
    C = 5
    input = torch.randn(C, device=device).log_softmax(dim=-1)
    target = torch.randn(C, device=device).softmax(dim=-1)
    class_weight = torch.randn(C, device=device) if weighted else None
    criterion = nn.CrossEntropyLoss(reduction=reduction, weight=class_weight)

    unbatched = criterion(input, target)
    batched = criterion(input.unsqueeze(0), target.unsqueeze(0))
    # With reduction 'none' the batched result keeps the size-1 batch dim; drop it.
    expected = batched.squeeze(0) if reduction == 'none' else batched
    self.assertEqual(unbatched, expected)
| |
def test_cross_entropy_loss_index_target_unit_weights(self, device):
    # With class-index targets, an all-ones weight vector must give the same loss as
    # passing no weight, for 0 to 4 extra dimensions and every reduction.
    N, C = 5, 4
    for k in range(5):
        other_dims = [torch.randint(2, 5, size=(1,)).item() for _ in range(k)]
        input = torch.randn(N, C, *other_dims, device=device, requires_grad=True)
        target = torch.empty(N, *other_dims, dtype=torch.long, device=device).random_(0, C)

        for reduction in ('none', 'mean', 'sum'):
            unweighted = torch.nn.CrossEntropyLoss(reduction=reduction)
            ones = torch.ones(C, device=device, dtype=input.dtype)
            unit_weighted = torch.nn.CrossEntropyLoss(weight=ones, reduction=reduction)
            self.assertEqual(unweighted(input, target), unit_weighted(input, target))
| |
def test_cross_entropy_loss_one_hot_target(self, device):
    # The loss computed from class indices must match the loss computed from the
    # equivalent one-hot probability target, for 0 to 4 extra dimensions.
    # Test with k-dimensional loss.
    for k in range(5):
        N, C = 5, 4
        other_dims = [torch.randint(2, 5, size=(1,)).item() for _ in range(k)]
        input = torch.randn(N, C, *other_dims, device=device, requires_grad=True)
        target = torch.empty(N, *other_dims, dtype=torch.long, device=device).random_(0, C)
        weight = torch.randn(C, device=device).abs()

        # Get one-hot representation of the target.
        target_one_hot = F.one_hot(target, num_classes=C).to(input.dtype)
        # Need to put the C dim at index 1.
        target_one_hot = target_one_hot.permute(0, -1, *range(1, target_one_hot.dim() - 1))

        for reduction, w in product(['none', 'mean', 'sum'], [None, weight]):
            # Skip this case for now because soft and hard label CE are not consistent
            # in the way they apply class weights (see issue #61309).
            # The check must use the per-iteration `w`: `weight` is always a tensor, so
            # testing it would also skip the unweighted 'mean' case.
            if reduction == 'mean' and w is not None:
                continue

            # Ensure loss computed with class indices matches loss
            # computed with one-hot class probs.
            m = torch.nn.CrossEntropyLoss(weight=w, reduction=reduction)
            output = m(input, target)
            output_one_hot = m(input, target_one_hot)
            self.assertEqual(output, output_one_hot)
| |
def test_cross_entropy_label_smoothing_errors(self, device):
    # label_smoothing=1.2 must be rejected for both an integer-valued target
    # and a floating-point target.
    N, C = 3, 4
    index_style_args = (torch.randn((N, C), device=device), torch.arange(0, C, device=device))
    prob_style_args = (torch.randn((N, C), device=device), torch.randn(N, C, device=device))
    for inp, tgt in (index_style_args, prob_style_args):
        criterion = nn.CrossEntropyLoss(label_smoothing=1.2)
        with self.assertRaisesRegex(RuntimeError,
                                    r"label_smoothing must be between 0\.0"):
            criterion(inp, tgt)
| |
@expectedFailureMPS  # TypeError: the MPS framework doesn't support float64
@set_default_dtype(torch.double)
def test_cross_entropy_label_smoothing_consistent_index_target_and_probs(self, device):
    # Label smoothing applied to class-index targets must match an un-smoothed loss on
    # probability targets that were smoothed by hand.
    N, C = 10, 4
    cases = product(range(5), ['none', 'mean', 'sum'], [0.05, 0.15])

    for k, reduction, label_smoothing in cases:
        other_dims = [torch.randint(2, 5, size=(1,)).item() for _ in range(k)]
        input = torch.randn(N, C, *other_dims, device=device, requires_grad=True)
        target = torch.empty(N, *other_dims, dtype=torch.long, device=device).random_(0, C)

        # One-hot encode the target and move the class dim to index 1.
        one_hot = F.one_hot(target, num_classes=C)
        one_hot = one_hot.permute(0, -1, *range(1, one_hot.dim() - 1))
        is_true_class = (one_hot == 1)

        # Construct the target probability that should give the same result as
        # label_smoothing: y_k^ls = y_k * (1 - label_smoothing) + label_smoothing / n_classes
        on_value = 1 - label_smoothing + label_smoothing / C
        off_value = label_smoothing / C
        target_proba = one_hot.to(dtype=input.dtype)
        target_proba = target_proba.masked_fill(is_true_class, on_value)
        target_proba = target_proba.masked_fill(~is_true_class, off_value)

        plain_loss = nn.CrossEntropyLoss(reduction=reduction)
        output_with_prob = plain_loss(input, target_proba)

        smoothing_loss = nn.CrossEntropyLoss(
            reduction=reduction, label_smoothing=label_smoothing)
        output_with_index = smoothing_loss(input, target)

        self.assertEqual(output_with_prob, output_with_index,
                         rtol=1e-07, atol=1e-05)
| |
def test_cross_entropy_label_smoothing_with_probs(self, device):
    # Label smoothing applied to probability targets must match an un-smoothed loss on
    # targets that were smoothed by hand, for 0 to 4 extra dimensions.
    N, C = 10, 4
    ks = range(5)
    reductions = ['none', 'mean', 'sum']
    label_smoothings = [0.05, 0.15]

    # Test with k-dimensional loss.
    for k, label_smoothing in product(ks, label_smoothings):
        other_dims = [torch.randint(2, 5, size=(1,)).item() for _ in range(k)]
        input = torch.randn(N, C, *other_dims, device=device, requires_grad=True)
        # softmax (not log_softmax) so the target is a valid distribution over the
        # class dimension: non-negative entries summing to one.
        target = F.softmax(torch.randn(N, C, *other_dims, device=device), dim=1)

        for reduction in reductions:
            # use with label_smoothing
            loss = nn.CrossEntropyLoss(reduction=reduction, label_smoothing=label_smoothing)
            output_with_smoothing = loss(input, target)

            # manually smoothing target
            # class_proba^ls = class_proba * (1 - label_smoothing) +
            #                  label_smoothing / n_classes
            target_with_smoothing = target * (1 - label_smoothing) + label_smoothing / C
            loss = nn.CrossEntropyLoss(reduction=reduction)
            output_with_manual_smoothing = loss(input, target_with_smoothing)

            self.assertEqual(output_with_smoothing, output_with_manual_smoothing)
| |
| |
def test_cross_entropy_label_smoothing_weight_ignore_indices(self, device):
    # With label smoothing (and optional class weights), a sample whose target equals
    # ignore_index must not influence the loss, for the default, a negative and a
    # positive ignore_index.
    wgt = torch.tensor([0.3, 0.6], device=device)
    # The two inputs differ only in row 0, which is the ignored sample in every scenario.
    inp1 = torch.tensor([[0.3, 0.4], [1, 2]], device=device)
    inp2 = torch.tensor([[0.3, 0.6], [1, 2]], device=device)

    # (extra CrossEntropyLoss kwargs, target whose first entry is the ignore_index)
    scenarios = [
        ({}, torch.tensor([-100, 1], device=device)),                  # default ignore_index
        ({'ignore_index': -2}, torch.tensor([-2, 1], device=device)),  # negative ignore_index
        ({'ignore_index': 2}, torch.tensor([2, 1], device=device)),    # positive ignore_index
    ]

    def check_equal(loss, inp_targ_1, inp_targ_2):
        first_inp, first_targ = inp_targ_1
        second_inp, second_targ = inp_targ_2
        self.assertEqual(loss(first_inp, first_targ), loss(second_inp, second_targ))

    for reduction, label_smoothing, weight in product(['none', 'sum', 'mean'], [0.05, 0.15], (None, wgt)):
        for extra_kwargs, targ in scenarios:
            loss = nn.CrossEntropyLoss(reduction=reduction,
                                       label_smoothing=label_smoothing,
                                       weight=weight,
                                       **extra_kwargs)
            check_equal(loss, (inp1, targ), (inp2, targ))
            if reduction != 'none':
                # Check that we correctly tally the denominator for `mean`
                # i.e. we don't count the ignored_idx at all.
                check_equal(loss, (inp1, targ), (inp2[1:], targ[1:]))
| |
| # Ref: https://github.com/pytorch/pytorch/issues/85005 |
@onlyCUDA
@largeTensorTest("45GB", "cpu")
@largeTensorTest("70GB", "cuda")
@parametrize_test("reduction", ("none", "mean", "sum"))
def test_cross_entropy_large_tensor(self, device, reduction):
    # Compares cross_entropy (and, for reduced losses, the logits gradient) between
    # CUDA and CPU on a 2**16 x (2**16 + 1) float32 input with all-zero labels.
    logits = torch.randn(int(2 ** 16), int(2 ** 16) + 1, dtype=torch.float32, device='cuda', requires_grad=True)
    labels = torch.zeros(logits.size(0), dtype=torch.long, device='cuda')
    loss = F.cross_entropy(logits, labels, reduction=reduction)
    # backward() without an explicit gradient needs a scalar, so only reduced losses
    # are differentiated.
    if reduction != "none":
        loss.backward()

    # Build CPU leaf copies without recording the device-to-host copy in autograd.
    with torch.no_grad():
        logits_cpu = logits.cpu().detach().requires_grad_()
        labels_cpu = labels.cpu().detach()
    loss_cpu = F.cross_entropy(logits_cpu, labels_cpu, reduction=reduction)
    if reduction != "none":
        loss_cpu.backward()

    # workaround to reduce memory usage vs. self.assertEqual, see #84944
    rtol, atol = torch.testing._comparison.get_tolerances(torch.float32, rtol=None, atol=None)
    self.assertTrue(torch.allclose(loss.cpu(), loss_cpu, rtol=rtol, atol=atol))
    if reduction != "none":
        self.assertTrue(torch.allclose(logits.grad.cpu(), logits_cpu.grad, rtol=rtol, atol=atol))
| |
def test_smoothl1loss_backward_zero_beta(self, device):
    # With beta=0 and a target identical to the input, no gradient entry may exceed 1
    # in absolute value.
    input = torch.randn(300, 256, requires_grad=True, device=device)
    same_as_input = input.detach()

    F.smooth_l1_loss(input, same_as_input, beta=0.0, reduction='sum').backward()

    largest_grad = input.grad.abs().max().item()
    self.assertLessEqual(largest_grad, 1.0)
| |
def test_softshrink_negative(self, device):
    # Softshrink built with a negative lambda must raise when applied.
    x = torch.randn(5, device=device, requires_grad=True)
    shrink = torch.nn.Softshrink(-1)
    expected_msg = r'lambda must be greater or equal to 0, but found to be -1\.'
    with self.assertRaisesRegex(RuntimeError, expected_msg):
        shrink(x)
| |
@expectedFailureMPS  # TypeError: the MPS framework doesn't support float64
def test_fold(self, device):
    # Gradient checks for F.fold on double inputs across several seeds, plus a
    # bfloat16-vs-float32 comparison when running on CPU.
    def fold(x):
        return F.fold(x, output_size=(4, 5), kernel_size=(2, 2))

    def compare_with_float(fn, source, low_dtype):
        # Run `fn` in `low_dtype` and in float32 on the same (already rounded) values.
        low_in = source.detach().clone().to(dtype=low_dtype).requires_grad_(True)
        ref_in = low_in.detach().clone().float().requires_grad_(True)
        low_out = fn(low_in)
        low_out.sum().backward()
        ref_out = fn(ref_in)
        ref_out.sum().backward()
        self.assertEqual(low_out.dtype, low_dtype)
        self.assertEqual(low_in.grad.dtype, low_dtype)
        self.assertEqual(low_out, ref_out.to(dtype=low_dtype), atol=0.05, rtol=0)
        self.assertEqual(low_in.grad, ref_in.grad.to(dtype=low_dtype))

    for seed in (44, 83, 71, 25, 999):
        torch.manual_seed(seed)
        x = torch.randn(1, 12, 12, device=device, requires_grad=True, dtype=torch.double)
        gradcheck(fold, [x], check_forward_ad=True)
        gradgradcheck(fold, [x], check_fwd_over_rev=True)
        if device == 'cpu':
            compare_with_float(fold, x, torch.bfloat16)
| |
| |
def test_logsigmoid_out(self, device):
    # this isn't actually documented, but was broken previously:
    # https://github.com/pytorch/pytorch/issues/36499
    # F.logsigmoid with out= must give the same values as the plain call, both for an
    # empty `out` tensor and for a non-contiguous one.
    x = torch.randn(2, 3, device=device).t()

    zero_sized_out = torch.randn(0, device=device)
    self.assertEqual(F.logsigmoid(x), F.logsigmoid(x, out=zero_sized_out))

    transposed_out = torch.randn(2, 3, device=device).t()
    self.assertEqual(F.logsigmoid(x), F.logsigmoid(x, out=transposed_out))
| |
# Check that clip_grad_norm_ raises an error if the total norm of the
# parameters' gradients is non-finite
@expectedFailureMPS  # TypeError: the MPS framework doesn't support float64
def test_clip_grad_norm_error_if_nonfinite(self, device):
    """Exercise ``clip_grad_norm_`` with ``error_if_nonfinite`` on and off.

    For every entry of ``test_cases`` the gradients are built from the
    listed scalars and clipped under each listed norm order. A
    ``RuntimeError`` is expected only when the total norm is expected to be
    non-finite *and* ``error_if_nonfinite=True``; in that case the
    gradients must be left unchanged.
    """
    norms_pos = [0.1, 1, 2, 3.5, inf]
    norms_neg = [-0.1, -1, -2, -3.5]
    norms_except_0 = norms_pos + norms_neg
    norms_all = norms_except_0 + [0]

    # Each entry in test_cases has the following values, in this order:
    #
    # grad_only_one_elem        If True, only one element of the parameter's
    #                           gradient is set to the scalar grad, and the
    #                           rest of the elements are 0. If False, all grad
    #                           elements are equal to the scalar.
    #
    # prefix_finite_grad_param  If True, prefix a parameter that has a grad
    #                           of 1.
    #
    # scalars                   Scalars to use as the parameter's grad, through
    #                           multiplication
    #
    # norms_nonfinite           Norm types that should produce nonfinite total norm
    #
    # norms_finite              Norm types that should produce finite total norm
    test_cases = [
        # Test errors from an infinite grad
        (False, False, [inf, -inf], norms_except_0, [0]),
        (False, True, [inf, -inf], norms_pos, norms_neg + [0]),
        (True, False, [inf, -inf], norms_pos, norms_neg + [0]),
        (True, True, [inf, -inf], norms_pos, norms_neg + [0]),

        # Test errors from a NaN grad
        (False, False, [nan], norms_except_0, [0]),
        (False, True, [nan], norms_except_0, [0]),
        (True, False, [nan], norms_except_0, [0]),
        (True, True, [nan], norms_except_0, [0]),

        # Test a grad that should never error
        (False, False, [2e22, -2e22], [], norms_all),
        (False, True, [2e22, -2e22], [], norms_all),
        (True, False, [2e22, -2e22], [], norms_all),
        (True, True, [2e22, -2e22], [], norms_all),

        # Test a grad that will overflow to inf for only some norm orders
        (False, False, [2e200, -2e200], [3.5, 2, -2, -3.5], [inf, 1, 0.1, 0, -1, -0.1]),
        (False, True, [2e200, -2e200], [3.5, 2], norms_neg + [inf, 1, 0.1, 0]),
        (True, False, [2e200, -2e200], [3.5, 2], norms_neg + [inf, 1, 0.1, 0]),
        (True, True, [2e200, -2e200], [3.5, 2], norms_neg + [inf, 1, 0.1, 0]),
    ]

    def gen_parameters(scalar, grad_only_one_elem, prefix_finite_grad_param):
        """Build the parameter list whose ``.grad`` is derived from ``scalar``."""
        param = torch.ones(10, dtype=torch.float64, device=device, requires_grad=True)

        # Backprop through a multiplication so the gradient equals ``scalar``
        # (for a single element, or for every element).
        if grad_only_one_elem:
            param[1].mul(scalar).sum().backward()
        else:
            param.mul(scalar).sum().backward()

        if prefix_finite_grad_param:
            prefix_param = torch.ones(1, dtype=torch.float64, device=device, requires_grad=True)
            prefix_param.mul(1).sum().backward()
            parameters = [prefix_param, param]
        else:
            parameters = [param]

        return parameters

    def run_test_case(norm_type, error_if_nonfinite, scalar, grad_only_one_elem, prefix_finite_grad_param, is_norm_nonfinite):
        """Clip one generated parameter set and check the error behaviour."""
        # NOTE: this must be a single string. A stray comma after the first
        # fragment previously turned it into a tuple, which garbled the
        # failure message.
        msg = (
            f'norm_type: {norm_type}, '
            f'error_if_nonfinite: {error_if_nonfinite}, '
            f'scalar: {scalar}, '
            f'grad_only_one_elem: {grad_only_one_elem}, '
            f'prefix_finite_grad_param: {prefix_finite_grad_param}, '
            f'is_norm_nonfinite: {is_norm_nonfinite}')

        parameters = gen_parameters(scalar, grad_only_one_elem, prefix_finite_grad_param)

        # Should only throw an error if the total norm is expected to be
        # nonfinite and `error_if_nonfinite=True`
        if is_norm_nonfinite and error_if_nonfinite:
            error_msg = f'The total norm of order {float(norm_type)} for gradients'

            grads_before = [p.grad.clone() for p in parameters]

            with self.assertRaisesRegex(RuntimeError, error_msg, msg=msg):
                clip_grad_norm_(parameters, 1, norm_type=norm_type, error_if_nonfinite=True)

            # Grad should not change if error is thrown
            grads_after = [p.grad for p in parameters]
            self.assertEqual(grads_before, grads_after, msg=msg)
        else:
            clip_grad_norm_(parameters, 1, norm_type=norm_type, error_if_nonfinite=error_if_nonfinite)

    for grad_only_one_elem, prefix_finite_grad_param, scalars, norms_nonfinite, norms_finite in test_cases:
        for error_if_nonfinite in [False, True]:
            for norm_type, scalar in product(norms_nonfinite, scalars):
                run_test_case(norm_type, error_if_nonfinite, scalar, grad_only_one_elem, prefix_finite_grad_param, True)

            for norm_type, scalar in product(norms_finite, scalars):
                run_test_case(norm_type, error_if_nonfinite, scalar, grad_only_one_elem, prefix_finite_grad_param, False)
| |
@onlyCUDA
@deviceCountAtLeast(2)
@parametrize_test('foreach', (False, True))
def test_clip_grad_norm_multi_device(self, devices, foreach):
    """Clipping a model split over two devices must match a one-device reference."""
    class TwoLayerNet(nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.layer1 = nn.Linear(10, 10)
            self.layer2 = nn.Linear(10, 10)

    primary, secondary = devices[0], devices[1]

    # Model under test: one layer per device.
    test_model = TwoLayerNet()
    test_model.layer1.to(primary)
    test_model.layer2.to(secondary)
    # Reference: everything on the first device.
    ref_model = TwoLayerNet().to(primary)

    for norm_type in [2., math.inf]:
        # Give every parameter of both models an all-ones gradient.
        for model in (test_model, ref_model):
            for param in model.parameters():
                param.grad = torch.ones_like(param)

        norm = clip_grad_norm_(test_model.parameters(), 0.5, norm_type=norm_type, foreach=foreach)
        expected = clip_grad_norm_(ref_model.parameters(), 0.5, norm_type=norm_type, foreach=foreach)
        self.assertEqual(norm, expected)

        paired = zip(test_model.parameters(), ref_model.parameters())
        for actual_param, ref_param in paired:
            self.assertEqual(actual_param.grad.to(primary), ref_param.grad)
| |
| def test_elu_inplace_overlap(self, device): |
| dtype = torch.bfloat16 if device != 'mps:0' else torch.float16 |
| x = torch.randn((1, 6), dtype=dtype, device=device).expand((6, 6)) |
| with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): |
| F.elu(x, inplace=True) |
| with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): |
| F.elu_(x) |
| |
# Merge into OpInfo?
@onlyNativeDeviceTypes
def test_elu_inplace_with_neg_alpha(self, device):
    """Backward through in-place ELU/CELU with a negative alpha must raise."""
    for inplace_fn in (torch.nn.functional.elu_, torch.nn.functional.celu_):
        leaf = torch.tensor([-1., 1.], device=device, requires_grad=True)
        activated = inplace_fn(leaf.clone(), alpha=-2)
        upstream = torch.ones(2, device=device)
        with self.assertRaisesRegex(RuntimeError, "call out-of-place version"):
            activated.backward(upstream)
| |
@expectedFailureMeta  # https://github.com/pytorch/pytorch/issues/54897
def test_hardswish_inplace_overlap(self, device):
    """In-place hardswish on an expanded (overlapping) tensor must raise."""
    base = torch.randn((1, 6), device=device)
    overlapping = base.expand((6, 6))
    with self.assertRaisesRegex(RuntimeError, 'unsupported operation'):
        F.hardswish(overlapping, inplace=True)
| |
| def test_silu_inplace_overlap(self, device): |
| x = torch.randn((1, 6), device=device).expand((6, 6)) |
| with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): |
| F.silu(x, inplace=True) |
| |
@onlyNativeDeviceTypes
def test_mish_inplace_overlap(self, device):
    """In-place Mish on an expanded (overlapping) tensor must raise."""
    base = torch.randn((1, 6), device=device)
    overlapping = base.expand((6, 6))
    with self.assertRaisesRegex(RuntimeError, 'unsupported operation'):
        F.mish(overlapping, inplace=True)
| |
| def test_softplus_inplace_overlap(self, device): |
| x = torch.randn((1, 6), device=device).expand((6, 6)) |
| with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): |
| F.softplus(x, out=x) |
| |
@expectedFailureMPS  # TypeError: the MPS framework doesn't support float64
def test_softplus_low_threshold(self, device):
    """Ensure gradients are computed correctly with a low threshold."""
    model = torch.nn.Softplus(threshold=1).double()
    # 0.9 lies just below the threshold of 1 configured above.
    sample = torch.tensor(0.9, device=device, dtype=torch.double,
                          requires_grad=True)
    # Plain forward call kept as a smoke check; its value is not inspected.
    model(sample)
    # Assert the gradcheck result explicitly, as the other gradcheck-based
    # tests in this file do.
    self.assertTrue(torch.autograd.gradcheck(model, sample))
| |
| def test_softshrink_inplace_overlap(self, device): |
| x = torch.randn((1, 6), device=device).expand((6, 6)) |
| with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): |
| F.softshrink(x, out=x) |
| |
| def test_leaky_relu_inplace_overlap(self, device): |
| x = torch.randn((1, 6), device=device).expand((6, 6)) |
| with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): |
| F.leaky_relu(x, inplace=True) |
| with self.assertRaisesRegex(RuntimeError, 'unsupported operation'): |
| F.leaky_relu_(x) |
| |
# Merge into OpInfo?
@expectedFailureMPS  # NotImplementedError: aten::rrelu_with_noise_ https://github.com/pytorch/pytorch/issues/77764
def test_leaky_relu_inplace_with_neg_slope(self, device):
    """Backward through in-place leaky ReLU / RReLU with negative slopes must raise."""
    inplace_variants = (
        lambda t: torch.nn.functional.leaky_relu_(t, -2),
        lambda t: torch.nn.functional.rrelu_(t, -5.0, 1.0),
    )
    for apply_inplace in inplace_variants:
        leaf = torch.tensor([-1., 1.], device=device, requires_grad=True)
        activated = apply_inplace(leaf.clone())
        upstream = torch.ones(2, device=device)
        with self.assertRaisesRegex(RuntimeError, "call out-of-place version"):
            activated.backward(upstream)
| |
# Merge into OpInfo?
def test_leaky_relu_inplace_with_zero_slope(self, device):
    """In-place leaky ReLU with slope 0 yields gradient [0, 0, 1] for [-2, 0, 2]."""
    def zero_slope_grad(**tensor_kwargs):
        # Run in-place leaky_relu_ on a clone and return the leaf's gradient.
        leaf = torch.tensor([-2., 0., 2.], device=device, requires_grad=True, **tensor_kwargs)
        activated = torch.nn.functional.leaky_relu_(leaf.clone(), 0.0)
        activated.backward(torch.ones(3, device=device))
        return leaf.grad

    # Default dtype.
    default_grad = zero_slope_grad()
    self.assertEqual(default_grad, torch.tensor([0., 0., 1.], device=device))

    # Reduced precision: float16 on 'mps:0', bfloat16 everywhere else.
    reduced_dtype = torch.float16 if device == 'mps:0' else torch.bfloat16
    reduced_grad = zero_slope_grad(dtype=reduced_dtype)
    self.assertEqual(reduced_grad, torch.tensor([0., 0., 1.], device=device, dtype=reduced_dtype))
| |
@onlyCPU
def test_softshrink(self, device):
    """Compare ``nn.Softshrink`` (default lambda) with precomputed values."""
    input_rows = [
        [1.21, 0.56, 0.5001, 0.4999, 1.2357, -0.4999, -0.5001, -1.154,
         0.254, -0.24, -0.225, 0.104, 0.002, -0.001, 0.0574, 1.2344,
         0.1748, -0.1797, -0.8125, 0.2051, -1.1328, 1.2344, -0.1562, 2.3554,
         -0.1953, 0.0304, -0.3613, -1.3047, 1.0312, 0.1436, -0.6953, 0.5664,
         -0.5820, -0.3301, 0.8203, 0.6133, 0.5938],
        [-0.8203, -1.2344, -0.5234, 2.5312, -0.4551, -0.6875, -1.5547, -0.2217,
         -0.3027, 2.6406, 1.3047, 0.2344, -1.6719, 0.2773, -1.3516, 3.4575,
         0.4414, 0.2656, 2.1094, -1.5156, 1.2344, -0.4336, 0.6797, -3.5486,
         0.9766, -0.4062, 1.4844, 0.7500, -1.7578, 0.7461, 1.6094, 8.5458,
         0.3730, -0.3477, -1.0625, 0.3848, 0.0557],
    ]
    expected_rows = [
        [0.71, 0.06, 0.0001, 0., 0.7357, 0., -0.0001, -0.654,
         0., 0., 0., 0., 0., 0., 0., 0.7344,
         0., 0., -0.3125, 0., -0.6328, 0.7344, 0., 1.8554,
         0., 0., 0., -0.8047, 0.5312, 0., -0.1953, 0.0664,
         -0.0820, 0.0, 0.3203, 0.1133, 0.0938],
        [-0.3203, -0.7344, -0.0234, 2.0312, 0.0, -0.1875, -1.0547, 0.,
         0.0, 2.1406, 0.8047, 0., -1.1719, 0., -0.8516, 2.9575,
         0., 0., 1.6094, -1.0156, 0.7344, 0., 0.1797, -3.0486,
         0.4766, 0., 0.9844, 0.2500, -1.2578, 0.2461, 1.1094, 8.0458,
         0., 0., -0.5625, 0., 0.],
    ]
    inputs = torch.tensor(input_rows, device=device)
    expected = torch.tensor(expected_rows)

    shrink = torch.nn.Softshrink()
    actual = shrink(inputs)
    # Absolute tolerance only.
    self.assertEqual(actual, expected, atol=1e-2, rtol=0)
| |
| def test_threshold_inplace_overlap(self, device): |
| # Inplace threshold is okay, because it is idempotent |
| x = torch.randn((1, 6), device=device).expand((6, 6)) |
| F.threshold(x, 0.5, 0.5, inplace=True) |
| F.threshold_(x, 0.5, 0.5) |
| |
@onlyNativeDeviceTypes
def test_triplet_margin_with_distance_loss_default_parity(self, device):
    """Parity of the distance-agnostic triplet loss with the classic one.

    Checks `nn.TripletMarginWithDistanceLoss` and
    `F.triplet_margin_with_distance_loss` against `nn.TripletMarginLoss`
    and `F.triplet_margin_loss` when no custom distance function is given.
    """
    def make_input():
        return torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double)

    margins = (0.5, 1, 1.5)
    swaps = (True, False)
    reductions = ('none', 'mean', 'sum')

    for margin, swap, reduction in itertools.product(margins, swaps, reductions):
        kwargs = {'margin': margin, 'swap': swap, 'reduction': reduction}

        anchor = make_input()
        positive = make_input()
        negative = make_input()
        triplet = (anchor, positive, negative)

        # Forward parity of the functional forms.
        expected = F.triplet_margin_loss(*triplet, **kwargs)
        actual = F.triplet_margin_with_distance_loss(*triplet, **kwargs)
        self.assertEqual(actual, expected, rtol=1e-6, atol=1e-6)

        # Forward parity of the module forms.
        loss_ref = nn.TripletMarginLoss(**kwargs)
        loss_op = nn.TripletMarginWithDistanceLoss(**kwargs)
        self.assertEqual(loss_op(*triplet), loss_ref(*triplet),
                         rtol=1e-6, atol=1e-6)

        # Backward: numerical gradient check of both forms.
        functional_with_kwargs = partial(F.triplet_margin_with_distance_loss, **kwargs)
        self.assertTrue(gradcheck(functional_with_kwargs, triplet))
        self.assertTrue(gradcheck(loss_op, triplet))
| |
@onlyNativeDeviceTypes
def test_triplet_margin_with_distance_loss(self, device):
    """Parity of module, functional and traced triplet loss with custom distances.

    Compares `nn.TripletMarginWithDistanceLoss` with
    `F.triplet_margin_with_distance_loss` (and a `torch.jit.trace` of the
    module) for several distance functions, reductions, margins and swaps.
    """
    def cosine_distance(x, y):
        return 1.0 - F.cosine_similarity(x, y)

    def make_input():
        return torch.randn(5, 10, device=device, requires_grad=True, dtype=torch.double)

    # A module, a named function and an anonymous function.
    distance_functions = (
        nn.PairwiseDistance(),
        cosine_distance,
        lambda x, y: 1.0 - F.cosine_similarity(x, y),
    )
    reductions = ('mean', 'none', 'sum')
    margins = (1.0, 1.5, 0.5)
    swaps = (True, False)

    combos = itertools.product(distance_functions, reductions, margins, swaps)
    for distance_fn, reduction, margin, swap in combos:
        anchor = make_input()
        positive = make_input()
        negative = make_input()
        triplet = (anchor, positive, negative)
        options = dict(distance_function=distance_fn, reduction=reduction,
                       margin=margin, swap=swap)

        # Backward: gradient checks for functional, module and traced module.
        functional_with_options = partial(F.triplet_margin_with_distance_loss, **options)
        self.assertTrue(gradcheck(functional_with_options, triplet))
        loss_op = nn.TripletMarginWithDistanceLoss(**options)
        self.assertTrue(gradcheck(loss_op, triplet))
        traced_loss_op = torch.jit.trace(loss_op, triplet)
        self.assertTrue(gradcheck(traced_loss_op, triplet))

        # Forward: the three variants must agree.
        functional = F.triplet_margin_with_distance_loss(*triplet, **options)
        modular = loss_op(*triplet)
        traced = traced_loss_op(*triplet)
        self.assertEqual(functional, modular, atol=1e-6, rtol=1e-6)
        self.assertEqual(traced, modular, atol=1e-6, rtol=1e-6)
| |
@dtypesIfMPS(torch.cfloat, torch.float)
@dtypes(torch.cfloat, torch.cdouble, torch.float)
def test_to_complex(self, device, dtype):
    """``Module.to`` returns the same module, casts weights, and warns for complex."""
    linear = nn.Linear(3, 5).to(device)
    moved_again = linear.to(device)
    self.assertIs(linear, moved_again)

    linear.to(dtype)
    self.assertIs(linear.weight.dtype, dtype)

    with warnings.catch_warnings(record=True) as caught:
        # Converting to a complex dtype is what triggers the warning.
        linear.to(torch.cfloat)
        # Exactly one warning is expected, carrying the message below.
        self.assertEqual(len(caught), 1)
        latest_message = str(caught[-1].message)
        self.assertTrue("Complex modules are a new feature" in latest_message)
| |
@skipMeta
@dtypesIfMPS(torch.float32)
@dtypes(torch.float32, torch.float64)
def test_module_to_empty(self, device, dtype):
    """Round-trip a module between the meta device and a real device via ``to_empty``."""
    class MatmulModule(nn.Module):
        def __init__(self, in_features, out_features, device=None, dtype=None):
            super().__init__()
            initial = torch.randn(in_features, out_features, device=device, dtype=dtype)
            self.weight = nn.Parameter(initial)

        def forward(self, x):
            return x @ self.weight

    # Test meta module instantiation.
    sample = torch.randn(5, 10, device=device, dtype=dtype)
    module = MatmulModule(10, 1, device='meta', dtype=dtype)
    module(sample)

    # Test empty meta module error with torch.nn.Module.to().
    expected_error = re.escape(
        "Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() "
        "instead of torch.nn.Module.to() when moving module from meta to a different "
        "device."
    )
    with self.assertRaisesRegex(NotImplementedError, expected_error):
        module.to(device)

    # Test materializing meta module on a real device.
    module.to_empty(device=device)
    module(sample)
    with torch.no_grad():
        torch.nn.init.kaiming_uniform_(module.weight)
    module(sample)

    # Test creating meta module from materialized module.
    module.to_empty(device='meta')
    module(sample)
| |
| def test_module_to_empty_non_recursive(self, device): |
| class Layer(nn.Module): |
| def __init__(self, in_features, out_features): |
| super().__init__() |
| self.weight = nn.Parameter(torch.randn(in_features, out_features)) |
| self.register_buffer('buf', torch.randn(out_features)) |
| |
| def forward(self, x): |
| return x @ self.weight + self.buf |
| |
| class MyModule(nn.Module): |
| def __init__(self, in_features, out_features): |
| super().__init__() |
| self.weight = nn.Parameter(torch.randn(in_features, out_features)) |
| self.register_buffer('buf1', torch.randn(out_features)) |
| self.layer = Layer(out_features, out_features) |
| |
| def forward(self, x): |
| return self.layer(x @ self.weight + self.buf1) |
| |
| with torch.device('meta'): |
| m = MyModule(3, 5) |
| |
| m.to_empty(device=device, recurse=False) |
| |
| # params/buffers of parent should have been materialized on device |
| self.assertTrue(not m.weight.is_meta) |
| self.assertTrue(not m.buf1.is_meta) |
| |
| # parameters/buffers of children submodules should still be on meta |
| for p in (*m.layer.parameters(), *m.layer.buffers()): |
| self.assertTrue(p.is_meta) |
| |
@skipMeta
def test_skip_init(self, device):
    """``skip_init`` places weights on the device but leaves them uninitialized."""
    seed = 1

    # Reference module created with normal initialization.
    torch.manual_seed(seed)
    m_initialized = torch.nn.Linear(5, 1)
    m_initialized.to(device)

    # Same seed, but construction goes through skip_init.
    torch.manual_seed(seed)
    m_uninitialized = torch.nn.utils.skip_init(torch.nn.Linear, 5, 1, device=device)

    initialized_weight = m_initialized.weight
    uninitialized_weight = m_uninitialized.weight
    self.assertEqual(initialized_weight.device, uninitialized_weight.device)
    self.assertFalse(torch.allclose(initialized_weight, uninitialized_weight))
| |
@skipIfRocm(msg='See https://github.com/pytorch/pytorch/issues/135150')
@skipIfMps  # TODO(hvaara): Investigate as possible bug. macOS 13 passes, while 14 and 15 fails.
@dtypes(torch.float)
@dtypesIfCUDA(torch.double, torch.float, torch.half)
def test_transformerencoderlayer(self, device, dtype):
    """Deterministic test for ``nn.TransformerEncoderLayer``.

    Parameters are overwritten with a fixed cosine pattern and the outputs
    for fixed inputs (with and without ``src_key_padding_mask``, and with a
    nested-tensor input on the fast path) are compared against hard-coded
    reference values, for every combination of ``batch_first`` and
    ``training``.
    """
    if TEST_WITH_ROCM and PLATFORM_SUPPORTS_FLASH_ATTENTION and dtype == torch.half:
        self.skipTest("Skip on ROCM due to Flash Attention tolerances")
    # this is a deterministic test for TransformerEncoderLayer
    d_model = 4
    nhead = 2
    dim_feedforward = 16
    dropout = 0.0

    # Normalize through str() once so every device check in this test uses
    # the same form.
    on_cuda = "cuda" in str(device)
    on_cpu = "cpu" in str(device)

    atol = 1e-5
    rtol = 1e-7
    if on_cuda:
        atol = 1e-3
        rtol = 1e-2

    def _test(training, batch_first, atol, rtol):
        def perm_fn(x):
            return x.transpose(1, 0) if batch_first else x

        model = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout,
                                           batch_first=batch_first, device=device, dtype=dtype)

        if not training:
            assert dropout == 0
            model = model.eval()

        # set constant weights of the model
        for p in model.parameters():
            x = p.data
            sz = x.view(-1).size(0)
            shape = x.shape
            x = torch.cos(torch.arange(0, sz).float().view(shape))
            p.data.copy_(x)

        # deterministic input
        encoder_input = torch.tensor([[[20., 30., 40., 50.]]], device=device, dtype=dtype)
        result = model(encoder_input)
        ref_output = torch.tensor([[[2.258703, 0.127985, -0.697881, 0.170862]]], device=device, dtype=dtype)
        self.assertEqual(result.shape, ref_output.shape)
        torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol)
        # 0 values are NOT masked. This shouldn't mask anything.
        mask = torch.tensor([[0]], device=device) == 1
        # TODO: enable fast path for calls with a mask!
        result = model(encoder_input, src_key_padding_mask=mask)
        self.assertEqual(result.shape, ref_output.shape)
        torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol)
        mask = torch.tensor([[1]], device=device) == 1
        result = model(encoder_input, src_key_padding_mask=mask)
        fast_path_device = result.is_cuda or result.is_cpu
        result = result.cpu().detach().numpy()
        # Non Fast Paths
        if training or not batch_first or TEST_WITH_CROSSREF or not fast_path_device:
            # We changed the semantics on the non fast path so that fully masked out rows return
            # 0 from attention thus NaNs should no longer be present and the output should be nonzero
            # due to skip connections
            self.assertTrue(not np.isnan(result).any())
        else:
            # Fast Paths
            self.assertTrue(np.isnan(result).all())

        # deterministic input
        encoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]],
                                              [[5., 6., 7., 8.]]], device=device, dtype=dtype))
        result = model(encoder_input)
        ref_output = perm_fn(torch.tensor([[[2.272644, 0.119035, -0.691669, 0.153486]],
                                           [[2.272644, 0.119035, -0.691669, 0.153486]]], device=device, dtype=dtype))
        self.assertEqual(result.shape, ref_output.shape)
        torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol)
        # all 0 which is no masking
        mask = torch.tensor([[0, 0]], device=device) == 1
        result = model(encoder_input, src_key_padding_mask=mask)
        self.assertEqual(result.shape, ref_output.shape)
        torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol)
        mask = torch.tensor([[1, 0]], device=device) == 1
        result = model(encoder_input, src_key_padding_mask=mask)
        ref_output = perm_fn(torch.tensor([[[2.301516, 0.092249, -0.679101, 0.103088]],
                                           [[2.301516, 0.092249, -0.679101, 0.103088]]], device=device, dtype=dtype))
        self.assertEqual(result.shape, ref_output.shape)
        torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol)

        # deterministic input
        encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891],
                                               [0.5387, 0.1655, 0.3565, 0.0471]],
                                              [[0.8335, 0.2799, 0.5031, 0.2947],
                                               [0.1402, 0.0318, 0.7636, 0.1346]],
                                              [[0.6333, 0.9344, 0.1376, 0.9938],
                                               [0.8924, 0.2872, 0.6692, 0.2944]],
                                              [[0.9897, 0.6915, 0.3154, 0.1733],
                                               [0.8645, 0.3513, 0.3064, 0.0767]],
                                              [[0.8117, 0.2366, 0.4838, 0.7881],
                                               [0.3718, 0.4945, 0.9511, 0.0864]]], device=device, dtype=dtype))
        result = model(encoder_input)
        ref_output = perm_fn(torch.tensor([[[2.428589, 0.020835, -0.602055, -0.085249],
                                            [2.427987, 0.021213, -0.602496, -0.084103]],
                                           [[2.424689, 0.019155, -0.604793, -0.085672],
                                            [2.413863, 0.022211, -0.612486, -0.072490]],
                                           [[2.433774, 0.021598, -0.598343, -0.087548],
                                            [2.425104, 0.019748, -0.604515, -0.084839]],
                                           [[2.436185, 0.022682, -0.596625, -0.087261],
                                            [2.433556, 0.021891, -0.598509, -0.086832]],
                                           [[2.416246, 0.017512, -0.610712, -0.082961],
                                            [2.422901, 0.024187, -0.606178, -0.074929]]], device=device, dtype=dtype))
        self.assertEqual(result.shape, ref_output.shape)
        torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol)

        # all 0
        mask = torch.zeros([2, 5], device=device) == 1
        result = model(encoder_input, src_key_padding_mask=mask)
        self.assertEqual(result.shape, ref_output.shape)
        torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol)
        mask[0, 1] = 1
        mask[1, 3] = 1
        mask[1, 4] = 1
        result = model(encoder_input, src_key_padding_mask=mask)
        ref_output = perm_fn(torch.tensor([[[2.429026, 0.020793, -0.601741, -0.085642],
                                            [2.428811, 0.021445, -0.601912, -0.084252]],
                                           [[2.425009, 0.019155, -0.604566, -0.085899],
                                            [2.415408, 0.02249 , -0.611415, -0.073]],
                                           [[2.434199, 0.021682, -0.598039, -0.087699],
                                            [2.42598, 0.019941, -0.603896, -0.085091]],
                                           [[2.436457, 0.022736, -0.59643 , -0.08736],
                                            [2.434021, 0.022093, -0.598179, -0.08679]],
                                           [[2.416531, 0.017498, -0.610513, -0.083181],
                                            [2.4242, 0.024653, -0.605266, -0.074959]]], device=device, dtype=dtype))
        self.assertEqual(result.shape, ref_output.shape)
        torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol)

        # NestedTensor is only supported for the fast path
        # currently, which won't be used if training.
        if (batch_first and not training and
                (on_cuda or on_cpu) and not TEST_WITH_CROSSREF):
            encoder_input[0][-1] = torch.zeros_like(encoder_input[0][1])

            # The first sequence drops its last (zeroed) step, so the two
            # nested components have different lengths.
            nt = torch.nested.nested_tensor([encoder_input[0][:-1], encoder_input[1]], device=device)
            result = model(nt)
            ref_output = torch.tensor(
                [
                    [
                        [2.4268184, 0.02042419, -0.603311, -0.08476824],
                        [2.423306, 0.01889652, -0.6057701, -0.08519465],
                        [2.431538, 0.02078694, -0.5999354, -0.08746159],
                        [2.4348664, 0.02212971, -0.5975677, -0.08733892],
                        [2.423133, 0.02097577, -0.60594773, -0.08113337],
                    ],
                    [
                        [2.4279876, 0.02121329, -0.60249615, -0.08410317],
                        [2.4138637, 0.02221113, -0.6124869, -0.07249016],
                        [2.4251041, 0.01974815, -0.6045152, -0.08483928],
                        [2.4335563, 0.0218913, -0.59850943, -0.08683228],
                        [2.4229012, 0.02418739, -0.6061784, -0.07492948],
                    ],
                ],
                device=device, dtype=dtype
            )
            result = result.to_padded_tensor(0)
            # Zero the padded position in both tensors before comparing.
            ref_output[0][-1] = torch.zeros_like(
                ref_output[0][-1], device=device, dtype=dtype
            )
            result[0][-1] = torch.zeros_like(
                result[0][-1], device=device, dtype=dtype
            )
            self.assertEqual(tuple(result.shape), tuple(ref_output.shape))
            if on_cuda:
                if dtype == torch.float:
                    atol = 2e-4
                    rtol = 4e-3
                else:
                    atol = 7e-4
                    rtol = 2e-2
                torch.testing.assert_close(result, ref_output, atol=atol, rtol=rtol)
            else:
                torch.testing.assert_close(result, ref_output)

    for batch_first in (True, False):
        for training in (True, False):
            if training:
                cm = contextlib.nullcontext()
            else:
                # Fast path requires inference mode.
                cm = torch.no_grad()
            with cm:
                _test(batch_first=batch_first, training=training, atol=atol, rtol=rtol)
| |
@onlyCPU
@dtypes(torch.double)
def test_transformerencoderlayer_fast_path(self, device, dtype):
    """
    Test transformer fast path on CPU with different valid mask types and shapes
    """
    d_model = 512
    nhead = 8
    batch_size = 32
    src_len = 10

    model = torch.nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True,
                                             device=device, dtype=dtype, dropout=0)
    model.eval()

    # Batched inputs; the feature size is taken from d_model instead of a
    # duplicated literal, and tensors are created on the test device.
    src = torch.rand(batch_size, src_len, d_model, dtype=dtype, device=device)

    # Attention mask of shape (src_len, src_len)
    src_mask = torch.zeros(src_len, src_len, dtype=torch.bool, device=device)
    # Padding mask of shape (batch_size, src_len)
    src_key_padding_mask = torch.zeros(batch_size, src_len, dtype=torch.bool, device=device)

    # The calls run under no_grad; they only need to complete without raising.
    with torch.no_grad():
        # Attention mask only
        model(src, src_mask=src_mask)

        # Padding mask only
        model(src, src_key_padding_mask=src_key_padding_mask)

        # Provide both masks
        model(src, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask)
| |
| |
@dtypes(torch.float)
@dtypesIfCUDA(torch.half, torch.float)
def test_transformerencoderlayer_gelu(self, device, dtype):
    """Deterministic test for ``nn.TransformerEncoderLayer`` with GELU activation.

    The activation is supplied as a string, a function and a module; for
    each, parameters are overwritten with a fixed cosine pattern and the
    outputs for fixed inputs are compared against hard-coded reference
    values, for every combination of ``batch_first`` and ``training``.
    """
    if TEST_WITH_ROCM and PLATFORM_SUPPORTS_FLASH_ATTENTION and dtype == torch.half:
        self.skipTest("Skip on ROCM due to Flash Attention tolerances")
    # this is a deterministic test for TransformerEncoderLayer with gelu activation
    d_model = 4
    nhead = 2
    dim_feedforward = 16
    dropout = 0.0

    atol = 0
    rtol = 1e-5
    # str() makes the check independent of whether `device` is a string.
    if "cuda" in str(device):
        atol = 1e-3
        rtol = 1e-2

    def _test(activation, batch_first, training):
        def perm_fn(x):
            return x.transpose(1, 0) if batch_first else x

        model = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout,
                                           activation, batch_first=batch_first, device=device, dtype=dtype)
        if not training:
            assert dropout == 0
            model = model.eval()

        # set constant weights of the model
        for p in model.parameters():
            x = p.data
            sz = x.view(-1).size(0)
            shape = x.shape
            x = torch.cos(torch.arange(0, sz).float().view(shape))
            p.data.copy_(x)

        # deterministic input
        encoder_input = torch.tensor([[[20., 30., 40., 50.]]], device=device, dtype=dtype)
        result = model(encoder_input)
        ref_output = torch.tensor([[[2.249815, 0.131006, -0.702199, 0.177868]]], device=device, dtype=dtype)
        torch.testing.assert_close(result, ref_output, rtol=rtol, atol=atol)

        # deterministic input
        encoder_input = perm_fn(torch.tensor([[[1., 2., 3., 4.]],
                                              [[5., 6., 7., 8.]]], device=device, dtype=dtype))
        result = model(encoder_input)
        ref_output = perm_fn(torch.tensor([[[2.264103, 0.121417, -0.696012, 0.159724]],
                                           [[2.264103, 0.121417, -0.696012, 0.159724]]], device=device, dtype=dtype))
        torch.testing.assert_close(result, ref_output, rtol=rtol, atol=atol)

        # deterministic input
        encoder_input = perm_fn(torch.tensor([[[0.7462, 0.6653, 0.5679, 0.4891],
                                               [0.5387, 0.1655, 0.3565, 0.0471]],
                                              [[0.8335, 0.2799, 0.5031, 0.2947],
                                               [0.1402, 0.0318, 0.7636, 0.1346]],
                                              [[0.6333, 0.9344, 0.1376, 0.9938],
                                               [0.8924, 0.2872, 0.6692, 0.2944]],
                                              [[0.9897, 0.6915, 0.3154, 0.1733],
                                               [0.8645, 0.3513, 0.3064, 0.0767]],
                                              [[0.8117, 0.2366, 0.4838, 0.7881],
                                               [0.3718, 0.4945, 0.9511, 0.0864]]], device=device, dtype=dtype))
        result = model(encoder_input)
        ref_output = perm_fn(torch.tensor([[[2.42163188, 0.03227153, -0.60714219, -0.05908082],
                                            [2.42151276, 0.03302179, -0.60722523, -0.05762651]],
                                           [[2.41926761, 0.02974034, -0.60879519, -0.0621269],
                                            [2.41626395, 0.03539356, -0.61087842, -0.04978623]],
                                           [[2.42382808, 0.03218872, -0.6055963, -0.06073591],
                                            [2.41983477, 0.03085259, -0.60840145, -0.06046414]],
                                           [[2.42500749, 0.03328855, -0.60476388, -0.0595334],
                                            [2.4237977, 0.03290575, -0.60561789, -0.05940082]],
                                           [[2.41383916, 0.02686345, -0.61256377, -0.06380707],
                                            [2.42000277, 0.03800944, -0.60824798, -0.04754947]]], device=device, dtype=dtype))
        torch.testing.assert_close(result, ref_output, rtol=rtol, atol=atol)

    for activation, batch_first, training in product(('gelu', F.gelu, nn.GELU()), (True, False), (True, False)):
        # Fast path requires inference mode.
        if training:
            cm = contextlib.nullcontext()
        else:
            cm = torch.no_grad()
        with cm:
            _test(activation=activation, batch_first=batch_first, training=training)
| |
@skipIfMps  # RuntimeError: foreach=True was passed, but can't use the foreach API on mps tensors
@parametrize_test('foreach', (False, True))
def test_clip_grad_value(self, foreach, device):
    """``clip_grad_value_`` bounds every gradient element to ``[-clip, clip]``."""
    if torch.device(device).type == 'xla' and foreach:
        raise SkipTest('foreach not supported on XLA')

    layer = nn.Linear(10, 10).to(device)
    clip_value = 2.5

    grad_w = torch.arange(-50., 50, device=device).view(10, 10).div_(5)
    grad_b = torch.ones(10, device=device).mul_(2)

    # Second scenario leaves one parameter without a gradient.
    scenarios = ([grad_w, grad_b], [grad_w, None])
    for grad_list in scenarios:
        for param, grad in zip(layer.parameters(), grad_list):
            if grad is None:
                param._grad = grad
            else:
                param._grad = grad.clone().view_as(param.data)

        clip_grad_value_(layer.parameters(), clip_value, foreach=foreach)
        for param in layer.parameters():
            if param.grad is None:
                continue
            self.assertLessEqual(param.grad.data.max(), clip_value)
            self.assertGreaterEqual(param.grad.data.min(), -clip_value)

    # Should accept a single Tensor as input
    single = torch.randn(10, 10, device=device)
    wrapped = torch.randn(10, 10, device=device)
    shared_grad = torch.arange(-50., 50, device=device).view(10, 10).div_(5)
    single._grad = shared_grad.clone()
    wrapped._grad = shared_grad.clone()
    clip_grad_value_(single, clip_value, foreach=foreach)
    clip_grad_value_([wrapped], clip_value, foreach=foreach)
    self.assertEqual(single.grad, wrapped.grad)
| |
@skipIfMps  # TypeError: the MPS framework doesn't support float64
@parametrize_test('foreach', (False, True))
@parametrize_test('norm_type', (0.5, 1.5, 2, 4, 'inf'))
def test_clip_grad_norm(self, norm_type, foreach, device):
    """``clip_grad_norm_`` rescales large gradients uniformly and leaves small ones alone."""
    if torch.device(device).type == 'xla' and foreach:
        raise SkipTest('foreach not supported on XLA')

    layer = nn.Linear(10, 10).to(device)
    max_norm = 2

    def measure_norm(order):
        # Reference total norm over all of the layer's gradients.
        order = float(order)
        if order == inf:
            return max(p.grad.data.abs().max() for p in layer.parameters())
        accumulated = 0
        for p in layer.parameters():
            accumulated += p.grad.data.abs().pow(order).sum()
        return pow(accumulated, 1. / order)

    def check_uniform_scale(reference_grads):
        # Every gradient element must be scaled by one common factor.
        ratios = []
        for p, g in zip(layer.parameters(), reference_grads):
            ratios.append(p.grad.data.div(g).view(-1))
        all_ratios = torch.cat(ratios)
        self.assertEqual(all_ratios.std(), 0)
        return all_ratios[0]

    # Large gradients: must be clipped down to max_norm.
    grads = torch.arange(1., 101, device=device).view(10, 10), torch.ones(10, device=device).div(1000)
    for p, g in zip(layer.parameters(), grads):
        p._grad = g.clone().view_as(p.data)
    norm_before = measure_norm(norm_type)
    norm = clip_grad_norm_(layer.parameters(), max_norm, norm_type=norm_type, foreach=foreach)
    norm_after = measure_norm(norm_type)
    self.assertEqual(norm, norm_before)
    self.assertEqual(norm_after, max_norm)
    self.assertLessEqual(norm_after, norm_before)
    check_uniform_scale(grads)

    # Small gradients should be left unchanged
    grads = torch.rand(10, 10, device=device).div(10000), torch.ones(10, device=device).div(500)
    for p, g in zip(layer.parameters(), grads):
        p.grad.data.copy_(g)
    norm_before = measure_norm(norm_type)
    norm = clip_grad_norm_(layer.parameters(), max_norm, norm_type=norm_type, foreach=foreach)
    norm_after = measure_norm(norm_type)
    self.assertEqual(norm, norm_before)
    self.assertEqual(norm_before, norm_after)
    self.assertLessEqual(norm_after, max_norm)
    common_scale = check_uniform_scale(grads)
    self.assertEqual(common_scale, 1)

    # Should accept a single Tensor as input
    single = torch.randn(10, 10, device=device)
    wrapped = torch.randn(10, 10, device=device)
    shared_grad = torch.arange(1., 101, device=device).view(10, 10)
    single._grad = shared_grad.clone()
    wrapped._grad = shared_grad.clone()
    clip_grad_norm_(single, max_norm, norm_type=norm_type, foreach=foreach)
    clip_grad_norm_([wrapped], max_norm, norm_type=norm_type, foreach=foreach)
    self.assertEqual(single.grad, wrapped.grad)
| |
# reference issue: https://github.com/pytorch/pytorch/issues/111484
@onlyCUDA
@largeTensorTest("42GB", "cuda")
def test_softmax_forward_64bit_indexing(self, device):
    """Run ``CrossEntropyLoss`` on float16 logits with more than 2**31 elements.

    The logits tensor holds 70 * 2047 * 50000 elements, all equal to one, and
    every target is class 0. The unreduced loss is compared against
    ``log(vocab_size)`` using the default float16 tolerances.
    """
    batch_size, seq_len, vocab_size = 70, 2048, 50000
    num_positions = seq_len - 1

    shift_labels = torch.zeros(batch_size, num_positions, dtype=torch.long, device=device)
    logits = torch.ones(batch_size, num_positions, vocab_size, dtype=torch.float16, device=device)

    criterion = torch.nn.CrossEntropyLoss(reduction="none")
    # The loss expects the class dimension second, hence the permute.
    nll = criterion(logits.permute(0, 2, 1), shift_labels).float()

    expected = torch.ones_like(nll) * torch.log(torch.tensor(vocab_size))
    rtol, atol = torch.testing._comparison.get_tolerances(torch.float16, rtol=None, atol=None)
    self.assertEqual(nll, expected, rtol=rtol, atol=atol)
| |
@onlyCUDA
@largeTensorTest("20GB", "cuda")
def test_softmax_backward_64bit_indexing(self, device):
    """Call ``torch._softmax_backward_data`` on tensors longer than 2**31 elements.

    For each size, a float16 tensor filled with ``1 / numel`` is passed as both
    the gradient and the output argument, and the first element of the result
    is compared with ``1 / numel``.
    """
    base_numel = 2147483650  # 2**31 + 2
    for numel in (base_numel, base_numel + 1):
        fill_value = 1.0 / numel
        x = torch.empty([1, 1, numel], device=device, dtype=torch.float16).fill_(fill_value)
        out = torch._softmax_backward_data(x, x, 2, x.dtype)
        self.assertEqual(out[0, 0, 0], fill_value)
| |
# reference issue: https://github.com/pytorch/pytorch/issues/68248
@onlyCUDA
def test_adaptiveavg_pool1d_shmem(self, device):
    """Compare ``adaptive_avg_pool2d`` input gradients between ``device`` and CPU.

    A channels-last ``(1, 256, 1, 5000)`` input is pooled to ``(1, 256)`` on
    both devices, the same random upstream gradient is back-propagated through
    each, and the resulting input gradients must match.
    """
    pool = torch.nn.functional.adaptive_avg_pool2d
    output_size = (1, 256)

    x = torch.randn(1, 256, 1, 5000, device=device).to(memory_format=torch.channels_last)
    # Copy to CPU before enabling autograd so each tensor is its own leaf.
    x_cpu = x.cpu().requires_grad_()
    x.requires_grad_()

    y = pool(x, output_size)
    y_cpu = pool(x_cpu, output_size)

    upstream = torch.randn_like(y)
    upstream_cpu = upstream.cpu()
    y.backward(upstream)
    y_cpu.backward(upstream_cpu)

    self.assertEqual(x.grad, x_cpu.grad)
| |
@skipMeta
@expectedFailureMPS  # NotImplementedError: aten::channel_shuffle https://github.com/pytorch/pytorch/issues/77764
def test_channel_shuffle(self, device):
    """Check ``F.channel_shuffle`` with 2 groups on 3D, 4D and 5D inputs.

    Each input has 4 channels and is compared with a hand-written expected
    result. The 4D and 5D inputs are additionally checked in their
    channels-last layouts. Every call must complete without recording a
    warning.
    """

    def check(inp, expected, memory_format=None):
        # Shuffle ``inp`` (optionally converted to ``memory_format`` first),
        # require that no warning was recorded, then compare with ``expected``.
        with warnings.catch_warnings(record=True) as caught:
            # The layout conversion stays inside the capture scope so that a
            # warning raised by it would also be counted.
            src = inp if memory_format is None else inp.contiguous(memory_format=memory_format)
            out = F.channel_shuffle(src, 2).to(device)
            self.assertEqual(len(caught), 0)
        if memory_format is not None:
            out = out.contiguous(memory_format=torch.contiguous_format)
        self.assertEqual(out, expected)

    # 3D tensor
    x = torch.tensor(
        [[[1, 2], [5, 6], [9, 10], [13, 14]]],
        device=device,
    )
    y_ref = torch.tensor(
        [[[1, 2], [9, 10], [5, 6], [13, 14]]],
        device=device,
    )
    # ChannelsFirst
    check(x, y_ref)
    # ChannelsLast not supported for 3dim

    # 4D tensor
    x = torch.tensor(
        [[
            [[1, 2], [3, 4]],
            [[5, 6], [7, 8]],
            [[9, 10], [11, 12]],
            [[13, 14], [15, 16]],
        ]],
        device=device,
    )
    y_ref = torch.tensor(
        [[
            [[1, 2], [3, 4]],
            [[9, 10], [11, 12]],
            [[5, 6], [7, 8]],
            [[13, 14], [15, 16]],
        ]],
        device=device,
    )
    # ChannelsFirst NCHW
    check(x, y_ref)
    # ChannelsLast NHWC
    check(x, y_ref, memory_format=torch.channels_last)

    # 5D tensor
    x = torch.tensor(
        [[
            [[[1, 2], [3, 4]]],
            [[[5, 6], [7, 8]]],
            [[[9, 10], [11, 12]]],
            [[[13, 14], [15, 16]]],
        ]],
        device=device,
    )
    y_ref = torch.tensor(
        [[
            [[[1, 2], [3, 4]]],
            [[[9, 10], [11, 12]]],
            [[[5, 6], [7, 8]]],
            [[[13, 14], [15, 16]]],
        ]],
        device=device,
    )
    # ChannelsFirst NCHW
    check(x, y_ref)
    # ChannelsLast NHWC
    check(x, y_ref, memory_format=torch.channels_last_3d)
| |
| |
class TestFunctionalPickle(TestCase):
    """Pickling checks for functions exposed by ``torch.nn.functional``."""

    # issue gh-38137
    def test_pickle_softsign(self):
        """Serializing ``F.softsign`` with ``pickle`` must not raise."""
        # Only the absence of an exception matters; the payload is discarded.
        pickle.dumps(F.softsign)
| |
| |
class TestFusionUtils(TestCase):
    """Checks that the weight-fusion helpers keep the ``requires_grad`` flags."""

    def test_fuse_conv_bn_requires_grad(self):
        """Fused conv weight/bias inherit ``requires_grad`` from the conv layer.

        All four combinations of the weight and bias flags are tried.
        """
        conv = torch.nn.Conv2d(3, 3, 3)
        bn = torch.nn.BatchNorm2d(3)
        for weight_flag, bias_flag in itertools.product((True, False), repeat=2):
            conv.weight.requires_grad = weight_flag
            conv.bias.requires_grad = bias_flag
            fused_weight, fused_bias = fuse_conv_bn_weights(
                conv.weight, conv.bias,
                bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias,
            )
            self.assertEqual(fused_weight.requires_grad, weight_flag)
            self.assertEqual(fused_bias.requires_grad, bias_flag)

    def test_fuse_linear_bn_requires_grad(self):
        """Fused linear weight/bias inherit ``requires_grad`` from the linear layer.

        All four combinations of the weight and bias flags are tried.
        """
        linear = torch.nn.Linear(3, 3)
        bn = torch.nn.BatchNorm1d(3)
        for weight_flag, bias_flag in itertools.product((True, False), repeat=2):
            linear.weight.requires_grad = weight_flag
            linear.bias.requires_grad = bias_flag
            fused_weight, fused_bias = fuse_linear_bn_weights(
                linear.weight, linear.bias,
                bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias,
            )
            self.assertEqual(fused_weight.requires_grad, weight_flag)
            self.assertEqual(fused_bias.requires_grad, bias_flag)
| |
| class TestUtils(TestCase): |
| def test_consume_prefix_in_state_dict_if_present(self): |
| class Block(nn.Module): |
| def __init__(self) -> None: |
| super().__init__() |
| self.conv1 = nn.Conv2d(3, 3, 3, bias=True) |
| self.conv2 = nn.Conv2d(3, 3, 3, bias=False) |
| |
| class Net(nn.Module): |
| def __init__(self) -> None: |
| super().__init__() |
| self.linear1 = nn.Linear(5, 5) |
| self.linear2 = nn.Linear(5, 5) |
| net.bn = nn.BatchNorm2d(2) |
| self.block = Block() |
| |
| # 0. Case non-DDP model empty state_dict |
| net = nn.Module() |
| state_dict = net.state_dict() |
| nn.modules.utils.consume_prefix_in_state_dict_if_present(state_dict, 'module.') |
| # check they are the same preserving order |
| self.assertEqual(list(state_dict.keys()), list(net.state_dict().keys())) |
| self.assertEqual(list(state_dict._metadata.keys()), list(net.state_dict()._metadata.keys())) |
| |
| # 1. Case non-DDP model test example state_dict |
| net = Net() |
| state_dict = net.state_dict() |
| nn.modules.utils.consume_prefix_in_state_dict_if_present(state_dict, 'module.') |
| # Check they are the same preserving order |
| self.assertEqual(list(state_dict.keys()), list(net.state_dict().keys())) |
| self.assertEqual(list(state_dict._metadata.keys()), list(net.state_dict()._metadata.keys())) |
| |
| # 2. Case DDP model test example state_dict |
| state_dict = net.state_dict() |
| metadata = state_dict._metadata |
| ddp_state_dict = OrderedDict((f'module.{k}', v) for k, v in state_dict.items()) |
| ddp_state_dict._metadata = OrderedDict({'': metadata['']}) |
| ddp_state_dict._metadata.update(('module' if k == '' else f'module.{k}', v) for k, v in metadata.items()) |
| nn.modules.utils.consume_prefix_in_state_dict_if_present(ddp_state_dict, 'module.') |
| # Check they are the same preserving order |
| self.assertEqual(list(state_dict.keys()), list(ddp_state_dict.keys())) |
| self.assertEqual(list(state_dict._metadata.keys()), list(ddp_state_dict._metadata.keys())) |
| |
| |
# Register the generated test variants at import time: TestNNDeviceType is
# instantiated into this module's globals with MPS allowed, and TestNN has its
# parametrized tests instantiated.
instantiate_device_type_tests(TestNNDeviceType, globals(), allow_mps=True)
instantiate_parametrized_tests(TestNN)

if __name__ == "__main__":
    # Turn on the default-dtype check on TestCase before running the suite.
    TestCase._default_dtype_check_enabled = True
    run_tests()