| # Owner(s): ["module: cuda"] |
| |
| from itertools import repeat, chain, product |
| from typing import NamedTuple |
| import collections |
| import contextlib |
| import ctypes |
| import gc |
| import io |
| import os |
| import pickle |
| import queue |
| import sys |
| import tempfile |
| import threading |
| import unittest |
| |
| import torch |
| import torch.cuda |
| import torch.cuda.comm as comm |
| from torch.nn.parallel import scatter_gather |
| from torch.utils.checkpoint import checkpoint_sequential |
| from torch._six import inf, nan |
| from torch.testing._internal.common_methods_invocations import tri_tests_args, tri_large_tests_args, \ |
| _compare_trilu_indices, _compare_large_trilu_indices |
| from torch.testing._internal.common_utils import TestCase, freeze_rng_state, run_tests, \ |
| NO_MULTIPROCESSING_SPAWN, skipIfRocm, load_tests, IS_REMOTE_GPU, IS_SANDCASTLE, IS_WINDOWS, \ |
| slowTest, skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf, TEST_WITH_ROCM, TEST_NUMPY, \ |
| get_cycles_per_ms |
| from torch.testing._internal.autocast_test_lists import AutocastTestLists |
| |
| # load_tests from common_utils is used to automatically filter tests for |
| # sharding on sandcastle. This line silences flake warnings |
| load_tests = load_tests |
| |
| # We cannot import TEST_CUDA and TEST_MULTIGPU from torch.testing._internal.common_cuda here, |
| # because doing so would execute the TEST_CUDNN line from torch.testing._internal.common_cuda |
| # multiple times during this test suite, which causes a CUDA OOM error on Windows. |
| TEST_CUDA = torch.cuda.is_available() |
| TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2 |
| |
| if not TEST_CUDA: |
| print('CUDA not available, skipping tests', file=sys.stderr) |
| TestCase = object # noqa: F811 |
| |
| TEST_LARGE_TENSOR = TEST_CUDA |
| TEST_MEDIUM_TENSOR = TEST_CUDA |
| TEST_CUDNN = TEST_CUDA |
| TEST_BF16 = False |
| if TEST_CUDA: |
| torch.ones(1).cuda() # initialize cuda context |
| TEST_CUDNN = TEST_CUDA and (TEST_WITH_ROCM or |
| torch.backends.cudnn.is_acceptable(torch.tensor(1., device=torch.device('cuda:0')))) |
| TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 12e9 |
| TEST_MEDIUM_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 6e9 |
| TEST_BF16 = torch.cuda.is_bf16_supported() |
| |
| |
| def make_sparse_tensor(t, n, *sizes): |
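| # Builds a coalesced sparse tensor of sparse type `t` with `n` values placed |
| # at random indices within the given `sizes`. |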
| assert t.is_sparse |
| tensor = t() |
| i = tensor._indices() |
| i = i.new(len(sizes), n).copy_( |
| torch.cat([torch.LongTensor(1, n).random_(s) for s in sizes], 0)) |
| v = tensor._values() |
| v = v.new(n).copy_(torch.randn(n)) |
| return t(i, v, torch.Size(sizes)).coalesce() |
| |
| _cycles_per_ms = None |
| |
| |
| class TestCuda(TestCase): |
| _do_cuda_memory_leak_check = True |
| _do_cuda_non_default_stream = True |
| FIFTY_MIL_CYCLES = 50000000 |
| |
| def setUp(self): |
| super(TestCuda, self).setUp() |
| self.autocast_lists = AutocastTestLists(torch.device('cuda:0')) |
| |
| def tearDown(self): |
| del self.autocast_lists |
| super(TestCuda, self).tearDown() |
| |
| def _check_memory_stat_consistency(self): |
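| # Recompute the expected per-device counters from torch.cuda.memory_snapshot() |
| # and check that torch.cuda.memory_stats() agrees with them. |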
| snapshot = torch.cuda.memory_snapshot() |
| |
| expected_each_device = collections.defaultdict(lambda: collections.defaultdict(int)) |
| |
| for segment in snapshot: |
| expected = expected_each_device[segment["device"]] |
| pool_str = segment["segment_type"] + "_pool" |
| |
| expected["segment.all.current"] += 1 |
| expected["segment." + pool_str + ".current"] += 1 |
| |
| expected["allocated_bytes.all.current"] += segment["allocated_size"] |
| expected["allocated_bytes." + pool_str + ".current"] += segment["allocated_size"] |
| |
| expected["reserved_bytes.all.current"] += segment["total_size"] |
| expected["reserved_bytes." + pool_str + ".current"] += segment["total_size"] |
| |
| expected["active_bytes.all.current"] += segment["active_size"] |
| expected["active_bytes." + pool_str + ".current"] += segment["active_size"] |
| |
| is_split = len(segment["blocks"]) > 1 |
| for block in segment["blocks"]: |
| if block["state"] == "active_allocated": |
| expected["allocation.all.current"] += 1 |
| expected["allocation." + pool_str + ".current"] += 1 |
| |
| if block["state"].startswith("active_"): |
| expected["active.all.current"] += 1 |
| expected["active." + pool_str + ".current"] += 1 |
| |
| if block["state"] == "inactive" and is_split: |
| expected["inactive_split.all.current"] += 1 |
| expected["inactive_split." + pool_str + ".current"] += 1 |
| expected["inactive_split_bytes.all.current"] += block["size"] |
| expected["inactive_split_bytes." + pool_str + ".current"] += block["size"] |
| |
| for device, expected in expected_each_device.items(): |
| stats = torch.cuda.memory_stats(device) |
| for k, v in expected.items(): |
| self.assertEqual(v, stats[k]) |
| |
| @staticmethod |
| def _test_memory_stats_generator(self, device=None, N=35): |
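| # Generator that allocates and frees tensors in stages, yielding after each |
| # stage so callers can interleave extra checks (test_memory_stats runs a |
| # consistency check per stage; test_memory_stats_multigpu interleaves two |
| # generators on different devices). |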
| if device is None: |
| device = torch.cuda.current_device() |
| |
| m0 = torch.cuda.memory_allocated(device) |
| last_m_arr = [torch.cuda.memory_allocated(device)] |
| max_m_arr = [torch.cuda.max_memory_allocated(device)] |
| last_r_arr = [torch.cuda.memory_reserved(device)] |
| max_r_arr = [torch.cuda.max_memory_reserved(device)] |
| |
| def alloc(*size): |
| with torch.cuda.device(device): |
| # NOTE: do **not** use methods that can have additional |
| # memory overhead, e.g., inplace random sampling methods. |
| # They can leave some memory occupied even after being |
| # deallocated, e.g., initialized RNG state, causing some |
| # memory checks below to fail. |
| return torch.cuda.FloatTensor(*size) |
| |
| def assert_change(comp=1, empty_cache=False, reset_peak=False): |
| # comp > 0: increased |
| # comp = 0: equal |
| # comp < 0: decreased |
| new_m = torch.cuda.memory_allocated(device) |
| new_max_m = torch.cuda.max_memory_allocated(device) |
| if comp > 0: |
| self.assertGreater(new_m, last_m_arr[0]) |
| elif comp < 0: |
| self.assertLess(new_m, last_m_arr[0]) |
| else: |
| self.assertEqual(new_m, last_m_arr[0]) |
| self.assertLessEqual(new_m, new_max_m) |
| self.assertGreaterEqual(new_max_m, max_m_arr[0]) |
| last_m_arr[0] = new_m |
| max_m_arr[0] = new_max_m |
| |
| new_r = torch.cuda.memory_reserved(device) |
| new_max_r = torch.cuda.max_memory_reserved(device) |
| # emptying the cache may happen (due to an allocation or empty_cache), so |
| # we can't assert new_r >= last_r |
| self.assertLessEqual(new_r, new_max_r) |
| self.assertGreaterEqual(new_max_r, max_r_arr[0]) |
| last_r_arr[0] = new_r |
| max_r_arr[0] = new_max_r |
| |
| if empty_cache: |
| torch.cuda.empty_cache() |
| new_r = torch.cuda.memory_reserved(device) |
| new_max_r = torch.cuda.max_memory_reserved(device) |
| self.assertLessEqual(new_r, last_r_arr[0]) |
| self.assertLessEqual(new_r, new_max_r) |
| self.assertEqual(new_max_r, max_r_arr[0]) |
| last_r_arr[0] = new_r |
| |
| if reset_peak: |
| torch.cuda.reset_peak_memory_stats(device) |
| self.assertEqual(torch.cuda.memory_allocated(device), last_m_arr[0]) |
| self.assertEqual(torch.cuda.max_memory_allocated(device), last_m_arr[0]) |
| max_m_arr[0] = last_m_arr[0] |
| self.assertEqual(torch.cuda.memory_reserved(device), last_r_arr[0]) |
| self.assertEqual(torch.cuda.max_memory_reserved(device), last_r_arr[0]) |
| max_r_arr[0] = last_r_arr[0] |
| |
| assert_change(0) |
| assert_change(0, reset_peak=True) |
| assert_change(0, empty_cache=True) |
| assert_change(0, reset_peak=True) |
| assert_change(0) |
| yield |
| |
| tensors1 = [alloc(1), alloc(10, 20), alloc(200, 300, 2000)] |
| m1 = torch.cuda.memory_allocated(device) |
| assert_change(1) |
| yield |
| |
| tensors2 = [] |
| |
| for i in range(1, int(N / 2) + 1): |
| # small ones |
| tensors2.append(alloc(i, i * 4)) |
| assert_change(1) |
| yield |
| |
| for i in range(5, int(N / 2) + 5): |
| # large ones |
| tensors2.append(alloc(i, i * 7, i * 9, i * 11)) |
| assert_change(1, reset_peak=(i % 2 == 0)) |
| yield |
| |
| tensors2.append(alloc(0, 0, 0)) |
| assert_change(0) |
| yield |
| |
| permute = [] |
| for i in torch.randperm(len(tensors2)): |
| permute.append(tensors2[i]) |
| assert_change(0) |
| yield |
| |
| del tensors2 |
| assert_change(0) |
| yield |
| tensors2 = permute |
| assert_change(0) |
| yield |
| del permute |
| assert_change(0, reset_peak=True) |
| yield |
| |
| for i in range(int(N / 2)): |
| x = tensors2[i].numel() |
| del tensors2[i] |
| assert_change(-x)  # passing -x (instead of -1) handles the case where tensors2[i] is empty |
| yield |
| |
| for i in range(2, int(2 * N / 3) + 2): |
| tensors2.append(alloc(i, i * 3, i * 8)) |
| assert_change(1) |
| yield |
| |
| del tensors2 |
| assert_change(-1, reset_peak=True) |
| assert_change(0) |
| self.assertEqual(torch.cuda.memory_allocated(device), m1) |
| yield True |
| |
| del tensors1 |
| assert_change(-1, reset_peak=True) |
| self.assertEqual(torch.cuda.memory_allocated(device), m0) |
| |
| # test empty_cache and reset_peak |
| assert_change(0, empty_cache=True) |
| assert_change(0, reset_peak=True) |
| |
| def test_cudart_register(self): |
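| # cudaHostRegister page-locks an existing host allocation in place, so the |
| # tensor should report is_pinned() until cudaHostUnregister is called. |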
| t = torch.ones(20) |
| self.assertFalse(t.is_pinned()) |
| cudart = torch.cuda.cudart() |
| r = cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0) |
| self.assertEqual(r, 0) |
| self.assertTrue(t.is_pinned()) |
| r = cudart.cudaHostUnregister(t.data_ptr()) |
| self.assertEqual(r, 0) |
| self.assertFalse(t.is_pinned()) |
| |
| def test_memory_stats(self): |
| gc.collect() |
| torch.cuda.empty_cache() |
| for _ in self._test_memory_stats_generator(self): |
| self._check_memory_stat_consistency() |
| |
| def test_memory_allocation(self): |
| gc.collect() |
| torch.cuda.empty_cache() |
| mem = None |
| size = 1 |
| prev = 0 |
| try: |
| prev = torch.cuda.memory_allocated() |
| mem = torch.cuda.caching_allocator_alloc(size) |
| self.assertGreater(torch.cuda.memory_allocated(), prev) |
| finally: |
| if mem is not None: |
| torch.cuda.caching_allocator_delete(mem) |
| self.assertEqual(torch.cuda.memory_allocated(), prev) |
| |
| def test_check_error(self): |
| # Assert this call doesn't raise. |
| torch.cuda.check_error(0) |
| |
| with self.assertRaisesRegex(torch.cuda.CudaError, |
| "out of memory|hipErrorOutOfMemory"): |
| torch.cuda.check_error(2) |
| |
| def test_cuda_get_device_name(self): |
| # Testing the behaviour with None as an argument |
| current_device = torch.cuda.current_device() |
| current_device_name = torch.cuda.get_device_name(current_device) |
| device_name_None = torch.cuda.get_device_name(None) |
| self.assertEqual(current_device_name, device_name_None) |
| |
| # Testing the behaviour with no argument |
| device_name_no_argument = torch.cuda.get_device_name() |
| self.assertEqual(current_device_name, device_name_no_argument) |
| |
| def test_cuda_get_device_capability(self): |
| # Testing the behaviour with None as an argument |
| current_device = torch.cuda.current_device() |
| current_device_capability = torch.cuda.get_device_capability(current_device) |
| device_capability_None = torch.cuda.get_device_capability(None) |
| self.assertEqual(current_device_capability, device_capability_None) |
| |
| # Testing the behaviour with no argument |
| device_capability_no_argument = torch.cuda.get_device_capability() |
| self.assertEqual(current_device_capability, device_capability_no_argument) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_memory_stats_multigpu(self): |
| # advance a generator, tracking an end flag |
| def advance(gen, end): |
| if not end: |
| try: |
| next(gen) |
| except StopIteration: |
| end = True |
| return end |
| |
| # interlace |
| torch.cuda.empty_cache() |
| gen0 = self._test_memory_stats_generator(self, device='cuda:0', N=35) |
| gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35) |
| end0 = end1 = False |
| while not (end0 and end1): |
| end0 = advance(gen0, end0) |
| end1 = advance(gen1, end1) |
| |
| # semi-random order |
| torch.cuda.empty_cache() |
| gen0 = self._test_memory_stats_generator(self, device=0, N=35) |
| gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35) |
| end0 = end1 = False |
| |
| while not (end0 and end1): |
| end0 = advance(gen0, end0) |
| if not end0: |
| gen1_max_times = torch.LongTensor(1).random_(0, 3)[0] |
| else: |
| gen1_max_times = inf |
| t = 0 |
| while t < gen1_max_times and not end1: |
| end1 = advance(gen1, end1) |
| t += 1 |
| |
| def test_out_of_memory(self): |
| tensor = torch.zeros(1024, device='cuda') |
| |
| with self.assertRaisesRegex(RuntimeError, "Tried to allocate 800000000.00 GiB"): |
| torch.empty(1024 * 1024 * 1024 * 800000000, dtype=torch.int8, device='cuda') |
| |
| with self.assertRaisesRegex(RuntimeError, "Tried to allocate more than 1EB memory"): |
| torch.empty(1024 * 1024 * 1024 * 8000000000, dtype=torch.int8, device='cuda') |
| |
| # ensure out of memory error doesn't disturb subsequent kernel |
| tensor.fill_(1) |
| self.assertTrue((tensor == 1).all()) |
| |
| def test_set_per_process_memory_fraction(self): |
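| # set_per_process_memory_fraction caps the caching allocator at |
| # fraction * total device memory; allocations beyond that cap raise an |
| # out-of-memory error even if the device still has free memory. |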
| # test invalid fraction value. |
| with self.assertRaisesRegex(TypeError, "Invalid type"): |
| torch.cuda.set_per_process_memory_fraction(int(1)) |
| with self.assertRaisesRegex(ValueError, "Invalid fraction value"): |
| torch.cuda.set_per_process_memory_fraction(-0.1) |
| with self.assertRaisesRegex(ValueError, "Invalid fraction value"): |
| torch.cuda.set_per_process_memory_fraction(2.0) |
| |
| tensor = torch.zeros(1024, device='cuda') |
| torch.cuda.empty_cache() |
| total_memory = torch.cuda.get_device_properties(0).total_memory |
| torch.cuda.set_per_process_memory_fraction(0.5, 0) |
| |
| # test that an allocation just under the 0.5 fraction is ok. |
| application = int(total_memory * 0.499) - torch.cuda.max_memory_reserved() |
| tmp_tensor = torch.empty(application, dtype=torch.int8, device='cuda') |
| del tmp_tensor |
| torch.cuda.empty_cache() |
| |
| application = int(total_memory * 0.5) |
| # allocating more than half of the total memory should raise an OOM error. |
| with self.assertRaisesRegex(RuntimeError, "out of memory"): |
| torch.empty(application, dtype=torch.int8, device='cuda') |
| |
| # ensure out of memory error doesn't disturb subsequent kernel |
| tensor.fill_(1) |
| self.assertTrue((tensor == 1).all()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_autogpu(self): |
| x = torch.randn(5, 5).cuda() |
| y = torch.randn(5, 5).cuda() |
| self.assertEqual(x.get_device(), 0) |
| self.assertEqual(x.get_device(), 0) |
| with torch.cuda.device(1): |
| z = torch.randn(5, 5).cuda() |
| self.assertEqual(z.get_device(), 1) |
| q = x.add(y) |
| self.assertEqual(q.get_device(), 0) |
| w = torch.randn(5, 5).cuda() |
| self.assertEqual(w.get_device(), 1) |
| self.assertEqual(y.cuda().get_device(), 1) |
| z = z.cuda() |
| self.assertEqual(z.get_device(), 0) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_new(self): |
| x = torch.randn(3, 3).cuda() |
| self.assertEqual(x.new([0, 1, 2]).get_device(), 0) |
| self.assertEqual(x.new([0, 1, 2], device=1).get_device(), 1) |
| |
| with torch.cuda.device(1): |
| self.assertEqual(x.new([0, 1, 2]).get_device(), 0) |
| self.assertEqual(x.new([0, 1, 2], device=1).get_device(), 1) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_copy_device(self): |
| x = torch.randn(5, 5).cuda() |
| with torch.cuda.device(1): |
| y = x.cuda() |
| self.assertEqual(y.get_device(), 1) |
| self.assertIs(y.cuda(), y) |
| z = y.cuda(0) |
| self.assertEqual(z.get_device(), 0) |
| self.assertIs(z.cuda(0), z) |
| |
| x = torch.randn(5, 5) |
| with torch.cuda.device(1): |
| y = x.cuda() |
| self.assertEqual(y.get_device(), 1) |
| self.assertIs(y.cuda(), y) |
| z = y.cuda(0) |
| self.assertEqual(z.get_device(), 0) |
| self.assertIs(z.cuda(0), z) |
| |
| def _test_copy_sync_current_stream(self, x, y): |
| x_plus_one = x + 1 |
| s0 = torch.cuda.Stream(device=x.device) |
| s1 = torch.cuda.Stream(device=y.device) |
| s2 = torch.cuda.Stream(device=x.device) |
| s3 = torch.cuda.Stream(device=y.device) |
| |
| # same dst stream different src streams |
| with torch.cuda.stream(s0): |
| torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES) |
| with torch.cuda.stream(s1): |
| y.copy_(x_plus_one) |
| |
| with torch.cuda.stream(s2), torch.cuda.stream(s1): |
| y.copy_(x) |
| |
| s1.synchronize() |
| # The copy() is synchronized on the current streams of both src and dst. |
| # In the above test, the _sleep() op on s0 will not block the copy() on |
| # s2, but both copies are synchronized on s1 in the dst device. Hence, |
| # x is copied to y after x_plus_one is copied to y. If x and y are on |
| # the same device, both copy() ops are synchronized on s1. |
| self.assertEqual(y, x) |
| |
| # same src stream different dst streams |
| with torch.cuda.stream(s1): |
| torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES) |
| with torch.cuda.stream(s0): |
| y.copy_(x_plus_one) |
| |
| with torch.cuda.stream(s3), torch.cuda.stream(s0): |
| y.copy_(x) |
| |
| s0.synchronize() |
| # Similarly, both copy() ops are synchronized on s0. |
| self.assertEqual(y, x) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_copy_streams(self): |
| d0 = torch.device('cuda:0') |
| x0 = torch.zeros(5, 5, device=d0) |
| |
| d1 = torch.device('cuda:1') |
| x1 = torch.zeros(5, 5, device=d1) |
| self._test_copy_sync_current_stream(x0, x1) |
| |
| x2 = torch.zeros(5, 5, device=d0) |
| self._test_copy_sync_current_stream(x0, x2) |
| |
| def test_copy_non_blocking(self): |
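| # A non-blocking copy between pinned CPU memory and CUDA memory is |
| # asynchronous with respect to the host, so an event is recorded and |
| # synchronized to ensure the copy has finished before comparing. |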
| def _test_copy_non_blocking(a, b): |
| event = torch.cuda.Event() |
| a.copy_(b, non_blocking=True) |
| event.record() |
| event.synchronize() |
| self.assertEqual(a, b) |
| |
| # 10MB copies |
| x = torch.ones(10000000, dtype=torch.uint8).cuda() |
| y = torch.zeros(10000000, dtype=torch.uint8).pin_memory() |
| _test_copy_non_blocking(x, y) |
| |
| x = torch.zeros(10000000, dtype=torch.uint8).pin_memory() |
| y = torch.ones(10000000, dtype=torch.uint8).cuda() |
| _test_copy_non_blocking(x, y) |
| |
| # Test the case where the pinned data_ptr is not equal to the storage data_ptr. |
| x_base = torch.zeros(10000000, dtype=torch.uint8).pin_memory() |
| x = x_base[1:] |
| self.assertTrue(x.is_pinned()) |
| self.assertTrue(x_base.is_pinned()) |
| self.assertNotEqual(x_base.data_ptr(), x.data_ptr()) |
| self.assertEqual(x_base.storage().data_ptr(), x.storage().data_ptr()) |
| y = torch.ones(10000000 - 1, dtype=torch.uint8).cuda() |
| _test_copy_non_blocking(x, y) |
| |
| |
| def test_to_non_blocking(self): |
| stream = torch.cuda.current_stream() |
| |
| def _test_to_non_blocking(a, non_blocking, dst): |
| torch.cuda.synchronize() |
| # Pushes a 0.1 second spin onto the stream so that, if the copy is |
| # non-blocking, the stream will almost surely still be active when we query(). |
| torch.cuda._sleep(int(100 * get_cycles_per_ms())) |
| b = a.to(device=dst, non_blocking=non_blocking) |
| self.assertEqual(stream.query(), not non_blocking) |
| stream.synchronize() |
| self.assertEqual(a, b) |
| self.assertTrue(b.is_pinned() == (non_blocking and dst == "cpu")) |
| |
| for dst, try_non_blocking in product(("cuda", "cpu"), (True, False)): |
| # Creates source on the opposite device from destination. |
| src = torch.randn(1000000, |
| device="cuda" if dst == "cpu" else "cpu", |
| pin_memory=True if dst == "cuda" else False) |
| _test_to_non_blocking(src, try_non_blocking, dst) |
| |
| def test_to_cpu_blocking_by_default(self): |
| src = torch.randn(1000000, device="cuda") |
| torch.cuda.synchronize() |
| torch.cuda._sleep(int(100 * get_cycles_per_ms())) |
| dst = src.to(device="cpu") |
| self.assertEqual(torch.cuda.current_stream().query(), True) |
| self.assertEqual(src, dst) |
| self.assertFalse(dst.is_pinned()) |
| |
| def test_serialization_array_with_storage(self): |
| x = torch.randn(5, 5).cuda() |
| y = torch.IntTensor(2, 5).fill_(0).cuda() |
| q = [x, y, x, y.storage()] |
| with tempfile.NamedTemporaryFile() as f: |
| torch.save(q, f) |
| f.seek(0) |
| q_copy = torch.load(f) |
| self.assertEqual(q_copy, q, atol=0, rtol=0) |
| q_copy[0].fill_(5) |
| self.assertEqual(q_copy[0], q_copy[2], atol=0, rtol=0) |
| self.assertTrue(isinstance(q_copy[0], torch.cuda.FloatTensor)) |
| self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor)) |
| self.assertTrue(isinstance(q_copy[2], torch.cuda.FloatTensor)) |
| self.assertTrue(isinstance(q_copy[3], torch.storage.TypedStorage)) |
| self.assertTrue(isinstance(q_copy[3]._storage, torch.UntypedStorage)) |
| q_copy[1].fill_(10) |
| self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10)) |
| |
| def test_cublas_allow_tf32_get_set(self): |
| skip_tf32_cublas = 'TORCH_ALLOW_TF32_CUBLAS_OVERRIDE' in os.environ and\ |
| int(os.environ['TORCH_ALLOW_TF32_CUBLAS_OVERRIDE']) |
| if skip_tf32_cublas: |
| self.assertTrue(torch.backends.cuda.matmul.allow_tf32) |
| return |
| |
| orig = torch.backends.cuda.matmul.allow_tf32 |
| self.assertEqual(torch._C._get_cublas_allow_tf32(), orig) |
| torch.backends.cuda.matmul.allow_tf32 = not orig |
| self.assertEqual(torch._C._get_cublas_allow_tf32(), not orig) |
| torch.backends.cuda.matmul.allow_tf32 = orig |
| |
| def test_float32_matmul_precision_get_set(self): |
| self.assertEqual(torch.get_float32_matmul_precision(), 'highest') |
| skip_tf32_cublas = 'TORCH_ALLOW_TF32_CUBLAS_OVERRIDE' in os.environ and\ |
| int(os.environ['TORCH_ALLOW_TF32_CUBLAS_OVERRIDE']) |
| if not skip_tf32_cublas: |
| self.assertFalse(torch.backends.cuda.matmul.allow_tf32) |
| for p in ('medium', 'high'): |
| torch.set_float32_matmul_precision(p) |
| self.assertEqual(torch.get_float32_matmul_precision(), p) |
| if not skip_tf32_cublas: |
| self.assertTrue(torch.backends.cuda.matmul.allow_tf32) |
| torch.set_float32_matmul_precision('highest') |
| self.assertEqual(torch.get_float32_matmul_precision(), 'highest') |
| if not skip_tf32_cublas: |
| self.assertFalse(torch.backends.cuda.matmul.allow_tf32) |
| |
| def test_cublas_allow_fp16_reduced_precision_reduction_get_set(self): |
| orig = torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction |
| self.assertEqual(torch._C._get_cublas_allow_fp16_reduced_precision_reduction(), orig) |
| torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = not orig |
| self.assertEqual(torch._C._get_cublas_allow_fp16_reduced_precision_reduction(), not orig) |
| torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = orig |
| |
| def test_cudnn_allow_tf32_get_set(self): |
| with torch.backends.cudnn.flags(enabled=None, benchmark=None, deterministic=None, allow_tf32=False): |
| self.assertFalse(torch.backends.cudnn.allow_tf32) |
| with torch.backends.cudnn.flags(enabled=None, benchmark=None, deterministic=None, allow_tf32=True): |
| self.assertTrue(torch.backends.cudnn.allow_tf32) |
| |
| def test_type_conversions(self): |
| x = torch.randn(5, 5) |
| self.assertIsInstance(x.float(), torch.FloatTensor) |
| self.assertIsInstance(x.cuda().double(), torch.cuda.DoubleTensor) |
| self.assertIsInstance(x.cuda().float(), torch.cuda.FloatTensor) |
| self.assertIsInstance(x.cuda().float().cpu(), torch.FloatTensor) |
| self.assertIsInstance(x.cuda().float().cpu().int(), torch.IntTensor) |
| |
| y = x.storage() |
| self.assertIsInstance(y.float(), torch.FloatStorage) |
| self.assertIsInstance(y.cuda().double(), torch.cuda.DoubleStorage) |
| self.assertIsInstance(y.cuda().float(), torch.cuda.FloatStorage) |
| self.assertIsInstance(y.cuda().float().cpu(), torch.FloatStorage) |
| self.assertIsInstance(y.cuda().float().cpu().int(), torch.IntStorage) |
| |
| @unittest.skip("was disabled due to not enough memory, but actually it always fails") |
| def test_arithmetic_large_tensor(self): |
| x = torch.empty(2**30, device='cuda') |
| |
| x.fill_(1) |
| self.assertEqual(x.sum(), 2**30) |
| |
| x += 1 |
| self.assertEqual(x.sum(), 2**31) |
| |
| x.fill_(1) |
| x -= 0.5 |
| self.assertEqual(x.sum(), 2**29) |
| |
| x.fill_(1) |
| x *= 2 |
| self.assertEqual(x.sum(), 2**31) |
| |
| x.fill_(1) |
| x /= 2 |
| self.assertEqual(x.sum(), 2**29) |
| |
| def test_gather_bool(self): |
| t = torch.tensor([[False, True], [True, True]], device='cuda') |
| self.assertEqual(torch.gather(t, 1, torch.tensor([[0, 0], [1, 0]], device='cuda')), |
| torch.tensor([[False, False], [True, True]], device='cuda')) |
| |
| def test_torch_manual_seed_seeds_cuda_devices(self): |
| with freeze_rng_state(): |
| x = torch.zeros(4, 4).float().cuda() |
| torch.manual_seed(2) |
| self.assertEqual(torch.cuda.initial_seed(), 2) |
| x.uniform_() |
| torch.manual_seed(2) |
| y = x.clone().uniform_() |
| self.assertEqual(x, y) |
| self.assertEqual(torch.cuda.initial_seed(), 2) |
| |
| def test_manual_seed(self): |
| with freeze_rng_state(): |
| x = torch.zeros(4, 4).float().cuda() |
| torch.cuda.manual_seed(2) |
| self.assertEqual(torch.cuda.initial_seed(), 2) |
| x.uniform_() |
| a = torch.bernoulli(torch.full_like(x, 0.5)) |
| torch.cuda.manual_seed(2) |
| y = x.clone().uniform_() |
| b = torch.bernoulli(torch.full_like(x, 0.5)) |
| self.assertEqual(x, y) |
| self.assertEqual(a, b) |
| self.assertEqual(torch.cuda.initial_seed(), 2) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_cat_autogpu(self): |
| x = torch.randn(4, 4).cuda(1) |
| y = torch.randn(4, 4).cuda(1) |
| z = torch.cat([x, y], 0) |
| self.assertEqual(z.get_device(), x.get_device()) |
| |
| @unittest.skipIf(torch.cuda.device_count() >= 10, "Loading a cuda:9 tensor") |
| def test_load_nonexistent_device(self): |
| # Setup: create a serialized file object with a 'cuda:9' restore location |
| tensor = torch.randn(2, device='cuda') |
| buf = io.BytesIO() |
| torch.save(tensor, buf) |
| # NB: this might not work in the future if serialization changes |
| buf = io.BytesIO(buf.getvalue().replace(b'cuda:0', b'cuda:9')) |
| |
| msg = r'Attempting to deserialize object on CUDA device 9' |
| with self.assertRaisesRegex(RuntimeError, msg): |
| _ = torch.load(buf) |
| |
| def test_specify_improper_device_name(self): |
| fname = "tempfile.pt" |
| try: |
| with self.assertRaisesRegex(RuntimeError, "Invalid device string"): |
| torch.save([torch.nn.Parameter(torch.randn(10, 10))], fname, |
| _use_new_zipfile_serialization=True) |
| torch.load(fname, 'cuda0') |
| finally: |
| if os.path.exists(fname): |
| os.remove(fname) |
| |
| def test_get_device_index(self): |
| from torch.cuda._utils import _get_device_index |
| with self.assertRaisesRegex(RuntimeError, "Invalid device string"): |
| _get_device_index('cuda0', optional=True) |
| |
| with self.assertRaisesRegex(ValueError, "Expected a cuda device"): |
| cpu_device = torch.device('cpu') |
| _get_device_index(cpu_device, optional=True) |
| |
| def test_serialization_array_with_empty(self): |
| x = [torch.randn(4, 4).cuda(), torch.cuda.FloatTensor()] |
| with tempfile.NamedTemporaryFile() as f: |
| torch.save(x, f) |
| f.seek(0) |
| x_copy = torch.load(f) |
| for original, copy in zip(x, x_copy): |
| self.assertEqual(copy, original) |
| self.assertIs(type(copy), type(original)) |
| self.assertEqual(copy.get_device(), original.get_device()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_multigpu_serialization_remap(self): |
| x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)] |
| |
| def gpu_remap(storage, location): |
| if location == 'cuda:1': |
| return storage.cuda(0) |
| |
| with tempfile.NamedTemporaryFile() as f: |
| torch.save(x, f) |
| f.seek(0) |
| x_copy = torch.load(f, map_location=gpu_remap) |
| |
| for original, copy in zip(x, x_copy): |
| self.assertEqual(copy, original) |
| self.assertIs(type(copy), type(original)) |
| self.assertEqual(copy.get_device(), 0) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_multigpu_serialization_remap_dict(self): |
| x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)] |
| with tempfile.NamedTemporaryFile() as f: |
| torch.save(x, f) |
| f.seek(0) |
| x_copy = torch.load(f, map_location={'cuda:1': 'cuda:0'}) |
| for original, copy in zip(x, x_copy): |
| self.assertEqual(copy, original) |
| self.assertIs(type(copy), type(original)) |
| self.assertEqual(copy.get_device(), 0) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_multigpu_storage_clone(self): |
| x = torch.randn(4, 4, device='cuda:1').storage() |
| y = x.clone() |
| self.assertEqual(x.get_device(), y.get_device()) |
| for t in ['byte', 'char', 'short', 'int', 'long', 'half', 'double']: |
| self.assertEqual(getattr(x, t)().get_device(), x.get_device()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_cuda_set_device(self): |
| x = torch.randn(5, 5) |
| with torch.cuda.device(1): |
| self.assertEqual(x.cuda().get_device(), 1) |
| torch.cuda.set_device(0) |
| self.assertEqual(x.cuda().get_device(), 0) |
| with torch.cuda.device(1): |
| self.assertEqual(x.cuda().get_device(), 1) |
| self.assertEqual(x.cuda().get_device(), 0) |
| torch.cuda.set_device(1) |
| self.assertEqual(x.cuda().get_device(), 0) |
| |
| def test_cuda_synchronize(self): |
| torch.cuda.synchronize() |
| torch.cuda.synchronize('cuda') |
| torch.cuda.synchronize('cuda:0') |
| torch.cuda.synchronize(0) |
| torch.cuda.synchronize(torch.device('cuda:0')) |
| |
| if TEST_MULTIGPU: |
| torch.cuda.synchronize('cuda:1') |
| torch.cuda.synchronize(1) |
| torch.cuda.synchronize(torch.device('cuda:1')) |
| |
| with self.assertRaisesRegex(ValueError, "Expected a cuda device, but"): |
| torch.cuda.synchronize(torch.device("cpu")) |
| |
| with self.assertRaisesRegex(ValueError, "Expected a cuda device, but"): |
| torch.cuda.synchronize("cpu") |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_current_stream(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| |
| s0 = torch.cuda.current_stream() |
| s1 = torch.cuda.current_stream(device=1) |
| s2 = torch.cuda.current_stream(device=0) |
| |
| self.assertEqual(d0, s0.device) |
| self.assertEqual(d1, s1.device) |
| self.assertEqual(d0, s2.device) |
| self.assertEqual(s0, s2) |
| |
| with torch.cuda.device(d1): |
| s0 = torch.cuda.current_stream() |
| s1 = torch.cuda.current_stream(1) |
| s2 = torch.cuda.current_stream(d0) |
| |
| self.assertEqual(d1, s0.device) |
| self.assertEqual(d1, s1.device) |
| self.assertEqual(d0, s2.device) |
| self.assertEqual(s0, s1) |
| |
| with self.assertRaisesRegex(ValueError, |
| "Expected a cuda device, but got: cpu"): |
| torch.cuda.current_stream(torch.device('cpu')) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| @skipCUDANonDefaultStreamIf(True) |
| def test_default_stream(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.default_stream() |
| |
| with torch.cuda.device(d1): |
| s1 = torch.cuda.default_stream() |
| |
| s2 = torch.cuda.default_stream(device=0) |
| s3 = torch.cuda.default_stream(d1) |
| |
| self.assertEqual(d0, s0.device) |
| self.assertEqual(d1, s1.device) |
| self.assertEqual(d0, s2.device) |
| self.assertEqual(d1, s3.device) |
| self.assertEqual(s0, s2) |
| self.assertEqual(s1, s3) |
| |
| with torch.cuda.device(d0): |
| self.assertEqual(torch.cuda.current_stream(), s0) |
| |
| with torch.cuda.device(d1): |
| self.assertEqual(torch.cuda.current_stream(), s1) |
| |
| with self.assertRaisesRegex(ValueError, |
| "Expected a cuda device, but got: cpu"): |
| torch.cuda.default_stream(torch.device('cpu')) |
| |
| @skipCUDANonDefaultStreamIf(True) |
| def test_streams(self): |
| default_stream = torch.cuda.current_stream() |
| user_stream = torch.cuda.Stream() |
| self.assertEqual(torch.cuda.current_stream(), default_stream) |
| self.assertNotEqual(default_stream, user_stream) |
| self.assertEqual(default_stream.cuda_stream, 0) |
| self.assertNotEqual(user_stream.cuda_stream, 0) |
| with torch.cuda.stream(user_stream): |
| self.assertEqual(torch.cuda.current_stream(), user_stream) |
| self.assertTrue(user_stream.query()) |
| tensor1 = torch.ByteTensor(5).pin_memory() |
| tensor2 = tensor1.cuda(non_blocking=True) + 1 |
| default_stream.synchronize() |
| self.assertTrue(default_stream.query()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_stream_event_device(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| e0 = torch.cuda.Event() |
| |
| self.assertEqual(None, e0.device) |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| s0.record_event(e0) |
| |
| with torch.cuda.device(d1): |
| s1 = torch.cuda.Stream() |
| e1 = s1.record_event() |
| |
| self.assertEqual(s0.device, torch.device('cuda:0')) |
| self.assertEqual(e0.device, torch.device('cuda:0')) |
| self.assertEqual(s1.device, torch.device('cuda:1')) |
| self.assertEqual(e1.device, torch.device('cuda:1')) |
| |
| def test_stream_event_repr(self): |
| s = torch.cuda.current_stream() |
| self.assertTrue("torch.cuda.Stream" in s.__repr__()) |
| e = torch.cuda.Event() |
| self.assertTrue("torch.cuda.Event" in e.__repr__()) |
| s.record_event(e) |
| self.assertTrue("torch.cuda.Event" in e.__repr__()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_stream_context(self): |
| s0 = torch.cuda.current_stream() |
| s1 = torch.cuda.Stream(device=1) |
| s2 = torch.cuda.Stream(device=0) |
| |
| with torch.cuda.device(s1.device): |
| prev_stream_on_cuda1 = torch.cuda.current_stream() |
| |
| self.assertEqual(torch.cuda.current_stream(), s0) |
| self.assertEqual(0, torch.cuda.current_device()) |
| with torch.cuda.stream(s1): |
| self.assertEqual(torch.cuda.current_stream(), s1) |
| self.assertEqual(1, torch.cuda.current_device()) |
| with torch.cuda.stream(s2): |
| self.assertEqual(torch.cuda.current_stream(), s2) |
| self.assertEqual(0, torch.cuda.current_device()) |
| with torch.cuda.stream(s0): |
| self.assertEqual(torch.cuda.current_stream(), s0) |
| self.assertEqual(0, torch.cuda.current_device()) |
| self.assertEqual(torch.cuda.current_stream(), s2) |
| self.assertEqual(0, torch.cuda.current_device()) |
| self.assertEqual(torch.cuda.current_stream(), s1) |
| self.assertEqual(1, torch.cuda.current_device()) |
| |
| with torch.cuda.device(s1.device): |
| self.assertEqual(prev_stream_on_cuda1, torch.cuda.current_stream()) |
| |
| self.assertEqual(torch.cuda.current_stream(), s0) |
| self.assertEqual(0, torch.cuda.current_device()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_streams_multi_gpu(self): |
| default_stream = torch.cuda.current_stream() |
| self.assertEqual(default_stream.device, torch.device('cuda:0')) |
| stream = torch.cuda.Stream(device=1) |
| self.assertEqual(stream.device, torch.device('cuda:1')) |
| with torch.cuda.device(1): |
| self.assertEqual( |
| torch.cuda.current_stream().device, torch.device('cuda:1')) |
| self.assertNotEqual(torch.cuda.current_stream(), default_stream) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_streams_multi_gpu_query(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| torch.cuda.synchronize(d0) |
| torch.cuda.synchronize(d1) |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| |
| with torch.cuda.device(d1): |
| s1 = torch.cuda.current_stream() |
| torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES) |
| |
| self.assertTrue(s0.query()) |
| self.assertFalse(s1.query()) |
| |
| with torch.cuda.device(d0): |
| self.assertTrue(s0.query()) |
| self.assertFalse(s1.query()) |
| |
| with torch.cuda.device(d1): |
| self.assertTrue(s0.query()) |
| self.assertFalse(s1.query()) |
| |
| # deliberately using a different device |
| with torch.cuda.device(d0): |
| s1.synchronize() |
| |
| self.assertTrue(s0.query()) |
| self.assertTrue(s1.query()) |
| |
| with torch.cuda.device(d0): |
| self.assertTrue(s0.query()) |
| self.assertTrue(s1.query()) |
| |
| with torch.cuda.device(d1): |
| self.assertTrue(s0.query()) |
| self.assertTrue(s1.query()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_streams_multi_gpu_eq(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| s1 = torch.cuda.current_stream() |
| |
| with torch.cuda.device(d1): |
| s2 = torch.cuda.current_stream() |
| s3 = torch.cuda.current_stream() |
| |
| self.assertTrue(s0 == s0) |
| self.assertTrue(s0 == s1) |
| self.assertTrue(s2 == s2) |
| self.assertTrue(s2 == s3) |
| self.assertFalse(s0 == s2) |
| self.assertFalse(s1 == s3) |
| |
| self.assertEqual(s0.device, s1.device) |
| self.assertEqual(s0.cuda_stream, s1.cuda_stream) |
| self.assertEqual(s2.device, s3.device) |
| self.assertEqual(s2.cuda_stream, s3.cuda_stream) |
| self.assertNotEqual(s0.device, s3.device) |
| |
| self.assertEqual(hash(s0), hash(s1)) |
| self.assertEqual(hash(s2), hash(s3)) |
| self.assertNotEqual(hash(s0), hash(s3)) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") |
| def test_streams_priority(self): |
| low, high = torch.cuda.Stream.priority_range() |
| s0 = torch.cuda.Stream(device=0, priority=low) |
| |
| self.assertEqual(low, s0.priority) |
| self.assertEqual(torch.device('cuda:0'), s0.device) |
| |
| s1 = torch.cuda.Stream(device=1, priority=high) |
| |
| self.assertEqual(high, s1.priority) |
| self.assertEqual(torch.device('cuda:1'), s1.device) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") |
| def test_tensor_device(self): |
| self.assertEqual(torch.cuda.FloatTensor(1).get_device(), 0) |
| self.assertEqual(torch.cuda.FloatTensor(1, device=1).get_device(), 1) |
| with torch.cuda.device(1): |
| self.assertEqual(torch.cuda.FloatTensor(1).get_device(), 1) |
| self.assertEqual(torch.cuda.FloatTensor(1, device=0).get_device(), 0) |
| self.assertEqual(torch.cuda.FloatTensor(1, device=None).get_device(), 1) |
| |
| def test_events(self): |
| stream = torch.cuda.current_stream() |
| event = torch.cuda.Event(enable_timing=True) |
| self.assertTrue(event.query()) |
| start_event = torch.cuda.Event(enable_timing=True) |
| stream.record_event(start_event) |
| torch.cuda._sleep(int(50 * get_cycles_per_ms())) |
| stream.record_event(event) |
| self.assertFalse(event.query()) |
| event.synchronize() |
| self.assertTrue(event.query()) |
| self.assertGreater(start_event.elapsed_time(event), 0) |
| |
| @staticmethod |
| def _stream_synchronize(self, spin_time_cycles): |
| s = torch.cuda.current_stream() |
| e_tik = torch.cuda.Event(enable_timing=True) |
| e_tok = torch.cuda.Event(enable_timing=True) |
| |
| e_tik.record(s) |
| torch.cuda._sleep(spin_time_cycles) |
| e_tok.record(s) |
| s.synchronize() |
| |
| self.assertTrue(s.query()) |
| |
| # not necessary to check e_tik and e_tok, as elapsed_time would throw |
| # an exception otherwise. |
| return e_tik.elapsed_time(e_tok) |
| |
| @staticmethod |
| def _event_synchronize(self, spin_time_cycles): |
| s = torch.cuda.current_stream() |
| e_tik = torch.cuda.Event(enable_timing=True) |
| e_tok = torch.cuda.Event(enable_timing=True) |
| |
| e_tik.record(s) |
| torch.cuda._sleep(spin_time_cycles) |
| s.record_event(e_tok) |
| e_tok.synchronize() |
| |
| self.assertTrue(s.query()) |
| |
| # not necessary to check e_tik and e_tok, as elapsed_time would throw |
| # an exception otherwise. |
| return e_tik.elapsed_time(e_tok) |
| |
| @staticmethod |
| def _event_wait(self, spin_time_cycles): |
| s0 = torch.cuda.current_stream() |
| s1 = torch.cuda.Stream() |
| e_tik = torch.cuda.Event(blocking=True, enable_timing=True) |
| e_tok = torch.cuda.Event(blocking=True, enable_timing=True) |
| |
| e_tik.record(s0) |
| torch.cuda._sleep(spin_time_cycles - 10) |
| e_sync = torch.cuda.Event(blocking=True) |
| e_sync.record() |
| e_sync.wait(s1) |
| with torch.cuda.stream(s1): |
| torch.cuda._sleep(10) |
| s1.synchronize() |
| e_tok.record() |
| e_tok.synchronize() |
| |
| self.assertTrue(s0.query()) |
| self.assertTrue(s1.query()) |
| self.assertTrue(e_sync.query()) |
| |
| # not necessary to check e_tik and e_tok, as elapsed_time would throw |
| # an exception otherwise. |
| return e_tik.elapsed_time(e_tok) |
| |
| @staticmethod |
| def _test_stream_event_nogil(self, sync_func, p2c, c2p): |
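| # Child-thread body: signal readiness, wait for the parent's go-ahead, then |
| # run the given synchronization function on cuda:1 and report its elapsed time. |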
| with torch.cuda.device('cuda:1'): |
| c2p.put(0) |
| p2c.get() |
| c2p.put(sync_func(self, TestCuda.FIFTY_MIL_CYCLES)) |
| |
| # Skip the test for ROCm as per https://github.com/pytorch/pytorch/issues/53190 |
| @skipIfRocm |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_stream_event_nogil(self): |
| for sync_func in [TestCuda._stream_synchronize, |
| TestCuda._event_synchronize, |
| TestCuda._event_wait]: |
| p2c = queue.Queue() |
| c2p = queue.Queue() |
| e_tik = torch.cuda.Event(enable_timing=True) |
| e_tok = torch.cuda.Event(enable_timing=True) |
| |
| t = threading.Thread( |
| target=TestCuda._test_stream_event_nogil, |
| args=(self, sync_func, p2c, c2p)) |
| t.daemon = True |
| t.start() |
| |
| c2p.get() |
| with torch.cuda.device('cuda:0'): |
| e_tik.record() |
| p2c.put(0) |
| parent_time = sync_func(self, TestCuda.FIFTY_MIL_CYCLES) |
| child_time = c2p.get() |
| e_tok.record() |
| e_tok.synchronize() |
| total_time = e_tik.elapsed_time(e_tok) |
| |
| # Without the GIL, synchronizations in the parent and child threads can |
| # overlap. The total execution time should be a little longer than |
| # spinning fifty million cycles and much shorter than twice that. |
| # However, testing absolute execution time is not reliable, as it may |
| # vary across hardware and environments. Therefore, this test uses a |
| # relative comparison, checking that the sum of the parent and child |
| # thread execution times exceeds the real execution time by at least 40%. |
| self.assertGreater(parent_time + child_time, total_time * 1.4) |
| |
| # This test is flaky for ROCm, see issue #62602 |
| @skipIfRocm |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_events_wait(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| torch.cuda.synchronize(d0) |
| torch.cuda.synchronize(d1) |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES) |
| e0 = torch.cuda.Event() |
| s0.record_event(e0) |
| |
| with torch.cuda.device(d1): |
| s1 = torch.cuda.current_stream() |
| |
| self.assertFalse(s0.query()) |
| self.assertTrue(s1.query()) |
| |
| s1.wait_event(e0) |
| s1.synchronize() |
| |
| self.assertTrue(e0.query()) |
| self.assertTrue(s0.query()) |
| self.assertTrue(s1.query()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_events_multi_gpu_query(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| e0 = s0.record_event() |
| s0.synchronize() |
| |
| with torch.cuda.device(d1): |
| s1 = torch.cuda.current_stream() |
| torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES) |
| e1 = s1.record_event() |
| |
| self.assertTrue(e0.query()) |
| self.assertFalse(e1.query()) |
| |
| with torch.cuda.device(d0): |
| self.assertTrue(e0.query()) |
| self.assertFalse(e1.query()) |
| |
| with torch.cuda.device(d1): |
| self.assertTrue(e0.query()) |
| self.assertFalse(e1.query()) |
| |
| # deliberately using a different device |
| with torch.cuda.device(d0): |
| e1.synchronize() |
| |
| self.assertTrue(e0.query()) |
| self.assertTrue(e1.query()) |
| |
| with torch.cuda.device(d0): |
| self.assertTrue(e0.query()) |
| self.assertTrue(e1.query()) |
| |
| with torch.cuda.device(d1): |
| self.assertTrue(e0.query()) |
| self.assertTrue(e1.query()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| @skipIfRocm |
| def test_events_multi_gpu_elapsed_time(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| e0 = torch.cuda.Event(enable_timing=True) |
| torch.cuda._sleep(10) |
| s0.record_event(e0) |
| |
| with torch.cuda.device(d1): |
| s1 = torch.cuda.current_stream() |
| e1 = torch.cuda.Event(enable_timing=True) |
| torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES) |
| s1.record_event(e1) |
| |
| e0.synchronize() |
| e1.synchronize() |
| with torch.cuda.device(d0): |
| with self.assertRaises(RuntimeError): |
| self.assertGreater(e0.elapsed_time(e1), 0) |
| |
| with torch.cuda.device(d1): |
| with self.assertRaises(RuntimeError): |
| self.assertGreater(e0.elapsed_time(e1), 0) |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| e2 = torch.cuda.Event(enable_timing=True) |
| torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES) |
| s0.record_event(e2) |
| s0.synchronize() |
| |
| self.assertGreater(e0.elapsed_time(e2), 0) |
| |
| # deliberately calling from a different device |
| with torch.cuda.device(d1): |
| self.assertGreater(e0.elapsed_time(e2), 0) |
| |
| def test_record_stream(self): |
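| # record_stream() tells the caching allocator that a tensor is in use on the |
| # given stream, so its block must not be handed out again until all work |
| # queued on that stream at the time of the call has completed. |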
| cycles_per_ms = get_cycles_per_ms() |
| |
| t = torch.FloatTensor([1, 2, 3, 4]).pin_memory() |
| result = torch.cuda.FloatTensor(t.size()) |
| stream = torch.cuda.Stream() |
| ptr = [None] |
| |
| # Performs the CPU->GPU copy in a background stream |
| def perform_copy(): |
| with torch.cuda.stream(stream): |
| tmp = t.cuda(non_blocking=True) |
| ptr[0] = tmp.data_ptr() |
| torch.cuda.current_stream().wait_stream(stream) |
| tmp.record_stream(torch.cuda.current_stream()) |
| torch.cuda._sleep(int(50 * cycles_per_ms)) # delay the copy |
| result.copy_(tmp) |
| |
| perform_copy() |
| with torch.cuda.stream(stream): |
| tmp2 = torch.cuda.FloatTensor(t.size()) |
| tmp2.zero_() |
| self.assertNotEqual(tmp2.data_ptr(), ptr[0], msg='allocation re-used too soon') |
| |
| self.assertEqual(result.tolist(), [1, 2, 3, 4]) |
| |
| # Check that the block will be re-used after the main stream finishes |
| torch.cuda.current_stream().synchronize() |
| with torch.cuda.stream(stream): |
| tmp3 = torch.cuda.FloatTensor(t.size()) |
| self.assertEqual(tmp3.data_ptr(), ptr[0], msg='allocation not re-used') |
| |
| def test_record_stream_on_shifted_view(self): |
| # See issue #27366 |
| |
| # This test detects unexpected block reallocation. For a reliable test, |
| # the stream used to allocate tensors is isolated. The allocator will not |
| # reuse free blocks that were allocated on another stream. |
| stream_alloc = torch.cuda.Stream() |
| with torch.cuda.stream(stream_alloc): |
| base = torch.cuda.FloatTensor([10, 10]) |
| |
| # Record another stream on a shifted view tensor. |
| view = base[5:] |
| assert view.storage_offset() > 0 |
| |
| stream_record = torch.cuda.Stream() |
| with torch.cuda.stream(stream_record): |
| torch.cuda._sleep(int(50 * get_cycles_per_ms())) |
| |
| view.record_stream(stream_record) |
| |
| # Delete those tensors so that the block becomes free soon. |
| data_ptr = base.data_ptr() |
| del base, view |
| |
| # A new tensor should not be allocated to the block above. |
| stream_alloc.synchronize() |
| |
| with torch.cuda.stream(stream_alloc): |
| try_realloc = torch.cuda.FloatTensor([10, 10]) |
| |
| self.assertNotEqual(try_realloc.data_ptr(), data_ptr) |
| |
| @contextlib.contextmanager |
| def _get_external_stream(self, device): |
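| # Creates a raw CUDA stream through the CUDA runtime API and yields its |
| # handle; the stream is destroyed when the context manager exits. |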
| cudart = torch.cuda.cudart() |
| stream = ctypes.c_ulonglong(0) |
| stream_p = ctypes.POINTER(ctypes.c_void_p)(stream) |
| stream_p_int = ctypes.cast(stream_p, ctypes.c_void_p).value |
| with device: |
| try: |
| out = cudart.cudaStreamCreate(stream_p_int) |
| self.assertEqual(out, 0) |
| self.assertNotEqual(stream.value, 0) |
| yield stream.value |
| finally: |
| out = cudart.cudaStreamDestroy(stream.value) |
| self.assertEqual(out, 0) |
| |
| def test_external_streams(self): |
| device = torch.cuda.device(0) |
| with self._get_external_stream(device) as stream_v: |
| ext_stream = torch.cuda.ExternalStream(stream_v) |
| self.assertEqual(stream_v, ext_stream.cuda_stream) |
| self.assertEqual(ext_stream.device.index, device.idx) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_external_streams_multi_device(self): |
| device = torch.cuda.device(1) |
| with self._get_external_stream(device) as stream_v: |
| ext_stream = torch.cuda.ExternalStream( |
| stream_v, device=device) |
| self.assertEqual(stream_v, ext_stream.cuda_stream) |
| self.assertEqual(ext_stream.device.index, device.idx) |
| |
| def test_noncontiguous_pinned_memory(self): |
| # See issue #3266 |
| x = torch.arange(0, 10).view((2, 5)) |
| self.assertEqual(x.t(), x.t().pin_memory()) |
| |
| def test_caching_pinned_memory(self): |
| cycles_per_ms = get_cycles_per_ms() |
| |
| # check that allocations are re-used after deletion |
| t = torch.FloatTensor([1]).pin_memory() |
| ptr = t.data_ptr() |
| del t |
| t = torch.FloatTensor([1]).pin_memory() |
| self.assertEqual(t.data_ptr(), ptr, msg='allocation not reused') |
| |
| # check that the allocation is not re-used if it's in-use by a copy |
| gpu_tensor = torch.cuda.FloatTensor([0]) |
| torch.cuda._sleep(int(1000 * cycles_per_ms)) # delay the copy by 1s |
| gpu_tensor.copy_(t, non_blocking=True) |
| del t |
| t = torch.FloatTensor([1]).pin_memory() |
| self.assertNotEqual(t.data_ptr(), ptr, msg='allocation re-used too soon') |
| self.assertEqual(list(gpu_tensor), [1]) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_caching_pinned_memory_multi_gpu(self): |
| # checks that the events preventing pinned memory from being re-used |
| # too early are recorded on the correct GPU |
| cycles_per_ms = get_cycles_per_ms() |
| |
| t = torch.FloatTensor([1]).pin_memory() |
| ptr = t.data_ptr() |
| gpu_tensor0 = torch.cuda.FloatTensor([0], device=0) |
| gpu_tensor1 = torch.cuda.FloatTensor([0], device=1) |
| |
| with torch.cuda.device(1): |
| torch.cuda._sleep(int(1000 * cycles_per_ms)) # delay the copy by 1s |
| gpu_tensor1.copy_(t, non_blocking=True) |
| |
| del t |
| t = torch.FloatTensor([2]).pin_memory() |
| self.assertNotEqual(t.data_ptr(), ptr, msg='allocation re-used too soon') |
| |
| with torch.cuda.device(0): |
| gpu_tensor0.copy_(t, non_blocking=True) |
| |
| self.assertEqual(gpu_tensor1[0], 1) |
| self.assertEqual(gpu_tensor0[0], 2) |
| |
| def test_caching_allocator_record_stream_oom(self): |
| """allocations delayed by a record_stream call should still be freed on |
| an out-of-memory error in cuda_malloc_retry. See issue #19219.""" |
| stream = torch.cuda.Stream() |
| |
| with torch.cuda.stream(stream): |
| y = torch.zeros(40 * 1024 * 1024, device='cuda') |
| |
| for _ in range(100): |
| x = torch.empty(40 * 1024 * 1024, device='cuda') |
| with torch.cuda.stream(stream): |
| y += x |
| # delays re-use of `x` until after all operations in `stream` |
| x.record_stream(stream) |
| del x |
| |
| # We've made a mess by allocating up to the device capacity. Free any |
| # cached blocks in case they affect future tests. |
| torch.cuda.empty_cache() |
| |
| # Tests for historic illegal memory access, see #17040. |
| def test_reduction_gpu_memory_accessing(self): |
| x = torch.ones(512, 8, dtype=torch.float32, device='cuda') |
| torch.sum(x, 0) |
| |
| def test_sum_fp16(self): |
| x = torch.zeros(10, device='cuda', dtype=torch.float16) |
| self.assertEqual(x.sum(), 0) |
| |
| x = torch.ones(65504, device='cuda', dtype=torch.float16) |
| self.assertEqual(x.sum(), 65504) |
| self.assertEqual(x.sum(dtype=torch.float32), 65504) |
| |
| x = torch.ones(65536, device='cuda', dtype=torch.float16) |
| self.assertEqual(x.sum(dtype=torch.float32), 65536) |
| |
| a = torch.zeros(1203611).bernoulli_(0.0005) |
| x = a.to(device='cuda', dtype=torch.float16) |
| self.assertEqual(x.sum().item(), a.sum().item()) |
| |
| a = torch.zeros(100, 121, 80).bernoulli_(0.0005) |
| x = a.to(device='cuda', dtype=torch.float16) |
| self.assertEqual(x.sum((0, 2)).float().cpu(), a.sum((0, 2))) |
| |
| def test_mean_fp16(self): |
| x = torch.ones(65536, device='cuda', dtype=torch.float16) |
| self.assertEqual(x.mean(), 1) |
| |
| x = torch.ones(65536, device='cuda', dtype=torch.float16) |
| self.assertEqual(x.mean(dtype=torch.float32), 1) |
| |
| def test_prod_large(self): |
| # tests global reduction (should_global_reduce = true) in case of non-zero identity element |
| x = torch.ones(240000, device='cuda', dtype=torch.float32) |
| self.assertEqual(x.prod(), 1) |
| |
| # test for complex types. Note 240k is divisible by 4 |
| for dtype in [torch.cfloat, torch.cdouble]: |
| x = torch.ones(240000, device='cuda', dtype=dtype) * (0 + 1j) |
| self.assertEqual(x.prod(), 1) |
| |
| def test_multinomial_ext(self): |
| # Test two corner cases from older PyTorch (Issue #4858) |
| freqs = torch.cuda.FloatTensor([ |
| 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, |
| 0.03178183361887932, 0.027680952101945877, 0.033176131546497345, |
| 0.046052902936935425, 0.07742464542388916, 0.11543981730937958, |
| 0.14148041605949402, 0.15784293413162231, 0.13180233538150787, |
| 0.08271478116512299, 0.049702685326337814, 0.027557924389839172, |
| 0.018125897273421288, 0.011851548217236996, 0.010252203792333603, |
| 0.007422595750540495, 0.005372154992073774, 0.0045109698548913, |
| 0.0036087757907807827, 0.0035267581697553396, 0.0018864056328311563, |
| 0.0024605290964245796, 0.0022964938543736935, 0.0018453967059031129, |
| 0.0010662291897460818, 0.0009842115687206388, 0.00045109697384759784, |
| 0.0007791675161570311, 0.00020504408166743815, 0.00020504408166743815, |
| 0.00020504408166743815, 0.00012302644609007984, 0.0, |
| 0.00012302644609007984, 4.100881778867915e-05, 0.0, 0.0, 0.0, 0.0, |
| 0.0, 0.0]) |
| |
| torch.cuda.manual_seed(11042) |
| sample = torch.multinomial(freqs, 1000, True) |
| self.assertNotEqual(freqs[sample].min(), 0) |
| |
| p = torch.zeros(3421, 2, device="cuda", dtype=torch.float) |
| p[:, 1] = 1 |
| torch.cuda.manual_seed(5214) |
| r = torch.multinomial(p, 1) |
| self.assertNotEqual(r.min().item(), 0) |
| |
| # test corner case from Issue #13867 |
| torch.cuda.manual_seed(33) |
| probs = torch.randn(1000000, device='cuda').clamp(min=0) * 3e-5 |
| samples = probs.multinomial(1000000, replacement=True) |
| self.assertGreater(probs[samples].min().item(), 0) |
| |
| def _spawn_test_multinomial_invalid_probs_cuda(self, probs): |
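| # Runs the invalid-probability multinomial in a subprocess because a |
| # device-side assert leaves the CUDA context unusable for later tests. |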
| import subprocess |
| try: |
| p = subprocess.Popen([sys.executable, '-c', f"""\ |
| import sys |
| import torch |
| from torch._six import inf, nan |
| try: |
| with torch.random.fork_rng(devices=[0]): |
| torch.multinomial(torch.tensor({probs}).to('cuda'), 2, replacement=True) |
| torch.cuda.synchronize() |
| sys.exit(-1) # Should not be reached |
| except RuntimeError as e: |
| sys.exit(-2) |
| """], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) |
| out, err = p.communicate(timeout=10) |
| p.wait(timeout=10) |
| except subprocess.TimeoutExpired as e: |
| p.kill() |
| out, err = p.communicate() |
| expected_messages = [ |
| 'device-side assert triggered', # CUDA |
| 'Assertion', # CUDA |
| 'HSA_STATUS_ERROR_EXCEPTION', # ROCm |
| 'Device-side assertion' # ROCm |
| ] |
| self.assertTrue(any([msg in out or msg in err for msg in expected_messages])) |
| |
| @slowTest |
| @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts") |
| @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ |
| don't support multiprocessing with spawn start method") |
| def test_multinomial_invalid_probs_cuda(self): |
| self._spawn_test_multinomial_invalid_probs_cuda([1., -1., 1.]) |
| self._spawn_test_multinomial_invalid_probs_cuda([1., inf, 1.]) |
| self._spawn_test_multinomial_invalid_probs_cuda([1., -inf, 1.]) |
| self._spawn_test_multinomial_invalid_probs_cuda([1., 1., nan]) |
| |
| @slowTest |
| @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory") |
| def test_huge_index(self): |
| src = torch.empty(15000000, 45, device='cuda', dtype=torch.long).random_(0, 2**22) |
| idx = torch.randperm(src.shape[0], device='cuda') |
| res = src[idx] |
| res_cpu = src.cpu()[idx.cpu()] |
| self.assertEqual(res.cpu(), res_cpu) |
| |
| def test_min_max_inits(self): |
| # Testing if THC_reduceAll received the correct index initialization. |
| # This affects the result of THC_reduceAll operations at extreme values |
| x = torch.cuda.ByteTensor([0]) |
| y = torch.cuda.ByteTensor([255]) |
| expected = torch.cuda.LongTensor([0])[0] |
| |
| _, v = x.max(dim=0) |
| self.assertEqual(v, expected) |
| |
| _, v = y.min(dim=0) |
| self.assertEqual(v, expected) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_get_set_rng_state_all(self): |
| states = torch.cuda.get_rng_state_all() |
| before0 = torch.cuda.FloatTensor(100, device=0).normal_() |
| before1 = torch.cuda.FloatTensor(100, device=1).normal_() |
| torch.cuda.set_rng_state_all(states) |
| after0 = torch.cuda.FloatTensor(100, device=0).normal_() |
| after1 = torch.cuda.FloatTensor(100, device=1).normal_() |
| self.assertEqual(before0, after0, atol=0, rtol=0) |
| self.assertEqual(before1, after1, atol=0, rtol=0) |
| |
| def test_nvtx(self): |
| # Just making sure we can see the symbols |
| torch.cuda.nvtx.range_push("foo") |
| torch.cuda.nvtx.mark("bar") |
| torch.cuda.nvtx.range_pop() |
| range_handle = torch.cuda.nvtx.range_start("range_start") |
| torch.cuda.nvtx.range_end(range_handle) |
| |
| def test_bincount_ext(self): |
| # ensure CUDA code coverage |
| input_size = (5000,) |
| w = torch.randn(input_size, dtype=torch.double, device='cuda') |
| w_cpu = w.cpu() |
| # test shared memory impl |
| t = torch.randint(50, input_size, dtype=torch.int8, device='cuda') |
| self.assertEqual(t.cpu().bincount(), t.bincount()) |
| self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w)) |
| # test multi block memory impl |
| # see `THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM` in SummaryOps.cu |
| t = torch.randint(500, input_size, dtype=torch.int64, device='cuda') |
| self.assertEqual(t.cpu().bincount(), t.bincount()) |
| self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w)) |
| # test global memory impl |
| # see `THRESH_NUMBER_BINS_FOR_GLOBAL_MEM` in SummaryOps.cu |
| t = torch.randint(2000, input_size, dtype=torch.int64, device='cuda') |
| self.assertEqual(t.cpu().bincount(), t.bincount()) |
| self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w)) |
| |
| t = torch.zeros([10], dtype=torch.int32, device='cuda') |
| # 35488 * 65536 as int32 would cause overflow to negative value |
| # giving negative bin offset |
| t[0] = 35488 |
| counted = t.bincount(minlength=65536) |
| self.assertEqual(torch.sum(counted), 10) |
| |
| def test_tiny_half_norm_(self): |
| a = torch.arange(25).cuda().float() |
| a /= 100000000 |
| b = a.half() |
| self.assertGreater(b.norm().item(), 0) |
| |
| def test_norm_type_conversion(self): |
| a = torch.ones(65536).cuda().half() |
| self.assertEqual(a.norm(p=0, dtype=torch.float32), 65536) |
| |
| # Verifies that mem_get_info works, including when called for a different device |
| def test_mem_get_info(self): |
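        # torch.cuda.mem_get_info wraps cudaMemGetInfo and returns (free_bytes, total_bytes)
        # for the queried device, so the total should stay constant while free memory shrinks
        # after the allocation below.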
| def _test(idx): |
| before_free_bytes, before_available_bytes = torch.cuda.mem_get_info(idx) |
            # Allocate 8M float32 elements (32 MB) to force acquiring a new block
            # and overcome block-size differences across platforms.
| t = torch.randn(1024 * 1024 * 8, device='cuda:' + str(idx)) |
| after_free_bytes, after_available_bytes = torch.cuda.mem_get_info(idx) |
| |
| self.assertTrue(after_free_bytes < before_free_bytes) |
| self.assertEqual(before_available_bytes, after_available_bytes) |
| |
| _test(0) |
| if TEST_MULTIGPU: |
| _test(1) |
| |
| # Test that wrap_with_cuda_memory_check successfully detects leak |
| # skip for ROCM. Look into #62533. |
| @skipIfRocm |
| def test_cuda_memory_leak_detection(self): |
| l = [] |
| |
| @self.wrap_with_cuda_memory_check |
| def no_leak(): |
| pass |
| |
| @self.wrap_with_cuda_memory_check |
| def leak_gpu0(): |
            # Allocate 8M float32 elements (32 MB) to force acquiring a new block
            # and overcome block-size differences across platforms.
| l.append(torch.randn(1024 * 1024 * 8, device=torch.device("cuda:0"))) |
| |
| no_leak() |
| |
| with self.assertRaisesRegex(RuntimeError, r"CUDA driver API confirmed .+ on device 0.+"): |
| leak_gpu0() |
| |
| if TEST_MULTIGPU: |
| @self.wrap_with_cuda_memory_check |
| def leak_gpu1(): |
                # Allocate 8M float32 elements (32 MB) to force acquiring a new block
                # and overcome block-size differences across platforms.
| l.append(torch.randn(1024 * 1024 * 8, device=torch.device("cuda:1"))) |
| |
| with self.assertRaisesRegex(RuntimeError, r"CUDA driver API confirmed .+ on device 1.+"): |
| leak_gpu1() |
| |
| def test_cuda_memory_leak_detection_propagates_errors(self): |
| with self.assertRaisesRegex(RuntimeError, r"The size of tensor a \(3\) must match"): |
| with self.assertLeaksNoCudaTensors(): |
| x = torch.randn(3, 1, device='cuda') |
| y = torch.randn(2, 1, device='cuda') |
| z = x + y |
| |
| def test_trilu_indices(self): |
| for test_args in tri_tests_args: |
| _compare_trilu_indices(self, *test_args, device='cuda') |
| |
| # test default options |
| x = torch.ones( |
| 3, 3, dtype=torch.long, device='cuda', layout=torch.strided) |
| self.assertEqual( |
| x.tril(0).nonzero().transpose(0, 1), |
| torch.tril_indices(3, 3, device='cuda')) |
| self.assertEqual( |
| x.triu(0).nonzero().transpose(0, 1), |
| torch.triu_indices(3, 3, device='cuda')) |
| |
| def test_large_trilu_indices(self): |
| for test_args in tri_large_tests_args: |
| _compare_large_trilu_indices(self, *test_args, device='cuda') |
| |
| @unittest.skipIf(not TEST_MEDIUM_TENSOR, "not enough memory") |
| def test_cuda_kernel_loop_overflow(self): |
| # Issue #24309: In extreme cases, the loop variable could overflow and continue |
| # the kernel loop with a negative index, causing a RuntimeError (invalid write): |
| x = torch.randn(1, 1, 1, 2**30 + 1, dtype=torch.float16, device="cuda") |
| expected = x[0, 0, 0, 2**30] |
| y = torch.nn.functional.avg_pool2d(x, kernel_size=1) |
| torch.cuda.synchronize() |
| self.assertEqual(y[0, 0, 0, 2**30], expected) |
| |
| @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory") |
| def test_cuda_kernel_loop_overflow_large(self): |
| # Make sure input.numel() > INT_MAX is handled: |
| x = torch.randn(1, 1, 1, 2**31, dtype=torch.float16, device="cuda") |
| with self.assertRaisesRegex(RuntimeError, "integer out of range"): |
| y = torch.nn.functional.avg_pool2d(x, kernel_size=1) |
| |
| # Issue #24309: In extreme cases, the loop variable could overflow and continue |
| # the kernel loop with a negative index, causing a RuntimeError (invalid write): |
| x = torch.randn(1, 1, 1, 2**31 - 1, dtype=torch.float16, device="cuda") |
| expected = x[0, 0, 0, 2**31 - 2] |
| y = torch.nn.functional.avg_pool2d(x, kernel_size=1) |
| torch.cuda.synchronize() |
| self.assertEqual(y[0, 0, 0, 2**31 - 2], expected) |
| |
| # this might create a reference cycle on self... |
| def _make_multiply_in_stream(self): |
| class MultiplyInStream(torch.autograd.Function): |
| @staticmethod |
| def forward(ctx, x, val): |
| ctx.val = val |
| ctx.stream = torch.cuda.current_stream() |
| return x * val |
| |
| @staticmethod |
| def backward(ctx, grad): |
| self.assertEqual(torch.cuda.current_stream(), ctx.stream) |
                # delays the operation in the background stream
| torch.cuda._sleep(1000 * 5000) |
| return grad * ctx.val, None |
| |
| return MultiplyInStream |
| |
| @skipCUDANonDefaultStreamIf(True) |
| def test_streaming_backwards_sync(self): |
| default_stream = torch.cuda.current_stream() |
| stream = torch.cuda.Stream() |
| |
| MultiplyInStream = self._make_multiply_in_stream() |
| |
| # Tests using grads outside the backward() stream context |
| # See "Stream semantics of backward passes" on https://pytorch.org/docs/stable/notes/cuda.html |
| x = torch.randn(5, 5, device='cuda', requires_grad=True) |
| with torch.cuda.stream(stream): |
| stream.wait_stream(default_stream) |
| output = MultiplyInStream.apply(x, 2) |
| output.sum().backward() |
| # sync needed |
| default_stream.wait_stream(stream) |
| self.assertEqual(x.grad, torch.ones_like(x) * 2) |
| self.assertEqual(torch.cuda.current_stream(), default_stream) |
| |
| # Tests that using grads in the same stream context as backward() |
| # is safe regardless what streams bwd ops ran on |
| bwd_ambient_stream = torch.cuda.Stream() |
| x = torch.randn(5, 5, device='cuda', requires_grad=True) |
| with torch.cuda.stream(stream): |
| stream.wait_stream(default_stream) |
| output = MultiplyInStream.apply(x, 3) |
| with torch.cuda.stream(bwd_ambient_stream): |
| bwd_ambient_stream.wait_stream(stream) |
| output.sum().backward() |
| # x was first used on "stream" so its AccumulateGrad leaf should run on "stream". |
| # The end of backward() should have synced "bwd_ambient_stream" with "stream" |
| # so it should be safe to use x.grad here without any syncs. |
| self.assertEqual(x.grad, torch.ones_like(x) * 3) |
| self.assertEqual(torch.cuda.current_stream(), bwd_ambient_stream) |
| |
| # Skip the test for ROCm as per https://github.com/pytorch/pytorch/issues/53190 |
| @skipIfRocm |
| def test_streaming_backwards_multiple_streams(self): |
| MultiplyInStream = self._make_multiply_in_stream() |
| |
| class StreamModel(torch.nn.Module): |
| def __init__(self): |
| super(StreamModel, self).__init__() |
| self.event = torch.cuda.Event() |
| self.stream0 = torch.cuda.Stream() |
| self.stream1 = torch.cuda.Stream() |
| |
| def forward(self, x, x_first_use_on_ambient): |
| if x_first_use_on_ambient: |
| x0 = x.clone() |
| self.stream0.wait_stream(torch.cuda.current_stream()) |
| self.stream1.wait_stream(torch.cuda.current_stream()) |
| with torch.cuda.stream(self.stream0): |
| if not x_first_use_on_ambient: |
| x0 = x.clone() |
| y0 = MultiplyInStream.apply(x0, 2) |
| self.event.record(stream=torch.cuda.current_stream()) |
| |
| with torch.cuda.stream(self.stream1): |
| y1 = MultiplyInStream.apply(x, 3) |
| self.stream1.wait_event(self.event) |
| return y0 + y1 |
| |
| stream = torch.cuda.Stream() |
| |
| for x_first_use_on_ambient in (True, False): |
            # the out_of_place=False, iters=1 case stresses whether proper syncs are inserted
| # when grads are initially None and stolen by backward ops. |
| for out_of_place, iters in ((True, 1), |
| (False, 1), |
| (False, 5)): |
| with torch.cuda.stream(stream): |
| x = torch.randn(5, 5, device='cuda', requires_grad=True) |
| model = StreamModel().cuda() |
| x.register_hook(lambda grad: self.assertEqual(torch.cuda.current_stream(), |
| stream if x_first_use_on_ambient else model.stream0)) |
| for p in model.parameters(): |
| self.assertTrue(p.grad is None) |
| for i in range(iters): |
| loss = model(x, x_first_use_on_ambient).sum() |
| if out_of_place: |
| x_grad = torch.autograd.grad((loss,), (x,))[0] |
| else: |
| loss.backward() |
| # See "Stream semantics of backward passes" on https://pytorch.org/docs/stable/notes/cuda.html |
| torch.cuda.current_stream().wait_stream(stream) |
| |
| if out_of_place: |
| self.assertEqual(x_grad, torch.ones_like(x) * 5 * iters) |
| else: |
| self.assertEqual(x.grad, torch.ones_like(x) * 5 * iters) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_streaming_backwards_device_transfer(self): |
| # This function must run with non-default current streams on all devices, otherwise it's meaningless. |
| # The intention is to test that to()'s backward (CopyBackward) interacts properly with the |
| # synchronization logic in torch/csrc/autograd/input_buffer.cpp. |
| dev0 = torch.device("cuda:0") |
| dev1 = torch.device("cuda:1") |
| |
        # Unfortunately the tensors need to be fairly large:
        # bigger tensors mean longer D2D transfers, which are more likely to expose races.
| size = 2**26 |
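        # 2**26 float64 elements is 512 MB per tensor.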
| |
| a = torch.full((size,), 1, device=dev1, dtype=torch.float64, requires_grad=True) |
| b = torch.full((size,), 1, device=dev1, dtype=torch.float64, requires_grad=True) |
| |
| # Here to_backward_recipient = a*b is used only once, so MulBackward's InputBuffer slot only expects 1 input. |
| # This tests the situation where we don't call InputBuffer::accumulate for MulBackward's InputBuffer. |
| to_backward_recipient = a * b |
| s = to_backward_recipient.to(device="cuda:0").sum() |
| torch.cuda.synchronize(device=dev0) |
| torch.cuda.synchronize(device=dev1) |
| s.backward() |
| self.assertTrue(a.grad.sum().item() == size) |
| self.assertTrue(b.grad.sum().item() == size) |
| |
| # Here to_backward_recipient = a*b is used twice, so MulBackward's InputBuffer slot expects 2 inputs. |
| # This tests the situation where we do call InputBuffer::accumulate for MulBackward's InputBuffer. |
| a.grad = None |
| b.grad = None |
| to_backward_recipient = a * b |
| # Multiply by 2 here so to's backward creates gradient values that are different from the case above, |
| # to mitigate weirdness if the caching allocator happens to reuse memory regions that were populated |
| # with 1s by the case above |
| s0 = to_backward_recipient.to(device="cuda:0").sum() * 2. |
| s1 = to_backward_recipient.to(device="cuda:0").sum() * 2. |
| torch.cuda.synchronize(device=dev0) |
| torch.cuda.synchronize(device=dev1) |
| s0.backward(retain_graph=True) |
| s1.backward() |
| self.assertTrue(a.grad.sum().item() == 4 * size) |
| self.assertTrue(b.grad.sum().item() == 4 * size) |
| |
| def test_streaming_backwards_sync_graph_root(self): |
| # This function tests if bwd ops running on a side stream properly sync with the GraphRoot. |
| # The potential bug it targets is a race condition. The test uses multiple trials and |
| # torch.cuda._sleep such that if the race condition exists, the test will almost certainly fail, |
| # but there's a chance it may spuriously pass. Passing does not guarantee the backend is bug-free, |
| # but failure does guarantee there is a bug. |
| fwd_bwd_op_stream = torch.cuda.Stream() |
| bwd_ambient_stream = torch.cuda.Stream() |
| # We need these streams to be different otherwise the test is meaningless. |
| self.assertTrue(fwd_bwd_op_stream != bwd_ambient_stream) |
| |
| size = int(1e3) |
| |
| a = torch.full((size,), 2.0, device="cuda", requires_grad=True) |
| b = torch.full((size,), 3.0, device="cuda", requires_grad=True) |
| |
| # I don't think we need any manual record_streams below. |
| # a and b remain in scope for the entire test. |
| # c and grad remain in scope for each iteration, and there's a full sync between iterations. |
| for trial in range(5): |
| torch.cuda.synchronize() |
| a.grad = b.grad = None |
| with torch.cuda.stream(fwd_bwd_op_stream): |
| c = a * b |
| |
| with torch.cuda.stream(bwd_ambient_stream): |
| torch.cuda.synchronize() |
| # Long-running dummy kernel on bwd_ambient_stream delays filling of grad |
| torch.cuda._sleep(int(50 * get_cycles_per_ms())) |
| # Fills grad on bwd_ambient_stream |
| grad = torch.full((size,), float(trial + 1), device="cuda") |
| |
| # Bwd ops still run on fwd_bwd_ops_stream, so the following will likely fail if |
| # bwd ops don't sync with bwd_ambient_stream before consuming grad. |
| torch.autograd.backward(tensors=c, grad_tensors=grad) |
| |
| # See https://github.com/pytorch/pytorch/issues/47028 |
                # The assertEqual calls below run on bwd_ambient_stream, so this test may also fail
                # if backward() fails to sync with bwd_ambient_stream at the end.
                # Synchronizing here works around the issue until a proper fix can be made.
| torch.cuda.synchronize() |
| with torch.no_grad(): |
| self.assertEqual(a.grad, grad * b) |
| self.assertEqual(b.grad, grad * a) |
| |
| def test_streaming_backwards_callback(self): |
| # Tests if autograd callbacks sync properly with respect to leaf streams and |
| # the user-facing stream surrounding backward(). If it fails, first suspect is |
| # sync logic where "final_callbacks_" are called in torch/csrc/autograd/engine.cpp |
| MultiplyInStream = self._make_multiply_in_stream() |
| |
| size = int(1e3) |
| a = torch.full((size,), 1, device="cuda", dtype=torch.float, requires_grad=True) |
| b = torch.full((size,), 1, device="cuda", dtype=torch.float, requires_grad=True) |
| |
| s0 = torch.cuda.Stream() |
| s1 = torch.cuda.Stream() |
| s2 = torch.cuda.Stream() |
| |
| stash = [] |
| |
| # sets up a nontrivial structure of leaf streams |
| s0.wait_stream(torch.cuda.current_stream()) |
| with torch.cuda.stream(s0): |
| c = MultiplyInStream.apply(a, 2) |
| |
| s1.wait_stream(torch.cuda.current_stream()) |
| with torch.cuda.stream(s1): |
| d = MultiplyInStream.apply(b, 3) |
| s1.wait_stream(s0) |
| e = c * d |
| |
| def clone_leaf_grads(): |
| stash.append(a.grad.clone()) |
| stash.append(b.grad.clone()) |
| |
| # Use a hook on e to install the callback |
| e.register_hook(lambda grad: torch.autograd.Variable._execution_engine.queue_callback(clone_leaf_grads)) |
| |
| s2.wait_stream(s1) |
| with torch.cuda.stream(s2): |
| e.sum().backward() |
| # The autograd engine should sync s2 with all leaf streams then run the callback clone_leaf_grads on s2. |
| # If those things happened properly, checking the values of the cloned grads on s2 should be safe: |
| self.assertEqual(stash[0], torch.full_like(a, 6)) |
| self.assertEqual(stash[1], torch.full_like(a, 6)) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| @unittest.skipIf(IS_SANDCASTLE or IS_REMOTE_GPU, "Does not work on Sandcastle") |
| def test_cuda_init_race(self): |
| # See https://github.com/pytorch/pytorch/issues/16559 |
| import subprocess |
| subprocess.check_call([sys.executable, '-c', """\ |
| import torch |
| import threading |
| |
| def worker(rank): |
| torch.tensor([1.]).cuda(rank) |
| |
| t1 = threading.Thread(target=worker, args=(0,)) |
| t2 = threading.Thread(target=worker, args=(1,)) |
| t1.start() |
| t2.start() |
| """]) |
| |
| @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support device side asserts") |
| def test_fixed_cuda_assert_async(self): |
| with self.assertRaisesRegex(RuntimeError, "Boolean value of Tensor with no values is ambiguous"): |
| torch._assert_async(torch.tensor([], device="cuda")) |
| with self.assertRaisesRegex(RuntimeError, "Boolean value of Tensor with more than one value is ambiguous"): |
| torch._assert_async(torch.tensor([0, 0], device="cuda")) |
| |
| torch._assert_async(torch.tensor(1, device="cuda")) |
| torch._assert_async(torch.tensor(0.1, device="cuda")) |
| torch._assert_async(torch.tensor(-0.1, device="cuda")) |
| torch._assert_async(torch.tensor(True, device="cuda")) |
| torch._assert_async(torch.tensor(0 + 0.1j, device="cuda")) |
| |
| fail_stmts = [ |
| "torch._assert_async(torch.tensor(0, device='cuda'))", |
| "torch._assert_async(torch.tensor(0.0, device='cuda'))", |
| "torch._assert_async(torch.tensor(False, device='cuda'))", |
| "torch._assert_async(torch.tensor(0 + 0j, device='cuda'))", |
| ] |
| |
| import subprocess |
| for stmt in fail_stmts: |
| with self.subTest(stmt=stmt): |
| r = subprocess.call([sys.executable, '-c', f"""\ |
| import torch |
| |
| {stmt} |
| torch.cuda.synchronize() |
| """]) |
| self.assertTrue(r != 0) |
| |
| def test_grad_scaling_unscale(self, dtype=torch.float): |
| inv_scale = torch.full((1,), 0.25, dtype=torch.float, device="cuda:0") |
| found_inf = torch.full((1,), 0.0, dtype=torch.float, device="cuda:0") |
| |
| size = 10 |
| g = torch.full((size, size), 4.0, dtype=dtype, device="cuda:0") |
| ginf = g.clone() |
| ginf[2, 2] = float('inf') |
| gnan = g.clone() |
| gnan[2, 2] = float('nan') |
| |
| # Tries selected combinations of |
| # - contiguous grads |
| # - g.clone().t() which is not contiguous but still non overlapping and dense |
| # - variants of g.clone()[:, :5] which are not non overlapping and dense |
| # Non overlapping and dense grads route into a multi tensor apply kernel, |
| # others use a fallback per-tensor kernel, so we should try both. |
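        # (For example, g.clone()[:, :5] keeps stride (10, 1) but only 5 valid columns per row,
        # so its storage has gaps and it is not "non overlapping and dense".)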
| cases = ( |
| ([g.clone(), g.clone()], False), |
| ([g.clone(), g.clone().t()], False), |
| ([g.clone(), g.clone()[:, :5]], False), |
| ([g.clone()[:, :5], g.clone()[:, :5]], False), |
| ([g.clone(), ginf.clone()], True), |
| ([g.clone(), gnan.clone()], True), |
| ([g.clone(), ginf.clone()[:, :5]], True), |
| ([g.clone(), gnan.clone()[:, :5]], True), |
| ([ginf.clone(), g.clone()[:, :5]], True), |
| ([ginf.clone()[:, :5], g.clone()[:, :5]], True), |
| ) |
| |
| for grads, has_inf in cases: |
| found_inf.zero_() |
| torch._amp_foreach_non_finite_check_and_unscale_(grads, found_inf, inv_scale) |
| if has_inf: |
| self.assertEqual(found_inf, 1.0) |
| else: |
| self.assertEqual(found_inf, 0.0) |
| for grad in grads: |
| self.assertEqual(grad, torch.ones_like(grad), rtol=1e-5, atol=1e-7) |
| |
| # When passing lists with mismatched dtypes to a raw |
| # _amp_foreach_non_finite_check_and_unscale_ call, |
        # it's expected to fall back to the single-tensor TensorIterator kernel.
| grads = [g.clone(), g.to(dtype=torch.float16)] |
| torch._amp_foreach_non_finite_check_and_unscale_(grads, found_inf, inv_scale) |
| for grad in grads: |
| self.assertEqual(grad, torch.ones_like(grad), rtol=1e-5, atol=1e-7) |
| |
| # Passing lists with mismatched devices to a raw |
| # _amp_foreach_non_finite_check_and_unscale_ call should raise errors. |
| if TEST_MULTIGPU: |
| with self.assertRaisesRegex(RuntimeError, r"Expected all tensors to be on the same device"): |
| torch._amp_foreach_non_finite_check_and_unscale_([g.clone(), g.to(device="cuda:1")], |
| found_inf, |
| inv_scale) |
| |
| # Creates a list of grads with mismatched dtypes and devices, to ensure |
| # scaler._unscale_grads_ organizes grads by dtype and device before calling |
| # _amp_foreach_non_finite_check_and_unscale_ on each set. |
| # If inject_inf >= 0, writes an inf into one grad for _unscale_grads_ to find. |
| def perfect_storm_grads(inject_inf): |
| grads = [g.clone(), g.clone()[:, :5], g.to(dtype=torch.float16), g.to(dtype=torch.float16)] |
| if TEST_MULTIGPU: |
| grads += [g.to(device="cuda:1"), |
| g.to(device="cuda:1")[:, :5], |
| g.to(device="cuda:1", dtype=torch.float16), |
| g.to(device="cuda:1", dtype=torch.float16)] |
| if inject_inf >= 0: |
| grads[inject_inf][2, 2] = float('inf') |
| return grads |
| |
| scaler = torch.cuda.amp.GradScaler() |
| dummy_params = [torch.empty_like(g) for g in perfect_storm_grads(-1)] |
| dummy_opt = torch.optim.SGD(dummy_params, lr=1.) |
| |
| # Ensures the inf/nan checking can find an inf injected onto any grad in the perfect storm. |
| for inject_inf in range(-1, len(dummy_params)): |
| found_inf = torch.full((1,), 0.0, dtype=torch.float, device="cuda:0") |
| grads = perfect_storm_grads(inject_inf) |
| for i, p in enumerate(dummy_params): |
| p.grad = grads[i] |
| found_inf_per_device = scaler._unscale_grads_(dummy_opt, inv_scale, found_inf, True) |
| if inject_inf < 0: |
| # No inf was injected, ensures unscaling worked normally. |
| self.assertTrue(sum(v.item() for v in found_inf_per_device.values()) == 0) |
| for grad in grads: |
| self.assertEqual(grad, torch.ones_like(grad), rtol=1e-5, atol=1e-7) |
| else: |
| # inf was injected, ensures inf was found. |
| self.assertTrue(sum(v.item() for v in found_inf_per_device.values()) == 1) |
| |
| def test_grad_scaling_update_scale(self, device="cuda", dtype=torch.float): |
| growth = 2.0 |
| backoff = 0.25 |
| growth_interval = 2 |
| scale = torch.full((1,), 4.0, dtype=dtype, device=device) |
| growth_tracker = torch.full((1,), 0.0, dtype=torch.int32, device=device) |
| found_inf = torch.full((1,), 0.0, dtype=torch.float, device="cuda:0") |
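        # _amp_update_scale_ multiplies the scale by `growth` once growth_tracker reaches
        # growth_interval consecutive unskipped steps; a nonzero found_inf instead multiplies
        # the scale by `backoff` and resets the tracker. The asserts below exercise both paths.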
| |
| # Simulates 2 consecutive unskipped iterations |
| torch._amp_update_scale_(scale, growth_tracker, found_inf, growth, backoff, growth_interval) |
| self.assertEqual(growth_tracker, 1) |
| self.assertEqual(scale, 4.0) |
| torch._amp_update_scale_(scale, growth_tracker, found_inf, growth, backoff, growth_interval) |
| self.assertEqual(growth_tracker, 0) |
| self.assertEqual(scale, 8.0) |
| |
| # Simulates a skipped iteration |
| found_inf.fill_(1.0) |
| torch._amp_update_scale_(scale, growth_tracker, found_inf, growth, backoff, growth_interval) |
| self.assertEqual(growth_tracker, 0) |
| self.assertEqual(scale, 2.0) |
| |
| def test_grad_scaling_unscale_sparse(self, device="cuda", dtype=torch.float): |
| scaler = torch.cuda.amp.GradScaler() |
| |
| inv_scale = torch.full((1,), 0.25, dtype=dtype, device=device) |
| found_inf = torch.empty((1,), dtype=dtype, device=device) |
| cur = found_inf.device |
| |
| # As of d0c925f (4/16/20), docs are unclear about best API for sparse cuda tensor construction. |
| # https://pytorch.org/docs/master/tensors.html shows torch.sparse_coo_tensor(...), but it has no docstring. |
| # The same page shows several tensors with layout=torch.sparse_coo, but no constructors using that layout. |
| # Meanwhile, https://pytorch.org/docs/master/sparse.html shows torch.sparse.FloatTensor(...), which looks |
| # legacy and does not accept a device="cuda" kwarg. Going with torch.sparse_coo_tensor. |
| i = torch.tensor([[0, 1, 1], |
| [2, 0, 2]], device="cuda", dtype=torch.int64) |
| v = torch.tensor([16., 32., 64.], device="cuda", dtype=torch.float) |
| s = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cuda", dtype=dtype) |
| |
| p = s.clone() |
| assert p.is_sparse |
| opt = torch.optim.SGD([p], lr=1.) |
| |
| p.grad = s.clone() |
| found_inf.zero_() |
| found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf, False)[cur] |
| self.assertEqual(found_inf, 0.0) |
| self.assertEqual(p.grad.to_dense(), (s / 4).to_dense()) |
| |
| v = torch.FloatTensor([16., 32., float('inf')]) |
| p.grad = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cuda", dtype=dtype) |
| found_inf.zero_() |
| found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf, False)[cur] |
| self.assertEqual(found_inf, 1.0) |
| |
| v = torch.FloatTensor([16., 32., float('nan')]) |
| p.grad = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cuda", dtype=dtype) |
| found_inf.zero_() |
| found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf, False)[cur] |
| self.assertEqual(found_inf, 1.0) |
| |
| p = s.clone().half() |
| assert p.is_sparse |
| opt = torch.optim.SGD([p], lr=1.) |
| |
| p.grad = s.clone().half() |
| found_inf.zero_() |
| found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf, True)[cur] |
| self.assertEqual(found_inf, 0.0) |
| self.assertEqual(p.grad.to_dense(), (s.half() / 4).to_dense()) |
| |
| # Creates fp16 sparse tensor with duplicated indices (uncoalesced). The uncoalesced representation |
| # does not overflow in fp16, but the coalesced representation would, because 64000 + 64000 > fp16 max. |
| # _amp_non_finite_check_and_unscale_ should report an overflow here. |
| i = torch.LongTensor([[0, 1, 0], |
| [2, 0, 2]]) |
| v = torch.FloatTensor([64000., 32., 64000.]) |
| p.grad = torch.sparse_coo_tensor(i, v, torch.Size([2, 3]), device="cuda", dtype=torch.float16) |
| found_inf.zero_() |
| found_inf = scaler._unscale_grads_(opt, inv_scale, found_inf, True)[cur] |
| self.assertEqual(found_inf, 1.0) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_grad_scaling_device_as_key(self): |
| # Ensure that different instances of "device" objects that point to the same device |
        # are treated as identical keys by dicts. GradScaler relies on this behavior; if it broke,
        # the failure would be difficult to detect (a silent performance hit).
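        # For example, torch.device("cuda:0") constructed twice compares equal and hashes
        # identically, so both instances index the same dict slot (verified below).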
| d = {} |
| t = torch.empty((1,), device="cuda:0") |
| dev0a = torch.device("cuda:0") |
| dev0b = torch.device("cuda:0") |
| dev1a = torch.device("cuda:1") |
| dev1b = torch.device("cuda:1") |
| |
| self.assertTrue(hash(dev0a) == hash(dev0b)) |
| self.assertTrue(hash(dev1a) == hash(dev1b)) |
| |
| d[dev0a] = "0a" |
| d[dev0b] = "0b" |
| self.assertTrue(len(d) == 1) |
| self.assertTrue(d[dev0a] == "0b") |
| d[t.device] = "t" |
| self.assertTrue(len(d) == 1) |
| self.assertTrue(d[dev0a] == "t") |
| |
| d[dev1a] = "1a" |
| d[dev1b] = "1b" |
| self.assertTrue(len(d) == 2) |
| self.assertTrue(d[dev1a] == "1b") |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_grad_scaling_scale(self): |
| scaler = torch.cuda.amp.GradScaler(init_scale=2.) |
| t0 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:0") |
| t1 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:1") |
| # Create some nested iterables of tensors on different devices. |
| outputs = (t1.clone(), (t0.clone(), t1.clone()), [t0.clone(), (t1.clone(), t0.clone())]) |
| outputs = scaler.scale(outputs) |
| self.assertTrue(outputs[0] == 8.0 and outputs[1][0] == 8.0 and outputs[1][1] == 8.0 and |
| outputs[2][0] == 8.0 and outputs[2][1][0] == 8.0 and outputs[2][1][1] == 8.0) |
| self.assertTrue(scaler._scale.device == t1.device) |
| |
| def test_grad_scaling_state_dict(self): |
| for lazy_init_scale in True, False: |
| s0 = torch.cuda.amp.GradScaler(init_scale=3., growth_factor=4., backoff_factor=.5, growth_interval=2) |
| s1 = torch.cuda.amp.GradScaler(init_scale=6., growth_factor=7., backoff_factor=.8, growth_interval=1) |
| |
| # sets a random value for load_state_dict to overwrite |
| s1._init_growth_tracker = 7 |
| |
| if lazy_init_scale: |
| # Dummy scale() call to ensure the scale tensor is lazily initialized. |
| s1.scale(torch.full((1,), 4.0, dtype=torch.float32, device="cuda:0")) |
| self.assertTrue(isinstance(s1._scale, torch.cuda.FloatTensor)) |
| |
| s1.load_state_dict(s0.state_dict()) |
| |
| self.assertEqual(s1.get_scale(), 3.) |
| self.assertEqual(s1.get_growth_factor(), 4.) |
| self.assertEqual(s1.get_backoff_factor(), .5) |
| self.assertEqual(s1.get_growth_interval(), 2) |
| self.assertEqual(s1._init_growth_tracker, 0) |
| |
| def _create_scaling_models_optimizers(self, device="cuda"): |
| # Create a module+optimizer that will use scaling, and a control module+optimizer |
| # that will not use scaling, against which the scaling-enabled module+optimizer can be compared. |
| mod_control = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)).to(device=device) |
| mod_scaling = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)).to(device=device) |
| for c, s in zip(mod_control.parameters(), mod_scaling.parameters()): |
| s.data.copy_(c.data) |
| |
| opt_control = torch.optim.SGD(mod_control.parameters(), lr=1.0) |
| opt_scaling = torch.optim.SGD(mod_scaling.parameters(), lr=1.0) |
| |
| return mod_control, mod_scaling, opt_control, opt_scaling |
| |
| def _create_scaling_case(self, device="cuda", dtype=torch.float): |
| data = [(torch.randn((8, 8), dtype=dtype, device=device), torch.randn((8, 8), dtype=dtype, device=device)), |
| (torch.randn((8, 8), dtype=dtype, device=device), torch.randn((8, 8), dtype=dtype, device=device)), |
| (torch.randn((8, 8), dtype=dtype, device=device), torch.randn((8, 8), dtype=dtype, device=device)), |
| (torch.randn((8, 8), dtype=dtype, device=device), torch.randn((8, 8), dtype=dtype, device=device))] |
| |
| loss_fn = torch.nn.MSELoss().cuda() |
| |
| skip_iter = 2 |
| |
| return self._create_scaling_models_optimizers(device=device) + (data, loss_fn, skip_iter) |
| |
| # _run_scaling_case generalizes some single-optimizer test logic to avoid too much copy-pasting below. |
| def _run_scaling_case(self, run, unskipped, skipped, atol=1e-7): |
| # Ensure scaling can be disabled without changing user control flow. |
| for enabled in True, False: |
| mod_control, mod_scaling, opt_control, opt_scaling, data, loss_fn, skip_iter = self._create_scaling_case() |
| |
| # For functionality, test with a modest initial scale, and an unrealistically-large growth factor |
| # so any potential errors with the growth factor handling will be magnified. |
| scaler = torch.cuda.amp.GradScaler(init_scale=128., growth_factor=2.0, enabled=enabled, growth_interval=1) |
| |
| _ = run(data, mod_control, opt_control, scaler, loss_fn, skip_iter, False) |
| ret = run(data, mod_scaling, opt_scaling, scaler, loss_fn, skip_iter, True) |
| |
| # Allows run() to optionally return a different scaler instance. |
| scaler = ret if ret else scaler |
| |
| # If scaling was enabled, the scale factor should have been multiplied by the growth factor |
| # len(data) - skipped times and the backoff factor "skipped" times. |
| if enabled: |
| net_growth = scaler.get_growth_factor()**unskipped if unskipped > 0 else 1.0 |
| net_backoff = scaler.get_backoff_factor()**skipped if skipped > 0 else 1.0 |
| self.assertTrue(scaler.get_scale() == (128. * net_growth * net_backoff)) |
| else: |
| self.assertTrue(scaler.get_scale() == 1.0) |
| |
| for c, s in zip(mod_control.parameters(), mod_scaling.parameters()): |
| self.assertEqual(c, s, atol=atol, rtol=1e-05) |
| |
| # Compares no scaling + no autocasting against scaling + autocasting. |
| def test_grad_scaling_autocast(self): |
| try_pickle = False |
| |
| def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api): |
| for i, (input, target) in enumerate(data): |
| optimizer.zero_grad() |
| with torch.autocast('cuda', enabled=try_scaling_api): |
| output = model(input) |
| loss = loss_fn(output, target) |
| if try_scaling_api: |
| scaler.scale(loss).backward() |
| if i == skip_iter and scaler.is_enabled(): |
| model[1].weight.grad.data.fill_(float('inf')) |
| scaler.step(optimizer) |
| scaler.update() |
| if try_pickle: |
| scaler = pickle.loads(pickle.dumps(scaler)) |
| else: |
| loss.backward() |
| if (not scaler.is_enabled()) or (i != skip_iter): |
| optimizer.step() |
| return scaler |
| |
| # sets atol=1e-3 because we're comparing pure fp32 arithmetic vs a mixture of fp16 and fp32 |
| self._run_scaling_case(run, unskipped=3, skipped=1, atol=1e-3) |
| # this will be picked up by try_pickle within run(): |
| try_pickle = True |
| self._run_scaling_case(run, unskipped=3, skipped=1, atol=1e-3) |
| |
| def test_grad_scaling_clipping(self): |
| def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api): |
| max_norm = 0.2 # A reasonable value that actually has an effect, based on printouts of grads |
| for i, (input, target) in enumerate(data): |
| optimizer.zero_grad() |
| output = model(input) |
| loss = loss_fn(output, target) |
| if try_scaling_api: |
| scaler.scale(loss).backward() |
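                    # Grads are still scaled at this point (no unscale_ call), so the clipping
                    # threshold is multiplied by the current scale to clip at the intended
                    # effective norm. Compare with the separate-unscale test below.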
| torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm * scaler.get_scale()) |
| if i == skip_iter and scaler.is_enabled(): |
| model[1].weight.grad.data.fill_(float('inf')) |
| scaler.step(optimizer) |
| scaler.update() |
| else: |
| loss.backward() |
| torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) |
| if (not scaler.is_enabled()) or (i != skip_iter): |
| optimizer.step() |
| |
| self._run_scaling_case(run, unskipped=3, skipped=1, atol=1e-5) |
| |
| def test_grad_scaling_clipping_separate_unscale(self): |
| def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api): |
| max_norm = 0.2 # A reasonable value that actually has an effect, based on printouts of grads |
| for i, (input, target) in enumerate(data): |
| optimizer.zero_grad() |
| output = model(input) |
| loss = loss_fn(output, target) |
| if try_scaling_api: |
| scaler.scale(loss).backward() |
| if i == skip_iter and scaler.is_enabled(): |
| model[1].weight.grad.data.fill_(float('inf')) |
| scaler.unscale_(optimizer) |
| torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm, error_if_nonfinite=False) |
| scaler.step(optimizer) |
| scaler.update() |
| else: |
| loss.backward() |
| torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) |
| if (not scaler.is_enabled()) or (i != skip_iter): |
| optimizer.step() |
| |
| self._run_scaling_case(run, unskipped=3, skipped=1) |
| |
| @unittest.skipIf(IS_WINDOWS, 'FIXME: fix this test for Windows') |
| def test_grad_scaling_penalty(self): |
| def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api): |
| for i, (input, target) in enumerate(data): |
| optimizer.zero_grad() |
| output = model(input) |
| loss = loss_fn(output, target) |
| |
| if try_scaling_api: |
| grad_params = torch.autograd.grad(scaler.scale(loss), |
| model.parameters(), create_graph=True) |
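                    # Grads of the scaled loss carry the scale factor, so they are unscaled here
                    # before building the penalty; this keeps the penalty term comparable to the
                    # unscaled control path before the combined loss is scaled again below.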
| inv_scale = 1. / scaler.get_scale() |
| grad_params = [p * inv_scale for p in grad_params] |
| else: |
| grad_params = torch.autograd.grad(loss, model.parameters(), create_graph=True) |
| |
| grad_norm = 0 |
| for grad in grad_params: |
| grad_norm += grad.pow(2).sum() |
| grad_norm = grad_norm.sqrt() |
| loss = loss + grad_norm |
| |
| if try_scaling_api: |
| scaler.scale(loss).backward() |
| if i == skip_iter and scaler.is_enabled(): |
| model[1].weight.grad.data.fill_(float('inf')) |
| scaler.step(optimizer) |
| scaler.update() |
| else: |
| loss.backward() |
| if (not scaler.is_enabled()) or (i != skip_iter): |
| optimizer.step() |
| |
| self._run_scaling_case(run, unskipped=3, skipped=1) |
| |
| def test_grad_scaling_accumulation(self): |
| def run(data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api): |
| iters_to_accumulate = 2 |
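            # With accumulation, scaler.step()/update() (or optimizer.step()) run only every
            # iters_to_accumulate iterations; in between, the scaled grads simply accumulate in .grad.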
| for i, (input, target) in enumerate(data): |
| output = model(input) |
| loss = loss_fn(output, target) |
| loss = loss / iters_to_accumulate |
| if try_scaling_api: |
| scaler.scale(loss).backward() |
| else: |
| loss.backward() |
| if (i + 1) % iters_to_accumulate == 0: |
| if try_scaling_api: |
| scaler.step(optimizer) |
| scaler.update() |
| optimizer.zero_grad() |
| else: |
| optimizer.step() |
| optimizer.zero_grad() |
| |
| self._run_scaling_case(run, unskipped=2, skipped=0) |
| |
| def test_grad_scaling_multiple(self): |
| # Tests gradient scaling with 2 models and 2 optimizers that both receive gradients from 2 losses. |
| # Some of the logic here cannot reuse the generic helper functions created for the 1-optimizer cases. |
| for enabled in True, False: |
| mod_control0, mod_scaling0, opt_control0, opt_scaling0, data, loss_fn, skip_iter = \ |
| self._create_scaling_case() |
| mod_control1, mod_scaling1, opt_control1, opt_scaling1 = \ |
| self._create_scaling_models_optimizers() |
| |
| scaler = torch.cuda.amp.GradScaler(init_scale=128., growth_factor=2.0, enabled=enabled, growth_interval=1) |
| |
| def run(model0, model1, optimizer0, optimizer1, try_scaling_api): |
| for i, (input, target) in enumerate(data): |
| optimizer0.zero_grad() |
| optimizer1.zero_grad() |
| output0 = model0(input) |
| output1 = model1(input) |
| loss0 = loss_fn(0.3 * output0 + 0.7 * output1, target) |
| loss1 = loss_fn(0.6 * output0 - 0.4 * output1, target) |
| |
| if try_scaling_api: |
| scaler.scale(loss0).backward(retain_graph=True) |
| scaler.scale(loss1).backward() |
| if i == skip_iter and scaler.is_enabled(): |
| model1[1].weight.grad.data.fill_(float('inf')) |
| |
| # As an additional stress test, separately unscale for one of the optimizers. |
| scaler.unscale_(optimizer0) |
| |
| scaler.step(optimizer0) |
| scaler.step(optimizer1) |
| scaler.update() |
| else: |
| loss0.backward(retain_graph=True) |
| loss1.backward() |
| optimizer0.step() |
| if (not scaler.is_enabled()) or (i != skip_iter): |
| optimizer1.step() |
| |
| run(mod_control0, mod_control1, opt_control0, opt_control1, False) |
| run(mod_scaling0, mod_scaling1, opt_scaling0, opt_scaling1, True) |
| |
| # The loss scale should have been multiplied by the growth factor 3 times and the backoff factor once. |
            self.assertTrue(scaler.get_scale() == ((128. * scaler.get_growth_factor()**3 *
                                                    scaler.get_backoff_factor()**1) if enabled else 1.0))
| |
| for c, s in zip(chain(mod_control0.parameters(), mod_control1.parameters()), |
| chain(mod_scaling0.parameters(), mod_scaling1.parameters())): |
| self.assertEqual(c, s, rtol=1e-5, atol=1e-7) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_grad_scaling_multigpu(self): |
| # Same as above, but runs some of the models on device 1. |
| # GradScaler should transparently handle losses and gradients on multiple devices. |
| # This test could be combined with the test above, but I think it makes sense to treat |
| # multi-GPU operations separately. |
| dev0 = torch.device("cuda:0") |
| dev1 = torch.device("cuda:1") |
| |
| for enabled in True, False: |
| mod_control0, mod_scaling0, opt_control0, opt_scaling0, data, loss_fn, skip_iter = \ |
| self._create_scaling_case() |
| mod_control1, mod_scaling1, opt_control1, opt_scaling1 = \ |
| self._create_scaling_models_optimizers(device=dev1) |
| |
| scaler = torch.cuda.amp.GradScaler(init_scale=128., growth_factor=2.0, enabled=enabled, growth_interval=1) |
| |
| def run(model0, model1, optimizer0, optimizer1, try_scaling_api): |
| for i, (input, target) in enumerate(data): |
| optimizer0.zero_grad() |
| optimizer1.zero_grad() |
| output0 = model0(input) |
| output1 = model1(input.to(dev1)) |
| loss0 = loss_fn(0.3 * output0 + 0.7 * output1.to(dev0), target) |
| loss1 = loss_fn(0.6 * output0.to(dev1) - 0.4 * output1, target.to(dev1)) |
| |
| if try_scaling_api: |
| scaler.scale(loss0).backward(retain_graph=True) |
| scaler.scale(loss1).backward() |
| if i == skip_iter and scaler.is_enabled(): |
| model1[1].weight.grad.data.fill_(float('inf')) |
| |
| # As an additional stress test, separately unscale for one of the optimizers. |
| scaler.unscale_(optimizer0) |
| |
| scaler.step(optimizer0) |
| scaler.step(optimizer1) |
| |
| # Make sure the found_infs were collected properly across optimizers and devices. |
| if scaler.is_enabled(): |
| self.assertTrue(len(scaler._found_inf_per_device(optimizer0)) == 1) |
| self.assertTrue(len(scaler._found_inf_per_device(optimizer1)) == 1) |
| self.assertTrue(scaler._found_inf_per_device(optimizer0)[dev0].item() == 0.) |
| self.assertTrue(scaler._found_inf_per_device(optimizer1)[dev1].item() == |
| float(i == skip_iter)) |
| |
| scaler.update() |
| else: |
| loss0.backward(retain_graph=True) |
| loss1.backward() |
| optimizer0.step() |
| if (not scaler.is_enabled()) or (i != skip_iter): |
| optimizer1.step() |
| |
| run(mod_control0, mod_control1, opt_control0, opt_control1, False) |
| run(mod_scaling0, mod_scaling1, opt_scaling0, opt_scaling1, True) |
| |
| # The loss scale should have been multiplied by the growth factor 3 times and the backoff factor once. |
            self.assertTrue(scaler.get_scale() == ((128. * scaler.get_growth_factor()**3 *
                                                    scaler.get_backoff_factor()**1) if enabled else 1.0))
| |
            # Copy mod_control1 and mod_scaling1 back to device 0 for comparison
| mod_control1.to(dev0) |
| mod_scaling1.to(dev0) |
| |
| for c, s in zip(chain(mod_control0.parameters(), mod_control1.parameters()), |
| chain(mod_scaling0.parameters(), mod_scaling1.parameters())): |
| self.assertEqual(c, s, rtol=1e-5, atol=1e-7) |
| |
| def test_cublas_multiple_threads_same_device(self): |
        # Note: these parameters should be tuned very carefully.
        # Values that are too small make the race condition unlikely to occur,
        # while values that are too large can sometimes cause a hang.
| size = 1024 |
| num_threads = 2 |
| trials = 3 |
| test_iters = 100 |
| |
| weight = torch.ones((size, size), device='cuda') |
| results = {} |
| barrier = threading.Barrier(num_threads) |
| |
| def _worker(t): |
| my_stream = torch.cuda.Stream() |
| # Hard sync so we don't need to worry about creating and using tensors |
| # across streams or the fact that default streams are thread-local. |
| # Those issues are not the target of this test. |
| torch.cuda.synchronize() |
| # Line up threads to increase likelihood of race conditions. |
| barrier.wait() |
| with torch.cuda.stream(my_stream): |
| for i in range(test_iters): |
| # If all threads are sharing the same cublas handle, |
| # the following sequence may occur: |
| # thread 0 calls cublasSetStream() |
| # thread 1 calls cublasSetStream() |
| # thread 0 launches its raw gemm, which it thinks is in |
| # its own stream, but is actually in thread 1's stream. |
                    # thread 0 enqueues its div_, which IS in its own stream,
| # but actually now races with its gemm. |
| results[t] = torch.mm(results[t], weight) |
| results[t].div_(float(size)) |
| torch.cuda.synchronize() |
| |
| for _ in range(trials): |
| for t in range(num_threads): |
| results[t] = torch.ones((size, size), device='cuda') |
| |
| threads = [threading.Thread(target=_worker, |
| args=(t,)) for t in range(num_threads)] |
| |
| for thread in threads: |
| thread.start() |
| for thread in threads: |
| thread.join() |
| |
| for t in range(num_threads): |
| self.assertEqual(results[t].sum().item(), size * size) |
| |
| # Test is flaky on Windows (https://github.com/pytorch/pytorch/issues/57401) |
| @unittest.skipIf(IS_WINDOWS, 'Test is flaky on Windows (see issue 57401)') |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| @skipIfRocm |
| def test_cudnn_multiple_threads_same_device(self): |
| # This function is intended to test the lazy creation and reuse of per-thread |
| # cudnn handles on each device in aten/src/ATen/cudnn/Handles.cpp. |
| # Failure here likely indicates something wrong with that logic. |
| weight = torch.ones((1, 1, 2, 2), device='cuda') |
| |
| results = {} |
| |
| num_threads = 2 |
| trials = 3 |
| test_iters = 1000 |
| barrier = threading.Barrier(num_threads) |
| |
| with torch.backends.cudnn.flags(enabled=True): |
| def _worker(t): |
| my_stream = torch.cuda.Stream() |
| # Hard sync so we don't need to worry about creating and using tensors |
| # across streams or the fact that default streams are thread-local. |
| # Those issues are not the target of this test. |
| torch.cuda.synchronize() |
| # Line up threads to increase likelihood of race conditions. |
| barrier.wait() |
| with torch.cuda.stream(my_stream): |
| for _ in range(test_iters): |
| # If all threads are sharing the same cudnn handle, |
| # the following sequence may occur: |
| # thread 0 calls setCuDNNStreamToCurrent() |
| # thread 1 calls setCuDNNStreamToCurrent() |
| # thread 0 launches its raw convolution, which it thinks is in |
| # its own stream, but is actually in thread 1's stream. |
                        # thread 0 enqueues its div_, which IS in its own stream,
| # but now races with its convolution. |
| results[t] = torch.nn.functional.conv2d(results[t], weight, padding=0) |
| results[t].div_(4.0) |
| torch.cuda.synchronize() |
| |
| for _ in range(trials): |
| for t in range(num_threads): |
| results[t] = torch.ones((1, 1, 2048, 2048), device='cuda') |
| |
| threads = [threading.Thread(target=_worker, |
| args=(t,)) for t in range(num_threads)] |
| |
| for thread in threads: |
| thread.start() |
| for thread in threads: |
| thread.join() |
| |
| for t in range(num_threads): |
| self.assertEqual(results[t].sum().item(), |
| (2048 - test_iters) * (2048 - test_iters)) |
| |
| def test_cusparse_multiple_threads_same_device(self): |
| size = 1024 |
| num_threads = 2 |
| trials = 3 |
| test_iters = 500 |
| |
| def ones_sparse(size): |
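            # Builds a size x size all-ones matrix in COO form: cartesian_prod(a, a).t() yields
            # a 2 x size**2 index tensor enumerating every (row, col) pair.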
| a = torch.arange(size, device='cuda') |
| indices = torch.cartesian_prod(a, a).t() |
| values = torch.ones(size * size, device='cuda') |
| return torch.sparse_coo_tensor(indices, values) |
| |
| weight = ones_sparse(size) |
| results = {} |
| barrier = threading.Barrier(num_threads) |
| |
| def _worker(t): |
| my_stream = torch.cuda.Stream() |
| # Hard sync so we don't need to worry about creating and using tensors |
| # across streams or the fact that default streams are thread-local. |
| # Those issues are not the target of this test. |
| torch.cuda.synchronize() |
| # Line up threads to increase likelihood of race conditions. |
| barrier.wait() |
| with torch.cuda.stream(my_stream): |
| for i in range(test_iters): |
| # If all threads are sharing the same cublas handle, |
| # the following sequence may occur: |
| # thread 0 calls cublasSetStream() |
| # thread 1 calls cublasSetStream() |
| # thread 0 launches its raw gemm, which it thinks is in |
| # its own stream, but is actually in thread 1's stream. |
                    # thread 0 enqueues its div_, which IS in its own stream,
| # but actually now races with its gemm. |
| results[t] = weight.mm(results[t]) |
| results[t].div_(float(size)) |
| torch.cuda.synchronize() |
| |
| for _ in range(trials): |
| for t in range(num_threads): |
| results[t] = torch.ones((size, size), device='cuda') |
| |
| threads = [threading.Thread(target=_worker, |
| args=(t,)) for t in range(num_threads)] |
| |
| for thread in threads: |
| thread.start() |
| for thread in threads: |
| thread.join() |
| |
| for t in range(num_threads): |
| self.assertEqual(results[t].sum().item(), size * size) |
| |
| def _run_autocast_outofplace(self, op, args, run_as_type, out_type=None, module=torch, add_kwargs=None): |
| # helper to cast args |
| def cast(val, to_type): |
| if isinstance(val, torch.Tensor): |
| return val.to(to_type) if val.is_floating_point() else val |
| elif isinstance(val, collections.abc.Iterable): |
| return type(val)(cast(v, to_type) for v in val) |
| else: |
| return val |
| |
| if add_kwargs is None: |
| add_kwargs = {} |
| fast_dtype = torch.bfloat16 if run_as_type == torch.bfloat16 else torch.float16 |
| self.assertFalse(torch.is_autocast_enabled()) |
| with torch.autocast('cuda', dtype=fast_dtype): |
| self.assertTrue(torch.is_autocast_enabled()) |
| |
| out_type = out_type if out_type is not None else run_as_type |
| output = output_method = None |
| |
| # Try module.* variant, if requested: |
| if module is not None and hasattr(module, op): |
| output = getattr(module, op)(*args, **add_kwargs) |
| if isinstance(output, torch.Tensor): |
| self.assertTrue(out_type == output.dtype, |
| "autocast for torch.{} produced {}, should produce {}" |
| .format(op, output.dtype, out_type)) |
| |
| # Try Tensor.* variant: |
| if hasattr(torch.Tensor, op): |
| output_method = getattr(args[0], op)(*args[1:], **add_kwargs) |
| if isinstance(output_method, torch.Tensor): |
                    self.assertTrue(out_type == output_method.dtype,
                                    "autocast for Tensor.{} produced {}, should produce {}"
                                    .format(op, output_method.dtype, out_type))
| |
| self.assertTrue((output is not None) or (output_method is not None), |
| "{} not found as an attribute on either Tensor or the requested module {}".format( |
| op, module)) |
| |
| # Accounts for ops that return Tensors, iterables, and other non-Tensors. |
| # For example, lstm_cell returns a tuple and equal returns bool. |
| def compare(first, second): |
| if isinstance(first, torch.Tensor): |
| return torch.equal(first, second) |
| elif isinstance(first, collections.abc.Iterable): |
| return all(compare(f, s) for f, s in zip(first, second)) |
| else: |
| return first == second |
| |
| # If both torch.* and Tensor.* variants were found, check outputs are identical |
| if (output is not None) and (output_method is not None): |
| self.assertTrue(type(output) == type(output_method)) |
| comparison = compare(output, output_method) |
| self.assertTrue(comparison, "torch.{0} result did not match Tensor.{0} result".format(op)) |
| |
| # Compare numerics to Python-side "autocasting" that (we expect) does the same thing |
| # as the C++-side autocasting, and should be bitwise accurate. |
| output_to_compare = output if output is not None else output_method |
| with torch.autocast('cuda', enabled=False): |
| self.assertFalse(torch.is_autocast_enabled()) |
| |
| if module is not None and hasattr(module, op): |
| control = getattr(module, op)(*cast(args, run_as_type), **add_kwargs) |
| else: |
| control = getattr(args[0].to(run_as_type), op)(*cast(args[1:], run_as_type), **add_kwargs) |
| self.assertTrue(type(output_to_compare) == type(control)) |
| comparison = compare(output_to_compare, control) |
| self.assertTrue(comparison, "torch.{} result did not match control".format(op)) |
| self.assertTrue(torch.is_autocast_enabled()) |
| self.assertFalse(torch.is_autocast_enabled()) |
| |
| def args_maybe_kwargs(self, op_with_args): |
| if len(op_with_args) == 2: |
| return op_with_args[0], op_with_args[1], {} |
| else: |
| return op_with_args[0], op_with_args[1], op_with_args[2] |
| |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| def test_autocast_torch_fp16(self): |
| with torch.backends.cudnn.flags(enabled=True, deterministic=True): |
| for op_with_args in self.autocast_lists.torch_fp16: |
| skip_test = False |
| op, args = op_with_args[0], op_with_args[1] |
| if len(op_with_args) == 3: |
| skip_test = op_with_args[2] # TEST_WITH_ROCM |
| if not skip_test: |
| self._run_autocast_outofplace(op, args, torch.float16) |
| |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| def test_autocast_torch_bf16(self): |
| with torch.backends.cudnn.flags(enabled=True, deterministic=True): |
| for op_with_args in self.autocast_lists.torch_fp16: |
| skip_test = False |
| op, args = op_with_args[0], op_with_args[1] |
| if len(op_with_args) == 3: |
| skip_test = op_with_args[2] # TEST_WITH_ROCM |
| should_error_from_cudnn = 'cudnn' in op and not\ |
| ('TORCH_CUDNN_V8_API_ENABLED' in os.environ and |
| int(os.environ['TORCH_CUDNN_V8_API_ENABLED']) and |
| torch.cuda.get_device_capability() >= (8, 0)) |
| should_error_from_not_implemented = should_error_from_cudnn or 'prelu' in op or 'thnn' in op \ |
| or 'fused' in op or 'gru' in op or op == '_thnn_fused_lstm_cell' or op == 'lstm_cell' |
| if not skip_test: |
| if should_error_from_not_implemented: |
| with self.assertRaises(RuntimeError, msg=str(op) + ' should not be supported for bfloat16!'): |
| self._run_autocast_outofplace(op, args, torch.bfloat16) |
| else: |
| if torch.cuda.is_bf16_supported(): |
| self._run_autocast_outofplace(op, args, torch.bfloat16) |
| else: |
| with self.assertRaisesRegex(RuntimeError, 'Device does not support bfloat16'): |
| self._run_autocast_outofplace(op, args, torch.bfloat16) |
| |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| def test_autocast_torch_fp32(self): |
| for op_with_args in self.autocast_lists.torch_fp32: |
| op, args, maybe_kwargs = self.args_maybe_kwargs(op_with_args) |
| self._run_autocast_outofplace(op, args, torch.float32, add_kwargs=maybe_kwargs) |
| |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| def test_autocast_torch_need_autocast_promote(self): |
| for op, args in self.autocast_lists.torch_need_autocast_promote: |
| self._run_autocast_outofplace(op, args, torch.float32) |
| |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| def test_autocast_torch_expect_builtin_promote(self): |
| for op, args, out_type in self.autocast_lists.torch_expect_builtin_promote: |
| self._run_autocast_outofplace(op, args, torch.float32, out_type=out_type) |
| |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| def test_autocast_nn_fp16(self): |
| with torch.backends.cudnn.flags(enabled=True, deterministic=True): |
| for op, args in self.autocast_lists.nn_fp16: |
| self._run_autocast_outofplace(op, args, torch.float16, module=torch._C._nn) |
| |
| |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| def test_autocast_nn_bf16(self): |
| with torch.backends.cudnn.flags(enabled=True, deterministic=True): |
| for op, args in self.autocast_lists.nn_fp16: |
| if torch.cuda.is_bf16_supported(): |
| self._run_autocast_outofplace(op, args, torch.bfloat16, module=torch._C._nn) |
| else: |
| with self.assertRaisesRegex(RuntimeError, 'Device does not support bfloat16'): |
| self._run_autocast_outofplace(op, args, torch.bfloat16, module=torch._C._nn) |
| |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| def test_autocast_nn_fp32(self): |
| for op, args in self.autocast_lists.nn_fp32: |
| self._run_autocast_outofplace(op, args, torch.float32, module=torch._C._nn) |
| |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| def test_autocast_linalg_fp16(self): |
| with torch.backends.cudnn.flags(enabled=True, deterministic=True): |
| for op, args in self.autocast_lists.linalg_fp16: |
| self._run_autocast_outofplace(op, args, torch.float16, module=torch._C._linalg) |
| |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| def test_autocast_methods_fp16(self): |
| with torch.backends.cudnn.flags(enabled=True, deterministic=True): |
| for op, args in self.autocast_lists.methods_fp16: |
| self._run_autocast_outofplace(op, args, torch.float16, module=None) |
| |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| def test_autocast_methods_fp32(self): |
| for op, args in self.autocast_lists.methods_fp32: |
| self._run_autocast_outofplace(op, args, torch.float32, module=None) |
| |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| def test_autocast_methods_expect_builtin_promote(self): |
| for op, args, out_type in self.autocast_lists.methods_expect_builtin_promote: |
| self._run_autocast_outofplace(op, args, torch.float32, module=None, out_type=out_type) |
| |
| def test_autocast_banned(self): |
| with torch.autocast('cuda'): |
| for op, args, module in self.autocast_lists.banned: |
| with self.assertRaises(RuntimeError): |
| getattr(module, op)(*args) |
| |
| def test_autocast_ignored_types(self): |
| with torch.autocast('cuda'): |
| for ignore_type in (torch.double, torch.int32): |
| a_ignore = torch.ones((8, 8), dtype=ignore_type, device="cuda:0") |
| b_ignore = torch.ones((8, 8), dtype=ignore_type, device="cuda:0") |
| c_16 = torch.ones((8, 8), dtype=torch.float16, device="cuda:0") |
| |
| # Tests if CastPolicy::fp16 ops ignore double and int |
| # Currently, no ops belonging to this policy support integer inputs. |
| if ignore_type is torch.double: |
| with self.assertRaises(RuntimeError): |
| torch.mm(a_ignore, c_16) |
| with torch.autocast('cuda', enabled=False): |
| type_no_autocast = torch.mm(a_ignore, b_ignore).dtype |
| self.assertTrue(torch.mm(a_ignore, b_ignore).dtype is type_no_autocast) |
| |
| # Tests if CastPolicy::fp32 ops ignore double and int |
| with torch.autocast('cuda', enabled=False): |
| type_no_autocast = torch.pow(a_ignore, 2.0).dtype |
| self.assertTrue(torch.pow(a_ignore, 2.0).dtype is type_no_autocast) |
| |
| # Tests if CastPolicy::fp32_set_opt_dtype ops ignore double and int |
| with torch.autocast('cuda', enabled=False): |
| type_no_autocast = torch.sum(a_ignore).dtype |
| self.assertTrue(torch.sum(a_ignore).dtype is type_no_autocast) |
| |
| # Tests if CastPolicy::fp32_append_dtype ops ignore double and int |
| # Currently, no ops belonging to this policy support integer inputs. |
| if ignore_type is torch.double: |
| with torch.autocast('cuda', enabled=False): |
| type_no_autocast = torch.norm(a_ignore).dtype |
| self.assertTrue(torch.norm(a_ignore).dtype is type_no_autocast) |
| |
| def test_autocast_custom_enabled(self): |
| class MyMM(torch.autograd.Function): |
| @staticmethod |
| @torch.cuda.amp.custom_fwd |
| def forward(ctx, a, b): |
| self.assertTrue(a.dtype is torch.float32) |
| self.assertTrue(b.dtype is torch.float32) |
| self.assertTrue(torch.is_autocast_enabled()) |
| ctx.save_for_backward(a, b) |
| return a.mm(b) |
| |
| @staticmethod |
| @torch.cuda.amp.custom_bwd |
| def backward(ctx, grad): |
| self.assertTrue(torch.is_autocast_enabled()) |
| a, b = ctx.saved_tensors |
| return grad.mm(b.t()), a.t().mm(grad) |
| |
| mymm = MyMM.apply |
| |
| x = torch.randn((8, 8), device="cuda", dtype=torch.float32, requires_grad=True) |
| y = torch.randn((8, 8), device="cuda", dtype=torch.float32, requires_grad=True) |
| |
| with torch.cuda.amp.autocast(): |
| output = mymm(x, y) |
| self.assertTrue(output.dtype is torch.float16) |
| loss = output.sum() |
| loss.backward() |
| |
| def test_autocast_custom_cast_inputs(self): |
| class MyMM(torch.autograd.Function): |
| @staticmethod |
| @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) |
| def forward(ctx, a, container, expect_type): |
| b = container[1][0] |
| self.assertTrue(a.dtype is expect_type) |
| self.assertTrue(b.dtype is expect_type) |
| self.assertFalse(torch.is_autocast_enabled()) |
| ctx.save_for_backward(a, b) |
| return a.mm(b) |
| |
| @staticmethod |
| @torch.cuda.amp.custom_bwd |
| def backward(ctx, grad): |
| self.assertFalse(torch.is_autocast_enabled()) |
| a, b = ctx.saved_tensors |
| return grad.mm(b.t()), None, None |
| |
| mymm = MyMM.apply |
| |
| x = torch.randn((8, 8), device="cuda", dtype=torch.float16, requires_grad=True) |
| # Puts one input tensor in a nested container. y's contained Tensor won't receive a gradient, |
| # because torch.autograd.Function can't hand gradients back to non-Tensor forward arguments. |
| # Sets requires_grad=False explicitly so we don't lie about expecting a gradient. |
| y = (0, {0: torch.randn((8, 8), device="cuda", dtype=torch.float16, requires_grad=False)}) |
| |
| with torch.autocast('cuda'): |
| output = mymm(x, y, torch.float32) |
| self.assertTrue(output.dtype is torch.float32) |
| loss = output.sum() |
| loss.backward() |
| |
| # Tests if custom_fwd becomes a no-op when mymm runs outside an autocast-enabled region. |
| output = mymm(x, y, torch.float16) |
| self.assertTrue(output.dtype is torch.float16) |
| loss = output.sum() |
| loss.backward() |
| |
| def test_autocast_cat_jit(self): |
| # Reported at https://github.com/pytorch/pytorch/issues/38958 |
| |
| class Model(torch.nn.Module): |
| def forward(self): |
| a = torch.randn(1) |
| b = torch.randn(1) |
| c = torch.cat((a, b), 0) |
| d = torch.stack([c, c], 0) |
| return d |
| |
| # The JIT here doesn't really matter, we just need to call |
| # cat via the boxed API |
| model = Model() |
| model_jit_script = torch.jit.script(model) |
| |
| with torch.autocast('cuda', enabled=True): |
| model() |
| model_jit_script() |
| |
| # cudnn RNNs require special backend handling (weights are cast to FP16 and reflattened) |
| # so they get a dedicated test. |
| # Despite the large number of RNN cases it tries, the test takes < 15 seconds on a Titan V (similar to V100). |
| @skipIfRocm |
| @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') |
| def test_autocast_rnn(self): |
| with torch.backends.cudnn.flags(enabled=True, deterministic=True): |
| # seq, batch, features, hidden size |
| clses = ("RNN", "GRU", "LSTM") |
| T, B, F, H = 3, 4, 5, 6 |
| dtypes = (torch.float16, torch.float32) |
| input_layouts = ("seq_first", "batch_first", "packed") |
| |
| for (cls, num_layers, bias, input_layout, bidirectional, try_nonpreflattened_weights, |
| input_dtype, hidden_dtype, weight_dtype) in \ |
| product(clses, (1, 2), (True, False), input_layouts, (True, False), (True, False), |
| dtypes, dtypes, dtypes): |
| if input_layout == "seq_first": |
| batch_first = False |
| x = torch.randn((T, B, F), device="cuda", dtype=input_dtype) |
| elif input_layout == "batch_first": |
| batch_first = True |
| x = torch.randn((B, T, F), device="cuda", dtype=input_dtype) |
| elif input_layout == "packed": |
| batch_first = False |
| x = torch.nn.utils.rnn.pack_padded_sequence(torch.randn((T, B, F), |
| device="cuda", dtype=input_dtype), |
| lengths=(3, 2, 1, 3), |
| enforce_sorted=False) |
| |
| rnn = getattr(torch.nn, cls)(F, H, num_layers=num_layers, bidirectional=bidirectional, |
| bias=bias, batch_first=batch_first).cuda().to(dtype=weight_dtype) |
| |
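| # p.set_(p.clone()) points each parameter at its own fresh storage, so the weights are no longer |
| # views into cudnn's single pre-flattened weight buffer, exercising the non-preflattened-weights path |
| # (hence the flag name try_nonpreflattened_weights). |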
| if try_nonpreflattened_weights: |
| for p in rnn.parameters(): |
| with torch.no_grad(): |
| p.set_(p.clone()) |
| |
| h = torch.randn((num_layers * (2 if bidirectional else 1), B, H), |
| device="cuda", dtype=hidden_dtype) |
| if cls == "LSTM": |
| c = torch.randn((num_layers * (2 if bidirectional else 1), B, H), |
| device="cuda", dtype=hidden_dtype) |
| h = (h, c) |
| |
| with torch.autocast('cuda'): |
| out, h_out = rnn(x, h) |
| out = out.data if input_layout == "packed" else out |
| self.assertEqual(out.dtype, torch.float16) |
| # The autocast wrapper requires that at::_cudnn_rnn be autograd-exposed. This check can't guarantee |
| # that at::_cudnn_rnn is autograd-exposed, but if it fails, it indicates some funny business has |
| # occurred and we should double-check that at::_cudnn_rnn remains autograd-exposed. |
| self.assertEqual(out.grad_fn.name(), "CudnnRnnBackward0") |
| out.sum().backward() |
| grads = [p.grad.clone() for p in rnn.parameters()] |
| |
| rnn.zero_grad() |
| |
| if cls == "LSTM": |
| out_control, h_out_control = rnn.to(dtype=torch.float16)(x.half(), (h[0].half(), h[1].half())) |
| else: |
| out_control, h_out_control = rnn.to(dtype=torch.float16)(x.half(), h.half()) |
| out_control = out_control.data if input_layout == "packed" else out_control |
| out_control.sum().backward() |
| grads_control = [p.grad.clone() for p in rnn.parameters()] |
| |
| # Compares with default tolerances, even for FP16 execution. Barring nondeterminism, |
| # autocast and control results should be bitwise identical. |
| self.assertEqual(out, out_control) |
| |
| if cls == "LSTM": |
| self.assertTrue(h_out[0].dtype is torch.float16 and h_out[1].dtype is torch.float16) |
| self.assertEqual(h_out[0], h_out_control[0]) |
| self.assertEqual(h_out[1], h_out_control[1]) |
| else: |
| self.assertEqual(h_out.dtype, torch.float16) |
| self.assertEqual(h_out, h_out_control) |
| for grad, grad_control in zip(grads, grads_control): |
| self.assertEqual(grad.half(), grad_control) |
| |
| def test_autocast_cache_leak(self): |
| # Reported at https://github.com/pytorch/pytorch/issues/48049 |
| # Checks that autocast does not keep re-caching casts of the same parameters |
| # (and thereby leaking memory) when executed in a `torch.no_grad()` block. |
| |
| linear = torch.nn.Linear(10, 10).to('cuda') |
| data = torch.randn(1, 10, device='cuda') |
| |
| with torch.autocast('cuda'): |
| with torch.no_grad(): |
| out = linear(data) |
| first_iter_mem = torch.cuda.memory_allocated() |
| for _ in range(3): |
| out = linear(data) |
| self.assertTrue(first_iter_mem == torch.cuda.memory_allocated()) |
| |
| def test_autocast_checkpointing(self): |
| model = torch.nn.Sequential(torch.nn.Linear(8, 8), |
| torch.nn.Linear(8, 8), |
| torch.nn.Linear(8, 8)).cuda() |
| input = torch.rand((8, 8), device="cuda", dtype=torch.float16, requires_grad=True) |
| with torch.autocast('cuda'): |
| output = checkpoint_sequential(model, 2, input) |
| self.assertTrue(output.requires_grad) |
| self.assertTrue(output.dtype is torch.float16) |
| output.sum().backward() |
| |
| @slowTest |
| @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory") |
| def test_max_large_axis(self): |
| x = torch.zeros(2**32, device='cuda', dtype=torch.int8) |
| x[-1] = 1 |
| val, idx = x.max(0) |
| self.assertEqual(val, 1) |
| self.assertEqual(idx, x.shape[0] - 1) |
| |
| @unittest.skipIf(not TEST_NUMPY, "Numpy not found") |
| def test_to_numpy(self): |
| self.assertRaises(TypeError, lambda: torch.empty(1, device="cuda").numpy()) |
| |
| def test_graph_is_current_stream_capturing(self): |
| self.assertFalse(torch.cuda.is_current_stream_capturing()) |
| |
| if (TEST_CUDA and (not TEST_WITH_ROCM) and int(torch.version.cuda.split(".")[0]) >= 11): |
| s = torch.cuda.Stream() |
| with torch.cuda.stream(s): |
| g = torch.cuda.CUDAGraph() |
| self.assertFalse(torch.cuda.is_current_stream_capturing()) |
| g.capture_begin() |
| self.assertTrue(torch.cuda.is_current_stream_capturing()) |
| g.capture_end() |
| |
| @unittest.skipIf((not TEST_CUDA) or |
| TEST_WITH_ROCM or |
| int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") |
| def test_graph_capture_simple(self): |
| s = torch.cuda.Stream() |
| |
| with torch.cuda.stream(s): |
| a = torch.full((1000,), 1, device="cuda") |
| g = torch.cuda.CUDAGraph() |
| torch.cuda.empty_cache() |
| g.capture_begin() |
| b = a |
| for _ in range(10): |
| b = b + 1 |
| g.capture_end() |
| torch.cuda.current_stream().wait_stream(s) |
| |
| g.replay() |
| |
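| # a is 1000 ones and the captured region adds 1 ten times, so each element of b should be 11 after replay. |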
| self.assertTrue(b.sum().item() == 11000.) |
| |
| @unittest.skipIf((not TEST_CUDA) or |
| TEST_WITH_ROCM or |
| int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") |
| def test_graph_capture_oom(self): |
| with self.assertRaisesRegex(RuntimeError, "out of memory"): |
| with torch.cuda.graph(torch.cuda.CUDAGraph()): |
| torch.zeros(2 ** 40, device="cuda") |
| |
| @unittest.skipIf((not TEST_CUDA) or |
| TEST_WITH_ROCM or |
| int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") |
| def test_graph_rng_functional(self): |
| ops_with_kwargs = ((torch.nn.functional.dropout, {"p": 0.1}), |
| (torch.nn.functional.rrelu, {"training": True}),) |
| size = 10000 |
| |
| def run(op, kwargs): |
| a = torch.randn((size,), device="cuda", dtype=torch.float) |
| |
| # Control |
| torch.cuda.manual_seed(5) |
| eager_out = a |
| for _ in range(6): |
| eager_out = op(eager_out, **kwargs) |
| |
| graph_in = a.clone() |
| stream = torch.cuda.Stream() |
| stream.wait_stream(torch.cuda.current_stream()) |
| with torch.cuda.stream(stream): |
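| # Matches the control path's seeding so the graphed sequence starts from the same RNG state. |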
| torch.cuda.manual_seed(5) |
| |
| g = torch.cuda.CUDAGraph() |
| torch.cuda.empty_cache() |
| g.capture_begin() |
| graph_out = graph_in |
| for _ in range(2): |
| graph_out = op(graph_out, **kwargs) |
| g.capture_end() |
| torch.cuda.current_stream().wait_stream(stream) |
| |
| # Runs a graphed->eager->graphed sequence of RNG ops. |
| # replay() plays 2 invocations of the op, so the sequence has 6 |
| # invocations total, matching Control. |
| # replay() reads from graph_in and writes to graph_out. |
| g.replay() |
| out = op(graph_out, **kwargs) |
| out = op(out, **kwargs) |
| graph_in.copy_(out) |
| g.replay() |
| |
| # If replay() updated RNG state correctly, graph_out |
| # should now hold data equal to eager_out. |
| try: |
| self.assertEqual(eager_out, graph_out) |
| except Exception as e: |
| raise RuntimeError("Failed on ", op) from e |
| |
| # We hold references to all tensors used across streams up til this sync, |
| # so no need to call record_stream on those tensors. |
| torch.cuda.synchronize() |
| |
| for op, kwargs in ops_with_kwargs: |
| run(op, kwargs) |
| |
| @unittest.skipIf((not TEST_CUDA) or |
| TEST_WITH_ROCM or |
| int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") |
| def test_graph_rng_distributions(self): |
| size = 10000 |
| input = torch.rand((size,), device="cuda", dtype=torch.float) |
| alloc = torch.empty((size,), device="cuda", dtype=torch.float) |
| |
| # Torch ops to test with sample args (tuple) and kwargs (dict) |
| torch_with_args = (("bernoulli", (input.clone(),), {}), |
| # multinomial uses some uncapturable CUDA calls. |
| # TODO: reenable multinomial tests if/when the implementation is capturable. |
| # ("multinomial", (input.clone(), size, True), {}), |
| # ("multinomial", (input.clone(), size // 2, False), {}), |
| # TODO: reenable normal test, where std is a device |
| # tensor, when graph test failures are fixed |
| # ("normal", (input.clone() + 1, input.clone()), {}), |
| ("normal", (input.clone() + 1, 1.0), {}), |
| ("poisson", (input.clone(),), {}), |
| ("rand", (size,), {"device": "cuda", "dtype": torch.float}), |
| ("randint", (0, 3, (size,)), {"device": "cuda", "dtype": torch.float}), |
| ("randn", (size,), {"device": "cuda", "dtype": torch.float}),) |
| |
| # Tensor methods to test with sample args (tuple) |
| tensor_with_args = (("bernoulli_", (input.clone(),)), |
| ("cauchy_", ()), |
| ("exponential_", ()), |
| ("geometric_", (0.3,)), |
| ("log_normal_", ()), |
| ("normal_", ()), |
| ("random_", ()), |
| ("uniform_", ()),) |
| |
| def run(module, op, args, kwargs): |
| torch.cuda.manual_seed(5) |
| |
| # Each path runs a dummy op to increment the state a bit before creating controls. |
| if (module == "torch"): |
| dummy = getattr(torch, op)(*args, **kwargs) |
| control1 = getattr(torch, op)(*args, **kwargs) |
| control2 = getattr(torch, op)(*args, **kwargs) |
| else: |
| dummy = alloc.clone() |
| control1 = alloc.clone() |
| control2 = alloc.clone() |
| getattr(dummy, op)(*args) |
| getattr(control1, op)(*args) |
| getattr(control2, op)(*args) |
| |
| stream = torch.cuda.Stream() |
| stream.wait_stream(torch.cuda.current_stream()) |
| with torch.cuda.stream(stream): |
| torch.cuda.manual_seed(5) |
| |
| g = torch.cuda.CUDAGraph() |
| torch.cuda.empty_cache() |
| if (module == "torch"): |
| g.capture_begin() |
| t1 = getattr(torch, op)(*args, **kwargs) |
| t2 = getattr(torch, op)(*args, **kwargs) |
| g.capture_end() |
| else: |
| t1 = alloc.clone() |
| t2 = alloc.clone() |
| g.capture_begin() |
| getattr(t1, op)(*args) |
| getattr(t2, op)(*args) |
| g.capture_end() |
| torch.cuda.current_stream().wait_stream(stream) |
| |
| try: |
| self.assertNotEqual(control1, t1) |
| self.assertNotEqual(control2, t2) |
| except Exception as e: |
| raise RuntimeError("Failed on " + module + "." + op) from e |
| |
| # Runs a dummy op prelude, as for controls, to make sure replay() |
| # picks up the dummy op's state increment. |
| if module == "torch": |
| dummy = getattr(torch, op)(*args, **kwargs) |
| else: |
| dummy = alloc.clone() |
| getattr(dummy, op)(*args) |
| |
| # Runs RNG ops that fill t1 and t2. |
| g.replay() |
| |
| try: |
| self.assertEqual(control1, t1) |
| self.assertEqual(control2, t2) |
| except Exception as e: |
| raise RuntimeError("Failed on " + module + "." + op) from e |
| |
| # We hold references to all tensors used across streams up til this sync, |
| # so no need to call record_stream on those tensors. |
| torch.cuda.synchronize() |
| |
| for op_with_args in torch_with_args: |
| run("torch", *op_with_args) |
| |
| for meth_with_args in tensor_with_args: |
| # Adds an empty dict for kwargs, which none of the Tensor methods use |
| run("Tensor", *(meth_with_args + ({},))) |
| |
| @unittest.skipIf((not TEST_CUDA) or |
| TEST_WITH_ROCM or |
| int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") |
| def test_graph_two_successive(self): |
| torch.cuda.empty_cache() |
| |
| size = 1000 |
| kSmallBuffer = 2097152 |
| |
| def func_with_temps(t, val): |
| x = t.clone() + val |
| y = t.clone() + val |
| return x + y |
| |
| s = torch.cuda.Stream() |
| |
| for share_mem in ("Don't share", "via pool()", "via graph_pool_handle()"): |
| g0 = torch.cuda.CUDAGraph() |
| g1 = torch.cuda.CUDAGraph() |
| |
| a = torch.ones((size,), device="cuda") |
| |
| s.wait_stream(torch.cuda.current_stream()) |
| with torch.cuda.stream(s): |
| g0_args = (torch.cuda.graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else () |
| g0.capture_begin(*g0_args) |
| b = a.clone() |
| for _ in range(5): |
| b = func_with_temps(b, 1) |
| g0.capture_end() |
| |
| g1_args = (g0.pool(),) if share_mem == "via pool()" else g0_args |
| g1.capture_begin(*g1_args) |
| for _ in range(5): |
| b = func_with_temps(b, 1) |
| g1.capture_end() |
| torch.cuda.current_stream().wait_stream(s) |
| |
| # mixes unrelated eager ops with replays |
| c = a.clone() |
| for _ in range(2): |
| c = func_with_temps(c, 3) |
| g0.replay() |
| for _ in range(2): |
| c = func_with_temps(c, 3) |
| g1.replay() |
| for _ in range(2): |
| c = func_with_temps(c, 3) |
| |
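| # func_with_temps maps t -> 2*t + 2*val. Starting from a = 1, the ten captured applications with val=1 |
| # yield 3070 per element of b, and the six eager applications with val=3 yield 442 per element of c. |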
| self.assertEqual(b.sum().item(), size * 3070) |
| self.assertEqual(c.sum().item(), size * 442) |
| |
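| # "Don't share" is the first case in the share_mem loop, so reserved_no_sharing is always recorded |
| # (in the else branch) before the sharing cases use it in the comparison. |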
| if share_mem != "Don't share": |
| self.assertEqual(reserved_no_sharing - torch.cuda.memory_stats()["reserved_bytes.all.current"], |
| kSmallBuffer) |
| else: |
| reserved_no_sharing = torch.cuda.memory_stats()["reserved_bytes.all.current"] |
| |
| del a, b, c, g0, g1 |
| # Tensors used across streams (a and b) were held until just now, so no need to call record_stream on them. |
| torch.cuda.synchronize() |
| torch.cuda.empty_cache() |
| |
| @unittest.skip("Temporarily disabled due to a graphs bug in libcuda.so, " + |
| "see https://github.com/pytorch/pytorch/pull/57556") |
| @unittest.skipIf((not TEST_CUDA) or |
| TEST_WITH_ROCM or |
| int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") |
| def test_graph_concurrent_replay(self): |
| torch.cuda.empty_cache() |
| |
| size = 1000000 # largeish to help expose race conditions |
| |
| def func_with_temps(t, val): |
| x = t.clone() + val |
| y = t.clone() + val |
| return x + y |
| |
| s = torch.cuda.Stream() |
| |
| for share_mem in ("Don't share", "via pool()", "via graph_pool_handle()"): |
| g0 = torch.cuda.CUDAGraph() |
| g1 = torch.cuda.CUDAGraph() |
| |
| s0 = torch.cuda.Stream() |
| s1 = torch.cuda.Stream() |
| |
| a = torch.ones((size,), device="cuda") |
| |
| s.wait_stream(torch.cuda.current_stream()) |
| with torch.cuda.stream(s): |
| g0_args = (torch.cuda.graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else () |
| g0.capture_begin(*g0_args) |
| b = a.clone() |
| for _ in range(5): |
| b = func_with_temps(b, 1) |
| g0.capture_end() |
| |
| g1_args = (g0.pool(),) if share_mem == "via pool()" else g0_args |
| g1.capture_begin(*g1_args) |
| c = a.clone() |
| for _ in range(5): |
| c = func_with_temps(c, 2) |
| g1.capture_end() |
| |
| # To reproduce data corruption, I need g0 and g1's kernels to run concurrently. |
| # But replay() (especially cudaGraphLaunch) can incur significant CPU overhead. |
| # The following pattern helps align device-side execution of g0 and g1's kernels. |
| torch.cuda.synchronize() |
| with torch.cuda.stream(s0): |
| torch.cuda._sleep(1000000) |
| s1.wait_stream(s0) |
| g0.replay() |
| with torch.cuda.stream(s1): |
| g1.replay() |
| torch.cuda.current_stream().wait_stream(s0) |
| torch.cuda.current_stream().wait_stream(s1) |
| |
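| # func_with_temps maps t -> 2*t + 2*val, so five applications starting from a = 1 give 94 per element |
| # (val=1) and 156 per element (val=2). |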
| if share_mem != "Don't share": |
| # Confirms concurrent replays using the same mempool corrupted each other. |
| self.assertNotEqual(b.sum().item(), size * 94) |
| self.assertNotEqual(c.sum().item(), size * 156) |
| else: |
| # Confirms concurrent replays using different mempools did not corrupt each other. |
| self.assertEqual(b.sum().item(), size * 94) |
| self.assertEqual(c.sum().item(), size * 156) |
| |
| del a, b, c, g0, g1 |
| # Tensors used across streams (a, b, c) were held until just now, so no need to call record_stream on them. |
| torch.cuda.synchronize() |
| torch.cuda.empty_cache() |
| |
| @unittest.skipIf((not TEST_CUDA) or |
| TEST_WITH_ROCM or |
| int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") |
| def test_graph_three_successive(self): |
| torch.cuda.empty_cache() |
| |
| size = 1000 |
| |
| s = torch.cuda.Stream() |
| |
| for share_mem in ("Don't share", "via pool()", "via graph_pool_handle()"): |
| a = torch.ones((size,), device="cuda") |
| |
| g0 = torch.cuda.CUDAGraph() |
| g1 = torch.cuda.CUDAGraph() |
| g2 = torch.cuda.CUDAGraph() |
| |
| s.wait_stream(torch.cuda.current_stream()) |
| with torch.cuda.stream(s): |
| g0_args = (torch.cuda.graph_pool_handle(),) if share_mem == "via graph_pool_handle()" else () |
| g0.capture_begin(*g0_args) |
| b = a.clone() |
| c = b + 1 |
| d = b + 2 |
| g0.capture_end() |
| |
| args = (g0.pool(),) if share_mem == "via pool()" else g0_args |
| |
| g1.capture_begin(*args) |
| e = c + 3 |
| del c |
| g1.capture_end() |
| |
| g2.capture_begin(*args) |
| f = d + 4 |
| g2.capture_end() |
| torch.cuda.current_stream().wait_stream(s) |
| |
| # Tests that replaying in capture order is valid |
| g0.replay() |
| g1.replay() |
| g2.replay() |
| |
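| # With a = 1: b = 1, c = b + 1 = 2, d = b + 2 = 3, so e = c + 3 = 5 and f = d + 4 = 7 in capture order. |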
| self.assertEqual(e.sum().item(), size * 5) |
| self.assertEqual(f.sum().item(), size * 7) |
| |
| # Tests that replaying as g0, g2, g1 is only valid if they don't share a pool |
| g0.replay() |
| g2.replay() |
| g1.replay() |
| |
| # If sharing is enabled (share_mem != "Don't share"), g2's capture should have reused c's memory for f. |
| # We replayed g2 then g1, so we expect g1's captured "e = c + 3" to have mistakenly filled e with "f's vals + 3". |
| self.assertEqual(e.sum().item(), size * (7 + 3) if share_mem != "Don't share" else size * 5) |
| self.assertEqual(f.sum().item(), size * 7) |
| |
| del a, b, d, e, f, g0, g1, g2 |
| # Tensors used across streams (a, e, f) were held until just now, so no need to call record_stream on them. |
| torch.cuda.synchronize() |
| torch.cuda.empty_cache() |
| |
| @unittest.skipIf((not TEST_CUDA) or |
| TEST_WITH_ROCM or |
| int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") |
| def test_graph_memory_stats_and_use_result_after_destroy_graph(self): |
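| # These sizes (in bytes) mirror the CUDA caching allocator's internal constants |
| # (see c10/cuda/CUDACachingAllocator.cpp). elem = 4 below is the byte size of a default float32 element. |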
| kSmallSize = 1048576 |
| kSmallBuffer = 2097152 |
| kLargeBuffer = 20971520 |
| kMinLargeAlloc = 10485760 |
| kRoundLarge = 2097152 |
| |
| elem = 4 |
| |
| # this was annoying to write but stresses the expectations pretty rigorously |
| cases = ((512 // elem, 1, kSmallBuffer, kSmallBuffer, "small_pool"), |
| (kSmallSize // elem, 2, 2 * kSmallBuffer, kSmallBuffer, "small_pool"), |
| ((kSmallSize + 512) // elem, 1, kLargeBuffer, kLargeBuffer, "large_pool"), |
| ((kMinLargeAlloc - 512) // elem, 2, 2 * kLargeBuffer, kLargeBuffer, "large_pool"), |
| ((kMinLargeAlloc + 512) // elem, 3, |
| 3 * (kRoundLarge * ((kMinLargeAlloc + 512 + kRoundLarge - 1) // kRoundLarge)), |
| kRoundLarge * ((kMinLargeAlloc + 512 + kRoundLarge - 1) // kRoundLarge), |
| "large_pool"),) |
| |
| stats_to_check = ("segment.", |
| "reserved_bytes.", |
| "active.", |
| "active_bytes.") |
| |
| gc.collect() |
| torch.cuda.empty_cache() |
| |
| s = torch.cuda.Stream() |
| |
| for (numel, |
| delta_cudaMallocs, |
| delta_cudaMalloc_bytes, |
| delta_cudaMalloc_bytes_post_del_g, |
| pool_string) in cases: |
| if pool_string == "small_pool": |
| delta_active_blocks = 2 # one from "b" plus a sneaky one from CUDAGraph's one-element rng offset holder |
| delta_active_bytes = numel * elem + 512 # + 512 for CUDAGraph's rng offset holder |
| else: |
| delta_active_blocks = 1 # We only check the large pool, which isn't affected by rng offset holder |
| delta_active_bytes = numel * elem |
| |
| g = torch.cuda.CUDAGraph() |
| s.wait_stream(torch.cuda.current_stream()) |
| with torch.cuda.stream(s): |
| # Allocation stat estimates assume input is created on the same stream as capture_begin() |
| # (in other words, the same stream silo as the rng offset holder, which is not allocated from the |
| # capture's private pool). |
| a = torch.ones((numel,), device="cuda") |
| |
| precapture_stats = torch.cuda.memory_stats() |
| |
| g.capture_begin() |
| b = a.clone() |
| for _ in range(5): |
| b = b.clone() + 1 |
| g.capture_end() |
| torch.cuda.current_stream().wait_stream(s) |
| |
| gc.collect() |
| |
| postcapture_stats = torch.cuda.memory_stats() |
| |
| expecteds = (delta_cudaMallocs, |
| delta_cudaMalloc_bytes, |
| delta_active_blocks, |
| delta_active_bytes) |
| # Double checks replay and stats before and after a call to empty_cache |
| for i in range(2): |
| for stat, expected in zip(stats_to_check, expecteds): |
| stat = stat + pool_string + ".current" |
| current = postcapture_stats[stat] - precapture_stats[stat] |
| self.assertEqual(current, expected, "Pre to post capture delta of " + |
| stat + " = {}, expected = {}, numel = {}".format(current, expected, numel)) |
| |
| g.replay() |
| self.assertEqual(b.sum().item(), 6 * numel) |
| if i == 0: |
| torch.cuda.empty_cache() |
| |
| del g |
| gc.collect() |
| torch.cuda.empty_cache() |
| postdel_stats = torch.cuda.memory_stats() |
| |
| # Uses graph result b after graph has been deleted |
| self.assertEqual(b.sum().item(), 6 * numel) |
| |
| # b should be the only live reference remaining from the graph's private pool |
| expecteds = (1, delta_cudaMalloc_bytes_post_del_g, 1, numel * elem) |
| for stat, expected in zip(stats_to_check, expecteds): |
| stat = stat + pool_string + ".current" |
| current = postdel_stats[stat] - precapture_stats[stat] |
| self.assertEqual(current, expected, "Pre capture to post graph delete delta of " + |
| stat + " = {}, expected = {}, numel = {}".format(current, expected, numel)) |
| |
| # Deleting a and b before the next case is essential; otherwise, overwriting a and b in the next case |
| # can throw off its allocation/deallocation counts. |
| del a, b |
| # Tensors used across streams (a and b) were held until just now, so no need to call record_stream on them. |
| torch.cuda.synchronize() |
| torch.cuda.empty_cache() |
| |
| @unittest.skipIf((not TEST_CUDA) or |
| TEST_WITH_ROCM or |
| int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") |
| def test_graph_record_stream(self): |
| # Makes sure graph capture defers attempting to reclaim allocations used across streams. See |
| # "Q. Why skip process_events if a capture might be underway?" in c10/cuda/CUDACachingAllocator.cpp |
| torch.cuda.empty_cache() |
| |
| potential_problem = torch.zeros((3,), device="cuda") |
| a = torch.zeros((3,), device="cuda") |
| s0 = torch.cuda.Stream() |
| s1 = torch.cuda.Stream() |
| s2 = torch.cuda.Stream() |
| g = torch.cuda.CUDAGraph() |
| |
| torch.cuda.synchronize() |
| with torch.cuda.stream(s0): |
| potential_problem.record_stream(s0) |
| torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES) |
| potential_problem.fill_(1.) |
| del potential_problem |
| |
| with torch.cuda.stream(s1): |
| g.capture_begin() |
| # potential_problem's allocation should still be outstanding. If DeviceCachingAllocator::malloc |
| # mistakenly calls process_events, it will trigger cudaEventQueries on potential_problem's end-of-life |
| # event, which will cause the capture to error. |
| b = a.clone() |
| |
| # Let's also see what happens if we record_stream on a tensor during capture. |
| s2.wait_stream(s1) |
| with torch.cuda.stream(s2): |
| b.fill_(1.) |
| b.record_stream(s2) # dummy record_stream |
| del b |
| s1.wait_stream(s2) |
| g.capture_end() |
| torch.cuda.synchronize() |
| |
| # A dummy allocation triggers process_events, which should successfully process b's end-of-life event. |
| c = torch.zeros((3,), device="cuda") |
| |
| @unittest.skipIf((not TEST_CUDA) or |
| TEST_WITH_ROCM or |
| int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") |
| # If this test is the first in the process to try cudnn rnns with dropout, it'll initialize |
| # DropoutState's long-lived internal buffer. Calling code perceives this (correct) behavior |
| # as a memory leak unless we skip the leak check. |
| @skipCUDAMemoryLeakCheckIf(True) |
| def test_graph_cudnn_dropout(self): |
| # Tests the interaction of cuda graph capture with DropoutState's syncs in ATen/native/cudnn/RNN.cpp. |
| # In particular, if a user runs a sequence of captured and noncaptured cudnn rnns, DropoutState should |
| # avoid syncing noncapturing streams with captured events or vice versa. |
| torch.cuda.empty_cache() |
| |
| model = torch.nn.LSTM(512, 512, 2, dropout=0.5).cuda() |
| x = torch.ones(100, 192, 512, device="cuda") |
| |
| y = model(x) |
| |
| g = torch.cuda.CUDAGraph() |
| s = torch.cuda.Stream() |
| s.wait_stream(torch.cuda.current_stream()) |
| with torch.cuda.stream(s): |
| g.capture_begin() |
| y = model(x) |
| g.capture_end() |
| torch.cuda.current_stream().wait_stream(s) |
| |
| y = model(x) |
| |
| @unittest.skipIf((not TEST_CUDA) or |
| TEST_WITH_ROCM or |
| int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") |
| def test_graph_grad_scaling(self): |
| torch.cuda.empty_cache() |
| |
| scaler = torch.cuda.amp.GradScaler(init_scale=4.) |
| g = torch.cuda.CUDAGraph() |
| s = torch.cuda.Stream() |
| |
| weight = torch.ones((100,), device="cuda", requires_grad=True) |
| opt = torch.optim.SGD([weight], lr=0.1) |
| static_input = torch.ones_like(weight) |
| static_grad = torch.ones_like(weight) |
| |
| # warmup |
| s = torch.cuda.Stream() |
| s.wait_stream(torch.cuda.current_stream()) |
| with torch.cuda.stream(s): |
| loss = (weight.half() * static_input).sum() |
| scaler.scale(loss).backward() |
| torch.cuda.current_stream().wait_stream(s) |
| |
| opt.zero_grad(set_to_none=True) |
| |
| # capture |
| with torch.cuda.graph(g): |
| loss = (weight.half() * static_input).sum() |
| scaler.scale(loss).backward() |
| |
| input_vals = [5, 20000, 5, 40000] |
| # If the scale gets updated properly, these are the scale, growth tracker, |
| # and grad values we expect. |
| expected_scales = [4, 2, 2, 1] |
| expected_growth_trackers = [1, 0, 1, 0] |
| expected_grad_vals = [5 * 4, float("inf"), 5 * 2, float("inf")] |
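| # Because the forward uses weight.half(), the grad reaching weight (scale * static_input) passes through fp16; |
| # 20000 * 4 and 40000 * 2 exceed fp16's max (~65504), so those grads become inf. On inf, GradScaler halves the |
| # scale (default backoff_factor=0.5) and resets the growth tracker; finite steps leave the scale and bump the tracker. |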
| |
| for data, scale, growth_tracker, grad_val in zip(input_vals, |
| expected_scales, |
| expected_growth_trackers, |
| expected_grad_vals): |
| static_input.fill_(data) |
| g.replay() |
| self.assertEqual(weight.grad, torch.full_like(weight.grad, grad_val)) |
| scaler.step(opt) |
| scaler.update() |
| self.assertEqual(scaler._scale, scale) |
| self.assertEqual(scaler._growth_tracker, growth_tracker) |
| |
| @unittest.skipIf((not TEST_CUDA) or |
| TEST_WITH_ROCM or |
| int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") |
| def test_graph_make_graphed_callables(self): |
| torch.manual_seed(5) |
| torch.cuda.manual_seed(5) |
| |
| N, D_in, H, D_out = 640, 4096, 2048, 1024 |
| |
| models = [] |
| for _ in range(2): |
| model_section1 = torch.nn.Sequential(torch.nn.Linear(D_in, H), |
| torch.nn.Dropout(p=0.1)).cuda() |
| model_section2 = torch.nn.Sequential(torch.nn.Linear(H, D_out), |
| torch.nn.Dropout(p=0.2)).cuda() |
| models.append(torch.nn.Sequential(model_section1, model_section2)) |
| |
| model_graphed = models[0] |
| model_control = models[1] |
| |
| model_graphed.load_state_dict(model_control.state_dict()) |
| |
| opt_graphed = torch.optim.SGD(model_graphed.parameters(), lr=0.1) |
| opt_control = torch.optim.SGD(model_control.parameters(), lr=0.1) |
| |
| x = torch.randn(N, D_in, device='cuda') |
| h = torch.randn(N, H, device='cuda', requires_grad=True) |
| y_pred = torch.randn(N, D_out, device='cuda', requires_grad=True) |
| y = torch.randn(N, D_out, device='cuda') |
| |
| loss_fn_control = torch.nn.functional.mse_loss |
| relu_control = torch.nn.functional.relu |
| |
| # This is a good stress test. It graphs four callables: two Modules and two python functions. |
| model_graphed[0], model_graphed[1], relu_graphed, loss_fn_graphed = \ |
| torch.cuda.make_graphed_callables((model_graphed[0], model_graphed[1], relu_control, loss_fn_control), |
| ((x,), (h,), (y_pred,), (y_pred, y))) |
| |
| real_inputs = [torch.rand_like(x) for _ in range(10)] |
| real_targets = [torch.rand_like(y) for _ in range(10)] |
| |
| for m, opt, relu, loss_fn in zip((model_graphed, model_control), |
| (opt_graphed, opt_control), |
| (relu_graphed, relu_control), |
| (loss_fn_graphed, loss_fn_control)): |
| # Resets RNG states before iterations for graphed and ungraphed models, |
| # so dropout math should be bitwise identical for both. |
| torch.manual_seed(5) |
| torch.cuda.manual_seed(5) |
| for data, target in zip(real_inputs, real_targets): |
| opt.zero_grad(set_to_none=True) |
| y_pred = m(data) |
| y_pred = relu(y_pred) |
| loss = loss_fn(y_pred, target) |
| loss.backward() |
| opt.step() |
| |
| for p, pc in zip(model_graphed.parameters(), model_control.parameters()): |
| self.assertEqual(p, pc) |
| |
| # We graphed the models in training mode. Eval should still run ungraphed. |
| model_graphed.eval() |
| model_control.eval() |
| self.assertEqual(model_graphed(real_inputs[0]), model_control(real_inputs[0])) |
| |
| @unittest.skipIf((not TEST_CUDA) or |
| TEST_WITH_ROCM or |
| int(torch.version.cuda.split(".")[0]) < 11, "CUDA >= 11.0 required for graphs") |
| def test_graph_adam_adamw(self): |
| OptClasses = (torch.optim.Adam, torch.optim.AdamW) |
| cases = [] |
| # Needs generalization if we want to extend this test to non-Adam-like optimizers. |
| for Class, foreach, amsgrad in product(OptClasses, (False, True), (False, True)): |
| cases.append((Class, {"lr": 0.1, "betas": (0.8, 0.7), "foreach": foreach, "amsgrad": amsgrad})) |
| |
| steps_warmup = 3 |
| steps_train = 2 |
| |
| for OptClass, kwargs in cases: |
| for actually_do_graphs in (True, False): |
| params = [torch.randn((i + 5, i + 5), device="cuda") for i in range(2)] |
| params_control = [p.clone().requires_grad_() for p in params] |
| params_graphed = [p.clone().requires_grad_() for p in params] |
| |
| grads = [[torch.randn_like(p) for p in params] for _ in range(steps_warmup + steps_train)] |
| |
| # Control (capturable=False) |
| |
| opt = OptClass(params_control, capturable=False, **kwargs) |
| |
| for i in range(steps_warmup + steps_train): |
| for j, p in enumerate(params_control): |
| p.grad = grads[i][j] |
| opt.step() |
| |
| # capturable=True |
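| # (capturable=True keeps the optimizer's state, including the step count, on the GPU so that |
| # opt.step() can be captured into and replayed from a CUDA graph) |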
| |
| opt = OptClass(params_graphed, capturable=True, **kwargs) |
| |
| for i in range(steps_warmup): |
| for j, p in enumerate(params_graphed): |
| p.grad = grads[i][j] |
| opt.step() |
| |
| if actually_do_graphs: |
| g = torch.cuda.CUDAGraph() |
| with torch.cuda.graph(g): |
| opt.step() |
| |
| for i in range(steps_train): |
| if actually_do_graphs: |
| for j, p in enumerate(params_graphed): |
| p.grad.copy_(grads[i + steps_warmup][j]) |
| g.replay() |
| else: |
| # Passing capturable=True to the constructor and running without graphs should still be |
| # numerically correct, even if it's not ideal for performance. |
| for j, p in enumerate(params_graphed): |
| p.grad = grads[i + steps_warmup][j] |
| opt.step() |
| |
| for p_control, p_graphed in zip(params_control, params_graphed): |
| self.assertEqual(p_control, p_graphed) |
| |
| def test_batch_norm_gather_stats(self): |
| input = torch.randn(1, 3, 3, 3, device='cuda') |
| mean, invstd = torch.batch_norm_gather_stats( |
| input, mean=torch.ones(2, 3, device='cuda'), invstd=torch.ones(2, 3, device='cuda'), |
| running_mean=None, running_var=None, momentum=0.1, eps=1e-5, count=2 |
| ) |
| self.assertEqual(mean, torch.ones(3, device='cuda')) |
| self.assertEqual(invstd, torch.ones(3, device='cuda')) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "Test needs multiple GPUs") |
| def test_cuda_device_memory_allocated(self): |
| from torch.cuda import memory_allocated |
| device_count = torch.cuda.device_count() |
| current_alloc = [memory_allocated(idx) for idx in range(device_count)] |
| x = torch.ones(10, device="cuda:0") |
| self.assertTrue(memory_allocated(0) > current_alloc[0]) |
| self.assertTrue(all(memory_allocated(torch.cuda.device(idx)) == current_alloc[idx] for idx in range(1, device_count))) |
| |
| def test_matmul_memory_use(self): |
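| # matmul with a broadcast (expanded) batch dimension should not materialize the expanded input, so its |
| # peak memory use should match both the unexpanded matmul and an explicit bmm on the same shapes. |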
| def get_max_used(): |
| torch.cuda.synchronize() |
| val = torch.cuda.max_memory_allocated() |
| torch.cuda.reset_peak_memory_stats() |
| return val |
| |
| a = torch.rand(1, 32, 32, device="cuda") |
| b = torch.rand(24, 32, 1, device="cuda") |
| |
| get_max_used() |
| |
| torch.matmul(a, b) |
| |
| matmul_mem = get_max_used() |
| |
| a = a.expand(24, 32, 32) |
| torch.matmul(a, b) |
| |
| matmul_expand_mem = get_max_used() |
| |
| torch.bmm(a, b) |
| |
| bmm_mem = get_max_used() |
| |
| self.assertEqual(matmul_expand_mem, matmul_mem) |
| self.assertEqual(bmm_mem, matmul_mem) |
| |
| @unittest.skipIf(not TEST_WITH_ROCM, "ROCm-only test") |
| def test_rocm_backward_pass_guard(self): |
| # The test exercises a ROCm-specific feature. |
| |
| class MyFunction(torch.autograd.Function): |
| @staticmethod |
| def forward(ctx, tensor, constant): |
| self.assertFalse(torch._C._rocm_is_backward_pass()) |
| ctx.constant = constant |
| return tensor * constant |
| |
| @staticmethod |
| def backward(ctx, grad_output): |
| self.assertTrue(torch._C._rocm_is_backward_pass()) |
| return grad_output * ctx.constant, None |
| |
| class MyModule(torch.nn.Module): |
| def __init__(self): |
| super().__init__() |
| self.a = torch.nn.Parameter(torch.randn(())) |
| |
| def forward(self, x): |
| return MyFunction.apply(x, self.a) |
| |
| model = MyModule() |
| criterion = torch.nn.MSELoss(reduction='sum') |
| optimizer = torch.optim.SGD(model.parameters(), lr=1e-6) |
| |
| x = torch.randn(5, 5) |
| result = model(x) |
| loss = criterion(result, x) |
| optimizer.zero_grad() |
| loss.backward() |
| optimizer.step() |
| |
| @unittest.skipIf(TEST_WITH_ROCM, "ROCm doesn't support CUDA_VISIBLE_DEVICES") |
| @unittest.skipIf(TEST_MULTIGPU, "Testing on one GPU is sufficient") |
| def test_lazy_init(self): |
| """ Validate that no CUDA calls are made during `import torch` call""" |
| from subprocess import check_output |
| test_script = "import os; import torch;os.environ['CUDA_VISIBLE_DEVICES']='32';print(torch.cuda.device_count())" |
| rc = check_output([sys.executable, '-c', test_script]).decode("ascii").strip() |
| self.assertEqual(rc, "0") |
| |
| |
| class TestCudaComm(TestCase): |
| def _test_broadcast(self, input): |
| if not TEST_MULTIGPU: |
| raise unittest.SkipTest("only one GPU detected") |
| # test regular |
| results = comm.broadcast(input, (0, 1)) |
| for i, t in enumerate(results): |
| self.assertEqual(t.get_device(), i) |
| self.assertEqual(t, input) |
| if input.is_cuda and input.get_device() == i: # test not copying on same device |
| self.assertEqual(t.data_ptr(), input.data_ptr()) |
| # test out= |
| for inplace in [True, False]: |
| if inplace: |
| outputs = [torch.empty_like(input, device=0), torch.empty_like(input, device=1)] |
| else: |
| outputs = [input.cuda(0), torch.empty_like(input, device=1)] |
| results = comm.broadcast(input, out=outputs) |
| for r, o in zip(results, outputs): |
| self.assertIs(r, o) |
| for i, t in enumerate(results): |
| self.assertEqual(t.get_device(), i) |
| self.assertEqual(t, input) |
| # test error msg |
| with self.assertRaisesRegex(RuntimeError, r"Exactly one of 'devices' and 'out'"): |
| comm.broadcast(input, (0, 1), out=outputs) |
| with self.assertRaisesRegex(RuntimeError, |
| r"Expected all output tensors to be CUDA tensors, but output tensor at index 1"): |
| comm.broadcast(input, out=[input.cuda(0), input.cpu()]) |
| with self.assertRaisesRegex(RuntimeError, |
| r"Expected all output tensors to have same shape as the source .+ at index 1"): |
| comm.broadcast(input, out=[input.cuda(0), input.cuda(1).unsqueeze(0)]) |
| |
| def test_broadcast_cpu(self): |
| self._test_broadcast(torch.randn(5, 5)) |
| |
| def test_broadcast_gpu(self): |
| self._test_broadcast(torch.randn(5, 5).cuda()) |
| |
| def _test_broadcast_coalesced(self, tensors, buffer_size): |
| b_tensors = [comm.broadcast(t, (0, 1)) for t in tensors] |
| for (_, bt), t in zip(b_tensors, tensors): |
| self.assertEqual(bt.get_device(), 1) |
| self.assertEqual(bt, t) |
| self.assertIsInstance(bt, type(t)) |
| |
| bc_tensors = comm.broadcast_coalesced(tensors, (0, 1), buffer_size=buffer_size) |
| bc_tensors_t = list(zip(*bc_tensors)) |
| self.assertEqual(b_tensors, bc_tensors_t) |
| for (_, bt), (_, bct) in zip(b_tensors, bc_tensors_t): |
| self.assertEqual(bt.get_device(), bct.get_device()) |
| self.assertIsInstance(bct, type(bt)) |
| |
| # check that tensors on device[0] are returned as-is |
| for out_tensors in (b_tensors, bc_tensors_t): |
| for inp_t, (out_t, _) in zip(tensors, out_tensors): |
| self.assertIs(inp_t, out_t) |
| |
| # check that the tensors not on device[0] have different version counters |
| # NOTE [ Version Counter in comm.*_coalesced ] |
| versions = [t._version for _, t in bc_tensors_t] |
| for old_version, (_, t) in zip(versions, bc_tensors_t): |
| self.assertEqual(t._version, old_version) |
| t.zero_() |
| self.assertEqual(t._version, old_version + 1) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| # Note: fails sometimes on the CI, passes on dual gfx906 |
| def test_broadcast_coalesced(self): |
| numel = 5 |
| num_bytes = numel * 8 |
| tensors = [ |
| make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 1, 2, 3), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).cuda(), |
| make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 10, 2, 3), |
| make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 5, 2, 3), |
| make_sparse_tensor(torch.cuda.sparse.LongTensor, 7, 3, 3), |
| make_sparse_tensor(torch.cuda.sparse.FloatTensor, 2, 2, 3), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).long().cuda(), |
| make_sparse_tensor(torch.cuda.sparse.LongTensor, 3, 2, 7), |
| torch.randn(numel * 2).int().cuda(), # int is 2x shorter |
| torch.randn(numel).cuda(), |
| ] |
| self._test_broadcast_coalesced(tensors, num_bytes * 5 // 2) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_broadcast_coalesced_dense_only(self): |
| numel = 5 |
| num_bytes = numel * 8 |
| tensors = [ |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).cuda(), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel * 2).int().cuda(), # int is 2x shorter |
| torch.randn(numel).cuda(), |
| ] |
| self._test_broadcast_coalesced(tensors, num_bytes * 5 // 2) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_broadcast_coalesced_empty_tensors(self): |
| tensors = [ |
| torch.tensor([]).byte().cuda(), |
| torch.randn(5).cuda(), |
| torch.randn(5).double().cuda() |
| ] |
| self._test_broadcast_coalesced(tensors, 256) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_reduce_add(self): |
| x = torch.randn(5, 5) |
| y = torch.randn(5, 5) |
| x_cuda = x.cuda(0) |
| y_cuda = y.cuda(1) |
| result = comm.reduce_add((x_cuda, y_cuda)) |
| self.assertEqual(result.get_device(), 0) |
| self.assertEqual(result.cpu(), x + y) |
| |
| def _test_reduce_add_coalesced(self, tensors, buffer_size): |
| dup_tensors = [tensors, [t.cuda(1) for t in tensors]] |
| |
| r_tensors = [comm.reduce_add(t) for t in zip(*dup_tensors)] |
| for r, t in zip(r_tensors, tensors): |
| self.assertEqualTypeString(r, t) |
| self.assertEqual(r.coalesce() if r.is_sparse else r, t * 2) |
| |
| rc_tensors = comm.reduce_add_coalesced(dup_tensors, buffer_size=buffer_size) |
| self.assertEqual(r_tensors, rc_tensors) |
| for r, rc in zip(r_tensors, rc_tensors): |
| self.assertEqualTypeString(rc, r) |
| |
| # Since we have both cuda:0 and cuda:1 inputs, the outputs must be new. |
| # We can check that they have different version counters. |
| # NOTE [ Version Counter in comm.*_coalesced ] |
| versions = [t._version for t in rc_tensors] |
| for old_version, t in zip(versions, rc_tensors): |
| self.assertEqual(t._version, old_version) |
| t.zero_() |
| self.assertEqual(t._version, old_version + 1) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_reduce_add_coalesced(self): |
| numel = 5 |
| num_bytes = numel * 8 |
| tensors = [ |
| make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 1, 2, 3), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).cuda(), |
| make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 10, 2, 3), |
| make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 5, 2, 3), |
| make_sparse_tensor(torch.cuda.sparse.LongTensor, 7, 3, 3), |
| make_sparse_tensor(torch.cuda.sparse.FloatTensor, 2, 2, 3), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).long().cuda(), |
| make_sparse_tensor(torch.cuda.sparse.LongTensor, 3, 2, 7), |
| torch.randn(numel * 2).int().cuda(), # int is 2x shorter |
| torch.randn(numel).cuda(), |
| ] |
| self._test_reduce_add_coalesced(tensors, num_bytes * 5 // 2) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_reduce_add_coalesced_dense_only(self): |
| numel = 5 |
| num_bytes = numel * 8 |
| tensors = [ |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).cuda(), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel * 2).int().cuda(), # int is 2x shorter |
| torch.randn(numel).cuda(), |
| ] |
| self._test_reduce_add_coalesced(tensors, num_bytes * 5 // 2) |
| |
| def _test_scatter(self, input, chunk_sizes=None, dim=0): |
| if not TEST_MULTIGPU: |
| raise unittest.SkipTest("only one GPU detected") |
| if chunk_sizes is None: |
| ref_chunk_sizes = tuple(repeat(input.size(dim) // 2, 2)) |
| else: |
| ref_chunk_sizes = chunk_sizes |
| |
| # test regular |
| result = comm.scatter(input, (0, 1), chunk_sizes, dim) |
| self.assertEqual(len(result), 2) |
| chunk_start = 0 |
| for i, r in enumerate(result): |
| chunk_end = chunk_start + ref_chunk_sizes[i] |
| index = [slice(None, None) for _ in range(input.dim())] |
| index[dim] = slice(chunk_start, chunk_end) |
| self.assertEqual(r, input[tuple(index)], atol=0, rtol=0) |
| chunk_start = chunk_end |
| if r.device == input.device: |
| self.assertEqual(r.data_ptr(), input.data_ptr()) # for target @ same device, a view should be returned |
| |
| # test out |
| out = [torch.empty_like(t) for t in result] |
| result = comm.scatter(input, dim=dim, out=out) |
| self.assertEqual(len(result), 2) |
| chunk_start = 0 |
| for i, r in enumerate(result): |
| self.assertIs(r, out[i]) |
| chunk_end = chunk_start + ref_chunk_sizes[i] |
| index = [slice(None, None) for _ in range(input.dim())] |
| index[dim] = slice(chunk_start, chunk_end) |
| self.assertEqual(r, input[tuple(index)], atol=0, rtol=0) |
| chunk_start = chunk_end |
| |
| # test error msg |
| if chunk_sizes is not None: |
| with self.assertRaisesRegex(RuntimeError, r"Expected devices and chunk_sizes to be of same length"): |
| comm.scatter(input, [0 for _ in range(len(chunk_sizes) + 1)], dim=dim, chunk_sizes=chunk_sizes) |
| with self.assertRaisesRegex(RuntimeError, r"'devices' must not be specified"): |
| comm.scatter(input, (0, 1), dim=dim, out=out) |
| with self.assertRaisesRegex(RuntimeError, r"Expected at least one device to scatter to"): |
| comm.scatter(input, (), dim=dim) |
| with self.assertRaisesRegex(RuntimeError, r"Expected at least one output tensor to scatter to"): |
| comm.scatter(input, dim=dim, out=[]) |
| with self.assertRaisesRegex(RuntimeError, |
| r"Expected all output tensors to be CUDA tensors, but output tensor at index 0"): |
| comm.scatter(input, dim=dim, out=([out[0].cpu()] + out[1:])) |
| with self.assertRaisesRegex(RuntimeError, r"Output tensor at index 0 has incorrect shape"): |
| comm.scatter(input, dim=dim, out=([out[0].unsqueeze(0)] + out[1:])) |
| with self.assertRaisesRegex(RuntimeError, r"Total size for output tensors along scatter dim \d+ does not match"): |
| index = [slice(None, None) for _ in range(input.dim())] |
| index[dim] = slice(1, None) |
| comm.scatter(input, dim=dim, out=([out[0][tuple(index)]] + out[1:])) |
| |
| def test_scatter_cpu(self): |
| self._test_scatter(torch.randn(4, 4), dim=0) |
| |
| def test_scatter_cpu_dim(self): |
| self._test_scatter(torch.randn(4, 4), dim=1) |
| |
| def test_scatter_cpu_neg_dim(self): |
| self._test_scatter(torch.randn(4, 4), dim=-2) |
| |
| def test_scatter_cpu_sizes(self): |
| self._test_scatter(torch.randn(6, 4), chunk_sizes=(2, 4)) |
| |
| def test_scatter_gpu(self): |
| self._test_scatter(torch.randn(4, 4).cuda(), dim=0) |
| |
| def test_scatter_gpu_dim(self): |
| self._test_scatter(torch.randn(4, 4).cuda(), dim=1) |
| |
| def test_scatter_gpu_neg_dim(self): |
| self._test_scatter(torch.randn(4, 4).cuda(), dim=-2) |
| |
| def test_scatter_gpu_sizes(self): |
| self._test_scatter(torch.randn(6, 4).cuda(), chunk_sizes=(2, 4)) |
| |
| def _test_gather(self, dim): |
| if not TEST_MULTIGPU: |
| raise unittest.SkipTest("only one GPU detected") |
| x = torch.randn(2, 5, device=0) |
| y = torch.randn(2, 5, device=1) |
| expected_size = list(x.size()) |
| expected_size[dim] += y.size(dim) |
| expected_size = torch.Size(expected_size) |
| |
| destinations = [None, torch.device('cuda:0'), torch.device('cpu')] |
| if torch.cuda.device_count() > 2: |
| destinations.append(torch.device('cuda:2')) |
| with torch.cuda.device(1): |
| for destination in destinations: |
| if destination is None: |
| expected_device = torch.device('cuda', torch.cuda.current_device()) |
| else: |
| expected_device = destination |
| for use_out in [True, False]: |
| if use_out: |
| out = torch.empty(expected_size, device=expected_device) |
| result = comm.gather((x, y), dim, out=out) |
| self.assertIs(out, result) |
| else: |
| result = comm.gather((x, y), dim, destination=destination) |
| |
| self.assertEqual(result.device, expected_device) |
| self.assertEqual(result.size(), expected_size) |
| |
| index = [slice(None, None), slice(None, None)] |
| index[dim] = slice(0, x.size(dim)) |
| self.assertEqual(result[tuple(index)], x) |
| index[dim] = slice(x.size(dim), x.size(dim) + y.size(dim)) |
| self.assertEqual(result[tuple(index)], y) |
| |
| # test error msg |
| with self.assertRaisesRegex(RuntimeError, r"'destination' must not be specified"): |
| comm.gather((x, y), dim, destination='cpu', out=torch.empty(expected_size, device='cpu')) |
| with self.assertRaisesRegex(RuntimeError, r"Expected at least one tensor to gather from"): |
| comm.gather(()) |
| with self.assertRaisesRegex(RuntimeError, r"Expected all input tensors to be CUDA tensors, "): |
| comm.gather((x.cpu(), y)) |
| with self.assertRaisesRegex(RuntimeError, r"Expected all input tensors to have the same number of dimensions"): |
| comm.gather((x, y.unsqueeze(0))) |
| with self.assertRaisesRegex(RuntimeError, r"Input tensor at index 1 has invalid shape"): |
| if dim in [0, -2]: |
| comm.gather((x, y[:, 1:]), dim=dim) |
| elif dim in [1, -1]: |
| comm.gather((x, y[1:, :]), dim=dim) |
| |
| def test_gather(self): |
| self._test_gather(0) |
| |
| def test_gather_dim(self): |
| self._test_gather(1) |
| |
| def test_gather_neg_dim(self): |
| self._test_gather(-1) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_memory_format_scatter_gather(self): |
| nhwc = torch.randn((10, 3, 32, 32), device='cpu').contiguous(memory_format=torch.channels_last) |
| results = torch.cuda.comm.scatter(nhwc, (0, 1), None, 0) |
| for result in results: |
| self.assertFalse(result.is_contiguous()) |
| self.assertTrue(result.is_contiguous(memory_format=torch.channels_last)) |
| |
| gathered = torch.cuda.comm.gather(results) |
| self.assertTrue(gathered.is_contiguous(memory_format=torch.channels_last)) |
| |
| def test_matmul_device_mismatch(self): |
| cpu = torch.rand((10, 10)) |
| cuda = cpu.cuda() |
| with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): |
| cpu @ cuda |
| with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): |
| cuda @ cpu |
| |
| for s, m1, m2 in product((cpu, cuda), repeat=3): |
| if s.device == m1.device == m2.device: |
| torch.addmm(s, m1, m2) |
| else: |
| with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"): |
| torch.addmm(s, m1, m2) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "Test needs multiple GPUs") |
| def test_scatter_namedtuple(self): |
| # tests ability to scatter namedtuples and retrieve a list where each |
| # element is of the expected namedtuple type. |
| fields = ("a", "b") |
| TestNamedTupleInput_0 = collections.namedtuple("NamedTuple", fields) |
| num_gpus = torch.cuda.device_count() |
| a = torch.rand(num_gpus * 2, device=0) |
| b = torch.rand(num_gpus * 2, device=0) |
| a_tensors_for_gpu = [a[2 * i : 2 * i + 2].to(i) for i in range(num_gpus)] |
| b_tensors_for_gpu = [b[2 * i : 2 * i + 2].to(i) for i in range(num_gpus)] |
| |
| inp = TestNamedTupleInput_0(a, b) |
| target_gpus = [torch.device(i) for i in range(num_gpus)] |
| scatter_out = scatter_gather.scatter(inp, target_gpus) |
| |
| for i, x in enumerate(scatter_out): |
| self.assertTrue(isinstance(x, type(inp))) |
| self.assertEqual(x._fields, fields) |
| expected_a = a_tensors_for_gpu[i] |
| expected_b = b_tensors_for_gpu[i] |
| self.assertEqual(expected_a, x.a) |
| self.assertEqual(expected_b, x.b) |
| |
| class TestNamedTupleInput_1(NamedTuple): |
| a: torch.Tensor |
| b: torch.Tensor |
| |
| a = torch.rand(num_gpus * 2, device=0) |
| b = torch.rand(num_gpus * 2, device=0) |
| a_tensors_for_gpu = [a[2 * i : 2 * i + 2].to(i) for i in range(num_gpus)] |
| b_tensors_for_gpu = [b[2 * i : 2 * i + 2].to(i) for i in range(num_gpus)] |
| inp = TestNamedTupleInput_1(a, b) |
| |
| scatter_out = scatter_gather.scatter(inp, target_gpus) |
| for i, x in enumerate(scatter_out): |
| self.assertTrue(isinstance(x, type(inp))) |
| self.assertEqual(x._fields, fields) |
| expected_a = a_tensors_for_gpu[i] |
| expected_b = b_tensors_for_gpu[i] |
| self.assertEqual(expected_a, x.a) |
| self.assertEqual(expected_b, x.b) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "Test needs multiple GPUs") |
| def test_gather_namedtuple(self): |
| # tests ability to gather a list of namedtuples and return a namedtuple where each |
| # element is of the expected tensor type. |
| fields = ['a', 'b'] |
| TestNamedTupleInput_0 = collections.namedtuple('NamedTuple', fields) |
| |
| num_gpus = torch.cuda.device_count() |
| a = torch.rand(num_gpus * 2, device=0) |
| b = torch.rand(num_gpus * 2, device=1) |
| out1 = TestNamedTupleInput_0(a, b) |
| |
| a = torch.rand(num_gpus * 2, device=1) |
| b = torch.rand(num_gpus * 2, device=0) |
| out2 = TestNamedTupleInput_0(a, b) |
| |
| outputs = [out1, out2] |
| |
| out = scatter_gather.gather(outputs, 'cpu') # test on CPU |
| for i, x in enumerate(out): |
| self.assertTrue(isinstance(x, type(out2[-1]))) # x must be a tensor |
| cat = torch.cat((outputs[0][i].to('cpu'), outputs[1][i].to('cpu'))) |
| self.assertTrue(torch.equal(x, cat)) |
| |
| out = scatter_gather.gather(outputs, 0) # test on GPU |
| for i, x in enumerate(out): |
| self.assertTrue(isinstance(x, type(out2[-1]))) |
| cat = torch.cat((outputs[0][i].to(0), outputs[1][i].to(0))) |
| self.assertTrue(torch.equal(x, cat)) |
| |
| class TestNamedTupleInput_1(NamedTuple): |
| a: torch.Tensor |
| b: torch.Tensor |
| |
| a = torch.rand(num_gpus * 2, device=0) |
| b = torch.rand(num_gpus * 2, device=1) |
| out1 = TestNamedTupleInput_1(a, b) |
| |
| a = torch.rand(num_gpus * 2, device=1) |
| b = torch.rand(num_gpus * 2, device=0) |
| out2 = TestNamedTupleInput_1(a, b) |
| |
| outputs = [out1, out2] |
| |
| out = scatter_gather.gather(outputs, 0) # test on GPU |
| for i, x in enumerate(out): |
| self.assertTrue(isinstance(x, type(out2[-1]))) |
| cat = torch.cat((outputs[0][i].to(0), outputs[1][i].to(0))) |
| self.assertTrue(torch.equal(x, cat)) |
| |
| out = scatter_gather.gather(outputs, 'cpu') # test on CPU |
| for i, x in enumerate(out): |
| self.assertTrue(isinstance(x, type(out2[-1]))) |
| cat = torch.cat((outputs[0][i].to('cpu'), outputs[1][i].to('cpu'))) |
| self.assertTrue(torch.equal(x, cat)) |
| |
| if __name__ == '__main__': |
| run_tests() |