test/test_cuda_nvml_based_avail.py - platform/external/pytorch - Git at Google

 # Owner(s): ["module: cuda"]

 import multiprocessing
 import os
 import sys
 import unittest
 from unittest.mock import patch

 import torch


 # NOTE: Each of the tests in this module needs to be run in a brand new process to ensure CUDA is uninitialized
 # prior to test initiation.
 # Import the shared test utilities while `PYTORCH_NVML_BASED_CUDA_CHECK` is forced to "1". `patch.dict` only
 # overrides the variable for the duration of this `with` block and restores the previous environment on exit.
 with patch.dict(os.environ, {"PYTORCH_NVML_BASED_CUDA_CHECK": "1"}):
     # Before executing the desired tests, we need to disable CUDA initialization and fork_handler additions that would
     # otherwise be triggered by the `torch.testing._internal.common_utils` module import
     from torch.testing._internal.common_utils import (
         instantiate_parametrized_tests,
         IS_JETSON,
         IS_WINDOWS,
         NoTest,
         parametrize,
         run_tests,
         TestCase,
     )

     # NOTE: Because `remove_device_and_dtype_suffixes` initializes CUDA context (triggered via the import of
     # `torch.testing._internal.common_device_type` which imports `torch.testing._internal.common_cuda`) we need
     # to bypass that method here which should be irrelevant to the parameterized tests in this module.
     # The replacement is an identity function: whatever is passed in is returned unchanged.
     torch.testing._internal.common_utils.remove_device_and_dtype_suffixes = lambda x: x

     # Availability is probed while the environment variable is still patched to "1".
     TEST_CUDA = torch.cuda.is_available()
     if not TEST_CUDA:
         print("CUDA not available, skipping tests", file=sys.stderr)
         # Rebind the name used as base class by the test classes below, so they derive from `NoTest`
         # instead of the real `TestCase` (presumably so that nothing is collected -- confirm in common_utils).
         TestCase = NoTest  # type: ignore[misc, assignment] # noqa: F811


 @torch.testing._internal.common_utils.markDynamoStrictTest
 class TestExtendedCUDAIsAvail(TestCase):
     """Checks whether `torch.cuda.is_available()` in a parent leaves a forked child in a "bad fork" state."""

     # Attached to the `assertFalse` failure below to remind the reader how these tests must be launched.
     SUBPROCESS_REMINDER_MSG = (
         "\n REMINDER: Tests defined in test_cuda_nvml_based_avail.py must be run in a process "
         "where there CUDA Driver API has not been initialized. Before further debugging, ensure you are either using "
         "run_test.py or have added --subprocess to run each test in a different subprocess."
     )

     def setUp(self):
         """Clear the device count cached on `torch.cuda` before each test runs."""
         super().setUp()
         torch.cuda._cached_device_count = None

     @staticmethod
     def in_bad_fork_test() -> bool:
         """Query CUDA availability, then return `torch.cuda._is_in_bad_fork()`.

         `test_cuda_is_available` runs this inside a forked worker process.
         """
         _ = torch.cuda.is_available()
         return torch.cuda._is_in_bad_fork()

     # These tests validate the behavior and activation of the weaker, NVML-based, user-requested
     # `torch.cuda.is_available()` assessment. The NVML-based assessment should be attempted when
     # `PYTORCH_NVML_BASED_CUDA_CHECK` is set to 1, reverting to the default CUDA Runtime API check otherwise.
     # If the NVML-based assessment is attempted but fails, the CUDA Runtime API check should be executed
     @unittest.skipIf(IS_WINDOWS, "Needs fork")
     @parametrize("nvml_avail", [True, False])
     @parametrize("avoid_init", ["1", "0", None])
     def test_cuda_is_available(self, avoid_init, nvml_avail):
         if IS_JETSON and nvml_avail and avoid_init == "1":
             self.skipTest("Not working for Jetson")
         # `avoid_init=None` leaves the variable exactly as the surrounding environment defines it.
         patch_env = {"PYTORCH_NVML_BASED_CUDA_CHECK": avoid_init} if avoid_init else {}
         with patch.dict(os.environ, **patch_env):
             # Probe availability in the parent first; the child is forked afterwards.
             if nvml_avail:
                 _ = torch.cuda.is_available()
             else:
                 # Force `_device_count_nvml` to return -1 (presumably its failure value -- confirm in
                 # torch.cuda) to exercise the fallback described in the comment above this test.
                 with patch.object(torch.cuda, "_device_count_nvml", return_value=-1):
                     _ = torch.cuda.is_available()
             with multiprocessing.get_context("fork").Pool(1) as pool:
                 in_bad_fork = pool.apply(TestExtendedCUDAIsAvail.in_bad_fork_test)
             if os.getenv("PYTORCH_NVML_BASED_CUDA_CHECK") == "1" and nvml_avail:
                 self.assertFalse(
                     in_bad_fork, TestExtendedCUDAIsAvail.SUBPROCESS_REMINDER_MSG
                 )
             else:
                 # A unittest assertion (not a bare `assert`) so the check survives `python -O`
                 # and reports through the test runner like the branch above.
                 self.assertTrue(
                     in_bad_fork,
                     f"Expected the forked child to report a bad fork "
                     f"(avoid_init={avoid_init!r}, nvml_avail={nvml_avail!r})",
                 )


 @torch.testing._internal.common_utils.markDynamoStrictTest
 class TestVisibleDeviceParses(TestCase):
     """Table-driven checks of the `CUDA_VISIBLE_DEVICES` helpers exposed by `torch.cuda`."""

     def test_env_var_parsing(self):
         from torch.cuda import _parse_visible_devices as parse_env

         def parsed(raw):
             # The environment is cleared so that only CUDA_VISIBLE_DEVICES is visible to the parser.
             env = {"CUDA_VISIBLE_DEVICES": raw}
             with patch.dict(os.environ, env, clear=True):
                 return parse_env()

         expectations = (
             # rest of the string is ignored
             ("1gpu2,2ampere", [1, 2]),
             # Negatives abort parsing
             ("0, 1, 2, -1, 3", [0, 1, 2]),
             # Double mention of ordinal returns empty set
             ("0, 1, 2, 1", []),
             # Unary pluses and minuses
             ("2, +3, -0, 5", [2, 3, 0, 5]),
             # Random string is used as empty set
             ("one,two,3,4", []),
             # Random string is used as separator
             ("4,3,two,one", [4, 3]),
             # GPU ids are parsed
             ("GPU-9e8d35e3", ["GPU-9e8d35e3"]),
             # Ordinals are not included in GPUid set
             ("GPU-123, 2", ["GPU-123"]),
             # MIG ids are parsed
             ("MIG-89c850dc", ["MIG-89c850dc"]),
         )
         for raw, want in expectations:
             self.assertEqual(parsed(raw), want)

     def test_partial_uuid_resolver(self):
         from torch.cuda import _transform_uuid_to_ordinals

         # The position of each full UUID in this list is the ordinal expected back.
         uuids = [
             "GPU-9942190a-aa31-4ff1-4aa9-c388d80f85f1",
             "GPU-9e8d35e3-a134-0fdd-0e01-23811fdbd293",
             "GPU-e429a63e-c61c-4795-b757-5132caeb8e70",
             "GPU-eee1dfbc-0a0f-6ad8-5ff6-dc942a8b9d98",
             "GPU-bbcd6503-5150-4e92-c266-97cc4390d04e",
             "GPU-472ea263-58d7-410d-cc82-f7fdece5bd28",
             "GPU-e56257c4-947f-6a5b-7ec9-0f45567ccf4e",
             "GPU-1c20e77d-1c1a-d9ed-fe37-18b8466a78ad",
         ]
         cases = (
             # Unique prefixes resolve to the index of the matching UUID
             (["GPU-9e8d35e3"], [1]),
             (["GPU-e4", "GPU-9e8d35e3"], [2, 1]),
             (["GPU-9e8d35e3", "GPU-1", "GPU-47"], [1, 7, 5]),
             # First invalid UUID aborts parsing
             (["GPU-123", "GPU-9e8d35e3"], []),
             (["GPU-9e8d35e3", "GPU-123", "GPU-47"], [1]),
             # First ambiguous UUID aborts parsing
             (["GPU-9e8d35e3", "GPU-e", "GPU-47"], [1]),
             # Duplicate UUIDs result in empty set
             (["GPU-9e8d35e3", "GPU-47", "GPU-9e8"], []),
         )
         for candidates, ordinals in cases:
             self.assertEqual(_transform_uuid_to_ordinals(candidates, uuids), ordinals)

     def test_ordinal_parse_visible_devices(self):
         def count_for(raw):
             from torch.cuda import _device_count_nvml as count_devices

             env = {"CUDA_VISIBLE_DEVICES": raw}
             with patch.dict(os.environ, env, clear=True):
                 return count_devices()

         # The raw NVML device count is pinned to 2 for every case below.
         with patch.object(torch.cuda, "_raw_device_count_nvml", return_value=2):
             scenarios = (
                 ("1, 0", 2),
                 # Ordinal out of bounds aborts parsing
                 ("1, 5, 0", 1),
             )
             for raw, want in scenarios:
                 self.assertEqual(count_for(raw), want)


 # `TestExtendedCUDAIsAvail.test_cuda_is_available` is decorated with `@parametrize`; hand the class to
 # `instantiate_parametrized_tests` (presumably to generate one concrete test per parameter combination).
 instantiate_parametrized_tests(TestExtendedCUDAIsAvail)

 # Run the suite through the PyTorch test runner when this file is executed as a script.
 if __name__ == "__main__":
     run_tests()
	# Owner(s): ["module: cuda"]

	import multiprocessing
	import os
	import sys
	import unittest
	from unittest.mock import patch

	import torch


	# NOTE: Each of the tests in this module needs to be run in a brand new process to ensure CUDA is uninitialized
	# prior to test initiation.
	# Import the shared test utilities while `PYTORCH_NVML_BASED_CUDA_CHECK` is forced to "1". `patch.dict` only
	# overrides the variable for the duration of this `with` block and restores the previous environment on exit.
	with patch.dict(os.environ, {"PYTORCH_NVML_BASED_CUDA_CHECK": "1"}):
	    # Before executing the desired tests, we need to disable CUDA initialization and fork_handler additions
	    # that would otherwise be triggered by the `torch.testing._internal.common_utils` module import
	    from torch.testing._internal.common_utils import (
	        instantiate_parametrized_tests,
	        IS_JETSON,
	        IS_WINDOWS,
	        NoTest,
	        parametrize,
	        run_tests,
	        TestCase,
	    )

	    # NOTE: Because `remove_device_and_dtype_suffixes` initializes CUDA context (triggered via the import of
	    # `torch.testing._internal.common_device_type` which imports `torch.testing._internal.common_cuda`) we need
	    # to bypass that method here which should be irrelevant to the parameterized tests in this module.
	    torch.testing._internal.common_utils.remove_device_and_dtype_suffixes = lambda x: x

	    # Availability is probed while the environment variable is still patched to "1".
	    TEST_CUDA = torch.cuda.is_available()
	    if not TEST_CUDA:
	        print("CUDA not available, skipping tests", file=sys.stderr)
	        # Rebind the base-class name so the test classes below derive from `NoTest`.
	        TestCase = NoTest  # type: ignore[misc, assignment] # noqa: F811


	@torch.testing._internal.common_utils.markDynamoStrictTest
	class TestExtendedCUDAIsAvail(TestCase):
	    """Checks whether `torch.cuda.is_available()` in a parent leaves a forked child in a "bad fork" state."""

	    # Attached to the `assertFalse` failure below to remind the reader how these tests must be launched.
	    SUBPROCESS_REMINDER_MSG = (
	        "\n REMINDER: Tests defined in test_cuda_nvml_based_avail.py must be run in a process "
	        "where there CUDA Driver API has not been initialized. Before further debugging, ensure you are either using "
	        "run_test.py or have added --subprocess to run each test in a different subprocess."
	    )

	    def setUp(self):
	        """Clear the device count cached on `torch.cuda` before each test runs."""
	        super().setUp()
	        torch.cuda._cached_device_count = None

	    @staticmethod
	    def in_bad_fork_test() -> bool:
	        """Query CUDA availability, then return `torch.cuda._is_in_bad_fork()`.

	        `test_cuda_is_available` runs this inside a forked worker process.
	        """
	        _ = torch.cuda.is_available()
	        return torch.cuda._is_in_bad_fork()

	    # These tests validate the behavior and activation of the weaker, NVML-based, user-requested
	    # `torch.cuda.is_available()` assessment. The NVML-based assessment should be attempted when
	    # `PYTORCH_NVML_BASED_CUDA_CHECK` is set to 1, reverting to the default CUDA Runtime API check otherwise.
	    # If the NVML-based assessment is attempted but fails, the CUDA Runtime API check should be executed
	    @unittest.skipIf(IS_WINDOWS, "Needs fork")
	    @parametrize("nvml_avail", [True, False])
	    @parametrize("avoid_init", ["1", "0", None])
	    def test_cuda_is_available(self, avoid_init, nvml_avail):
	        if IS_JETSON and nvml_avail and avoid_init == "1":
	            self.skipTest("Not working for Jetson")
	        # `avoid_init=None` leaves the variable exactly as the surrounding environment defines it.
	        patch_env = {"PYTORCH_NVML_BASED_CUDA_CHECK": avoid_init} if avoid_init else {}
	        with patch.dict(os.environ, **patch_env):
	            # Probe availability in the parent first; the child is forked afterwards.
	            if nvml_avail:
	                _ = torch.cuda.is_available()
	            else:
	                # Force `_device_count_nvml` to return -1 (presumably its failure value -- confirm in
	                # torch.cuda) to exercise the fallback described in the comment above this test.
	                with patch.object(torch.cuda, "_device_count_nvml", return_value=-1):
	                    _ = torch.cuda.is_available()
	            with multiprocessing.get_context("fork").Pool(1) as pool:
	                in_bad_fork = pool.apply(TestExtendedCUDAIsAvail.in_bad_fork_test)
	            if os.getenv("PYTORCH_NVML_BASED_CUDA_CHECK") == "1" and nvml_avail:
	                self.assertFalse(
	                    in_bad_fork, TestExtendedCUDAIsAvail.SUBPROCESS_REMINDER_MSG
	                )
	            else:
	                # A unittest assertion (not a bare `assert`) so the check survives `python -O`
	                # and reports through the test runner like the branch above.
	                self.assertTrue(
	                    in_bad_fork,
	                    f"Expected the forked child to report a bad fork "
	                    f"(avoid_init={avoid_init!r}, nvml_avail={nvml_avail!r})",
	                )


	@torch.testing._internal.common_utils.markDynamoStrictTest
	class TestVisibleDeviceParses(TestCase):
	    """Checks of the `CUDA_VISIBLE_DEVICES` helpers exposed by `torch.cuda`."""

	    def test_env_var_parsing(self):
	        def _parse_visible_devices(val):
	            from torch.cuda import _parse_visible_devices as _pvd

	            # The environment is cleared so that only CUDA_VISIBLE_DEVICES is visible to the parser.
	            with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
	                return _pvd()

	        # rest of the string is ignored
	        self.assertEqual(_parse_visible_devices("1gpu2,2ampere"), [1, 2])
	        # Negatives abort parsing
	        self.assertEqual(_parse_visible_devices("0, 1, 2, -1, 3"), [0, 1, 2])
	        # Double mention of ordinal returns empty set
	        self.assertEqual(_parse_visible_devices("0, 1, 2, 1"), [])
	        # Unary pluses and minuses
	        self.assertEqual(_parse_visible_devices("2, +3, -0, 5"), [2, 3, 0, 5])
	        # Random string is used as empty set
	        self.assertEqual(_parse_visible_devices("one,two,3,4"), [])
	        # Random string is used as separator
	        self.assertEqual(_parse_visible_devices("4,3,two,one"), [4, 3])
	        # GPU ids are parsed
	        self.assertEqual(_parse_visible_devices("GPU-9e8d35e3"), ["GPU-9e8d35e3"])
	        # Ordinals are not included in GPUid set
	        self.assertEqual(_parse_visible_devices("GPU-123, 2"), ["GPU-123"])
	        # MIG ids are parsed
	        self.assertEqual(_parse_visible_devices("MIG-89c850dc"), ["MIG-89c850dc"])

	    def test_partial_uuid_resolver(self):
	        from torch.cuda import _transform_uuid_to_ordinals

	        # The position of each full UUID in this list is the ordinal expected back.
	        uuids = [
	            "GPU-9942190a-aa31-4ff1-4aa9-c388d80f85f1",
	            "GPU-9e8d35e3-a134-0fdd-0e01-23811fdbd293",
	            "GPU-e429a63e-c61c-4795-b757-5132caeb8e70",
	            "GPU-eee1dfbc-0a0f-6ad8-5ff6-dc942a8b9d98",
	            "GPU-bbcd6503-5150-4e92-c266-97cc4390d04e",
	            "GPU-472ea263-58d7-410d-cc82-f7fdece5bd28",
	            "GPU-e56257c4-947f-6a5b-7ec9-0f45567ccf4e",
	            "GPU-1c20e77d-1c1a-d9ed-fe37-18b8466a78ad",
	        ]
	        self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3"], uuids), [1])
	        self.assertEqual(
	            _transform_uuid_to_ordinals(["GPU-e4", "GPU-9e8d35e3"], uuids), [2, 1]
	        )
	        self.assertEqual(
	            _transform_uuid_to_ordinals("GPU-9e8d35e3,GPU-1,GPU-47".split(","), uuids),
	            [1, 7, 5],
	        )
	        # First invalid UUID aborts parsing
	        self.assertEqual(
	            _transform_uuid_to_ordinals(["GPU-123", "GPU-9e8d35e3"], uuids), []
	        )
	        self.assertEqual(
	            _transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-123", "GPU-47"], uuids),
	            [1],
	        )
	        # First ambiguous UUID aborts parsing
	        self.assertEqual(
	            _transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-e", "GPU-47"], uuids), [1]
	        )
	        # Duplicate UUIDs result in empty set
	        self.assertEqual(
	            _transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-47", "GPU-9e8"], uuids),
	            [],
	        )

	    def test_ordinal_parse_visible_devices(self):
	        def _device_count_nvml(val):
	            from torch.cuda import _device_count_nvml as _dc

	            with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
	                return _dc()

	        # The raw NVML device count is pinned to 2 for both checks below.
	        with patch.object(torch.cuda, "_raw_device_count_nvml", return_value=2):
	            self.assertEqual(_device_count_nvml("1, 0"), 2)
	            # Ordinal out of bounds aborts parsing
	            self.assertEqual(_device_count_nvml("1, 5, 0"), 1)


	# `TestExtendedCUDAIsAvail.test_cuda_is_available` is decorated with `@parametrize`; hand the class to
	# `instantiate_parametrized_tests` (presumably to generate one concrete test per parameter combination).
	instantiate_parametrized_tests(TestExtendedCUDAIsAvail)

	# Run the suite through the PyTorch test runner when this file is executed as a script.
	if __name__ == "__main__":
	    run_tests()