#include <c10/core/impl/GPUTrace.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/macros/Macros.h>
#include <c10/util/irange.h>

#include <limits>

namespace c10::cuda {

namespace {
// Returns the driver version, or -1 on failure
// (cudaDriverGetVersion reports 0 when no driver is installed).
int32_t driver_version() {
  int driver_version = -1;
  C10_CUDA_IGNORE_ERROR(cudaDriverGetVersion(&driver_version));
  return driver_version;
}

int device_count_impl(bool fail_if_no_driver) {
  int count = 0;
  auto err = C10_CUDA_ERROR_HANDLED(c10::cuda::GetDeviceCount(&count));
  if (err == cudaSuccess) {
    return count;
  }
  // Clear out the error state, so we don't spuriously trigger someone else's
  // error checking. (This shouldn't really matter, since we won't be running
  // much CUDA code in this regime.)
  cudaError_t last_err C10_UNUSED = cudaGetLastError();
  switch (err) {
    case cudaErrorNoDevice:
      // Zero devices is ok here
      count = 0;
      break;
    case cudaErrorInsufficientDriver: {
      auto version = driver_version();
      if (version <= 0) {
        if (!fail_if_no_driver) {
          // No CUDA driver means no devices
          count = 0;
          break;
        }
        TORCH_CHECK(
            false,
            "Found no NVIDIA driver on your system. Please check that you "
            "have an NVIDIA GPU and installed a driver from "
            "http://www.nvidia.com/Download/index.aspx");
      } else {
        TORCH_CHECK(
            false,
            "The NVIDIA driver on your system is too old (found version ",
            version,
            "). Please update your GPU driver by downloading and installing "
            "a new version from the URL: "
            "http://www.nvidia.com/Download/index.aspx Alternatively, go to: "
            "https://pytorch.org to install a PyTorch version that has been "
            "compiled with your version of the CUDA driver.");
      }
    } break;
    case cudaErrorInitializationError:
      TORCH_CHECK(
          false,
          "CUDA driver initialization failed, you might not "
          "have a CUDA gpu.");
      break;
    case cudaErrorUnknown:
      TORCH_CHECK(
          false,
          "CUDA unknown error - this may be due to an "
          "incorrectly set up environment, e.g. changing env "
          "variable CUDA_VISIBLE_DEVICES after program start.");
      break;
#if C10_ASAN_ENABLED
    case cudaErrorMemoryAllocation:
      // In ASAN mode, we know that a cudaErrorMemoryAllocation error will
      // pop up if compiled with NVCC (clang-cuda is fine)
      TORCH_CHECK(
          false,
          "Got 'out of memory' error while trying to initialize CUDA. "
          "CUDA with nvcc does not work well with ASAN and it's probably "
          "the reason. We will simply shut down CUDA support. If you "
          "would like to use GPUs, turn off ASAN.");
      break;
#endif // C10_ASAN_ENABLED
    default:
      TORCH_CHECK(
          false,
          "Unexpected error from cudaGetDeviceCount(). Did you run "
          "some CUDA functions before calling device_count() "
          "that might have already set an error? Error ",
          err,
          ": ",
          cudaGetErrorString(err));
  }
  return count;
}
} // namespace

DeviceIndex device_count() noexcept {
  // initialize number of devices only once
  static int count = []() {
    try {
      auto result = device_count_impl(/*fail_if_no_driver=*/false);
      TORCH_INTERNAL_ASSERT(
          result <= std::numeric_limits<DeviceIndex>::max(),
          "Too many CUDA devices, DeviceIndex overflowed");
      return result;
    } catch (const c10::Error& ex) {
      // We don't want to fail, but still log the warning
      // msg() returns the message without the stack trace
      TORCH_WARN("CUDA initialization: ", ex.msg());
      return 0;
    }
  }();
  return static_cast<DeviceIndex>(count);
}
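
// Note: because the count above is computed in a function-local static, it is
// cached after the first call; environment changes made later are not
// observed. A minimal illustration (sketch):
//
// ```
// auto n = c10::cuda::device_count(); // queries the driver once
// setenv("CUDA_VISIBLE_DEVICES", "", 1); // too late to have any effect
// auto m = c10::cuda::device_count(); // m == n, the cached value
// ```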

DeviceIndex device_count_ensure_non_zero() {
  // Call the implementation every time so that failures raise the exception
  int count = device_count_impl(/*fail_if_no_driver=*/true);
  // Zero GPUs don't produce a warning in `device_count`, but we fail here
  TORCH_CHECK(count, "No CUDA GPUs are available");
  TORCH_INTERNAL_ASSERT(
      count <= std::numeric_limits<DeviceIndex>::max(),
      "Too many CUDA devices, DeviceIndex overflowed");
  return static_cast<DeviceIndex>(count);
}

DeviceIndex current_device() {
  DeviceIndex cur_device = -1;
  C10_CUDA_CHECK(c10::cuda::GetDevice(&cur_device));
  return cur_device;
}

void set_device(DeviceIndex device) {
  C10_CUDA_CHECK(c10::cuda::SetDevice(device));
}

void device_synchronize() {
  const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
  if (C10_UNLIKELY(interp)) {
    (*interp)->trace_gpu_device_synchronization(c10::kCUDA);
  }
  C10_CUDA_CHECK(cudaDeviceSynchronize());
}

// This function must be called by callers that perform synchronizing CUDA
// operations, so that the proper error or warning is raised.
void warn_or_error_on_sync() {
  if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_ERROR) {
    TORCH_CHECK(false, "called a synchronizing CUDA operation");
  } else if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_WARN) {
    TORCH_WARN("called a synchronizing CUDA operation");
  }
}
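
// For example, a synchronizing call site would typically look like this
// (sketch; `stream` stands for some cudaStream_t the caller owns):
//
// ```
// warn_or_error_on_sync(); // report per the configured sync debug mode
// C10_CUDA_CHECK(cudaStreamSynchronize(stream));
// ```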

std::optional<DeviceIndex> getDeviceIndexWithPrimaryContext() {
  // check current device first
  auto current_device_index = current_device();
  if (current_device_index >= 0) {
    if (hasPrimaryContext(current_device_index)) {
      return current_device_index;
    }
  }
  for (const auto device_index : c10::irange(device_count())) {
    if (device_index == current_device_index)
      continue;
    if (hasPrimaryContext(device_index)) {
      return device_index;
    }
  }
  return std::nullopt;
}

namespace _internal {
bool dummyHasPrimaryContext(C10_UNUSED DeviceIndex device_index) {
  TORCH_CHECK(false, "Should never be called");
}
bool (*hasPrimaryContext)(DeviceIndex) = dummyHasPrimaryContext;

// Private api to be called from CUDAHooks.cpp
C10_CUDA_API void setHasPrimaryContext(bool (*func)(DeviceIndex)) {
  hasPrimaryContext = func ? func : dummyHasPrimaryContext;
}
} // namespace _internal
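
// For example, the hooks side would install the real query at initialization
// time roughly as follows (a sketch using the CUDA driver API;
// `realHasPrimaryContext` is an illustrative name):
//
// ```
// #include <cuda.h>
//
// bool realHasPrimaryContext(c10::DeviceIndex device_index) {
//   unsigned int flags = 0;
//   int active = 0;
//   // Query the primary context state without creating one.
//   if (cuDevicePrimaryCtxGetState(device_index, &flags, &active) !=
//       CUDA_SUCCESS) {
//     return false;
//   }
//   return active == 1;
// }
//
// c10::cuda::_internal::setHasPrimaryContext(&realHasPrimaryContext);
// ```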

bool hasPrimaryContext(DeviceIndex device_index) {
  return _internal::hasPrimaryContext(device_index);
}

// Wrappers for raw CUDA device management functions
cudaError_t GetDeviceCount(int* dev_count) {
  return cudaGetDeviceCount(dev_count);
}

// This is a codepath for CUDA 12 that comes with a critical change in the
// behavior of `cudaSetDevice`. Unlike previous CUDA versions, which allocate
// the context lazily, CUDA 12.x eagerly allocates the primary context the
// moment `cudaSetDevice` is called. This can have dramatic consequences and
// pollute device memory in distributed runs. To avoid unnecessary context
// creation, a new function called `MaybeSetDevice` was introduced. It is to
// be called in the device guard destructor and at the exit of the
// torch.cuda.device context manager. Its behavior is simple: it calls
// `cudaSetDevice` if a context already exists on the targeted device;
// otherwise it merely saves the device index. This keeps PyTorch backward
// compatible for applications like this:
//
// ```
// import torch
// x = torch.empty(1, device="cuda:1") # no CUDA context on cuda:0 after this
// y = torch.empty(1, device="cuda")   # CUDA context is created on cuda:0
// ```
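//
// On the C++ side, the same mechanism lets a device guard restore the
// previous device without eagerly creating a context on it (sketch;
// `new_index` is an illustrative variable):
//
// ```
// DeviceIndex prev{-1};
// C10_CUDA_CHECK(c10::cuda::GetDevice(&prev));
// C10_CUDA_CHECK(c10::cuda::SetDevice(new_index)); // may create a context
// // ... run work on new_index ...
// C10_CUDA_CHECK(c10::cuda::MaybeSetDevice(prev)); // no context created here
// ```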
#if CUDA_VERSION >= 12000
thread_local DeviceIndex targetDeviceIndex = -1;

cudaError_t GetDevice(DeviceIndex* device) {
  if (targetDeviceIndex >= 0) {
    *device = targetDeviceIndex;
    return cudaSuccess;
  }
  int tmp_device = -1;
  auto err = cudaGetDevice(&tmp_device);
  if (err == cudaSuccess) {
    TORCH_INTERNAL_ASSERT(
        tmp_device >= 0 &&
            tmp_device <= std::numeric_limits<DeviceIndex>::max(),
        "cudaGetDevice returns invalid device ",
        tmp_device);
    *device = static_cast<DeviceIndex>(tmp_device);
  }
  return err;
}

cudaError_t SetDevice(DeviceIndex device) {
  TORCH_CHECK(device >= 0, "device id must be non-negative, got ", device);
  targetDeviceIndex = -1;
  int cur_device = -1;
  C10_CUDA_CHECK(cudaGetDevice(&cur_device));
  if (device == cur_device) {
    return cudaSuccess;
  }
  return cudaSetDevice(device);
}

cudaError_t MaybeSetDevice(DeviceIndex device) {
  if (hasPrimaryContext(device)) {
    return c10::cuda::SetDevice(device);
  }
  targetDeviceIndex = device;
  return cudaSuccess;
}

// This function always initializes the CUDA context on to_device
DeviceIndex ExchangeDevice(DeviceIndex to_device) {
  auto cur_device = targetDeviceIndex;
  targetDeviceIndex = -1;
  if (cur_device < 0) {
    int tmp_device = -1;
    C10_CUDA_CHECK(cudaGetDevice(&tmp_device));
    cur_device = static_cast<DeviceIndex>(tmp_device);
    if (to_device == cur_device) {
      return cur_device;
    }
  }
  C10_CUDA_CHECK(cudaSetDevice(to_device));
  return cur_device;
}

// This function does not initialize the CUDA context on to_device if it does
// not already exist
DeviceIndex MaybeExchangeDevice(DeviceIndex to_device) {
  int tmp_cur_device = -1;
  C10_CUDA_CHECK(cudaGetDevice(&tmp_cur_device));
  TORCH_INTERNAL_ASSERT(
      tmp_cur_device >= 0 &&
          tmp_cur_device <= std::numeric_limits<DeviceIndex>::max(),
      "cudaGetDevice returns invalid device ",
      tmp_cur_device);
  auto cur_device = static_cast<DeviceIndex>(tmp_cur_device);
  if (to_device == cur_device) {
    return cur_device;
  }
  if (hasPrimaryContext(to_device)) {
    C10_CUDA_CHECK(cudaSetDevice(to_device));
  } else {
    targetDeviceIndex = to_device;
  }
  return cur_device;
}
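
// In caller terms, the two exchange flavors differ roughly as follows
// (sketch):
//
// ```
// auto prev = c10::cuda::ExchangeDevice(1); // cuda:1 has a context afterwards
// auto prev2 = c10::cuda::MaybeExchangeDevice(1); // switch deferred if cuda:1
//                                                 // has no primary context;
//                                                 // GetDevice still reports 1
// ```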

void SetTargetDevice() {
  if (targetDeviceIndex >= 0) {
    C10_CUDA_CHECK(c10::cuda::SetDevice(targetDeviceIndex));
  }
}
#else
cudaError_t GetDevice(DeviceIndex* device) {
  int tmp_device = -1;
  auto err = cudaGetDevice(&tmp_device);
  if (err == cudaSuccess) {
    TORCH_INTERNAL_ASSERT(
        tmp_device >= 0 &&
            tmp_device <= std::numeric_limits<DeviceIndex>::max(),
        "cudaGetDevice returns invalid device ",
        tmp_device);
    *device = static_cast<DeviceIndex>(tmp_device);
  }
  return err;
}

cudaError_t SetDevice(DeviceIndex device) {
  TORCH_CHECK(device >= 0, "device id must be non-negative, got ", device);
  int cur_device = -1;
  C10_CUDA_CHECK(cudaGetDevice(&cur_device));
  if (device == cur_device) {
    return cudaSuccess;
  }
  return cudaSetDevice(device);
}

cudaError_t MaybeSetDevice(DeviceIndex device) {
  return c10::cuda::SetDevice(device);
}

DeviceIndex ExchangeDevice(DeviceIndex to_device) {
  DeviceIndex cur_device = -1;
  C10_CUDA_CHECK(c10::cuda::GetDevice(&cur_device));
  if (to_device == cur_device) {
    return cur_device;
  }
  C10_CUDA_CHECK(cudaSetDevice(to_device));
  return cur_device;
}

DeviceIndex MaybeExchangeDevice(DeviceIndex to_device) {
  return c10::cuda::ExchangeDevice(to_device);
}

void SetTargetDevice() {
  // no-op on CUDA versions < 12.x
}
#endif

} // namespace c10::cuda