| #pragma once |
| |
#include <cstdint>
#include <ostream>
#include <tuple>
#include <utility>
| |
| #include <cuda_runtime_api.h> |
| |
| #include <c10/core/DeviceGuard.h> |
| #include <c10/core/Stream.h> |
| #include <c10/cuda/CUDAFunctions.h> |
| #include <c10/util/Exception.h> |
| |
| /* |
| * Stream pool note. |
| * |
 * A CUDAStream is an abstraction of an actual CUDA stream (cudaStream_t) on
 * the GPU. CUDAStreams are backed by cudaStream_t objects, but they use
 * several pools to minimize the costs associated with creating, retaining,
 * and destroying cudaStream_t objects.
| * |
| * There are three pools per device, and a device's pools are lazily created. |
| * |
| * The first pool contains only the default stream. When the default stream |
| * is requested it's returned. |
| * |
 * The second pool contains the "low priority" (a.k.a. "default priority")
 * streams. In HIP builds there is no distinction between streams in this
 * pool and streams in the third pool (below). There are 32 of these streams
 * per device, and when a stream is requested one of them is returned
 * round-robin. That is, the first stream requested is at index 0, the second
 * at index 1, ... up to index 31, then index 0 again.
| * |
| * This means that if 33 low priority streams are requested, the first and |
| * last streams requested are actually the same stream (under the covers) |
| * and kernels enqueued on them cannot run concurrently. |
| * |
 * The third pool contains the "high priority" streams. It behaves like the
 * second pool except that its streams are created with a higher priority.
| * |
| * These pools suggest that stream users should prefer many short-lived streams, |
| * as the cost of acquiring and releasing streams is effectively zero. If |
| * many longer-lived streams are required in performance critical scenarios |
| * then the functionality here may need to be extended to allow, for example, |
| * "reserving" a subset of the pool so that other streams do not accidentally |
| * overlap the performance critical streams. |
| * |
| * Note: although the notion of "current stream for device" is thread local |
| * (every OS thread has a separate current stream, as one might expect), |
| * the stream pool is global across all threads; stream 0 is always stream 0 |
| * no matter which thread you use it on. Multiple threads can synchronize |
| * on the same stream. Although the CUDA documentation is not very clear |
| * on the matter, streams are thread safe; e.g., it is safe to enqueue |
| * a kernel on the same stream from two different threads. |
| */ |
| |
| namespace c10 { |
| namespace cuda { |
| |
| // Value object representing a CUDA stream. This is just a wrapper |
| // around c10::Stream, but it comes with a little extra CUDA-specific |
| // functionality (conversion to cudaStream_t), and a guarantee that |
| // the wrapped c10::Stream really is a CUDA stream. |
| class C10_CUDA_API CUDAStream { |
| public: |
| enum Unchecked { UNCHECKED }; |
| |
| /// Construct a CUDAStream from a Stream. This construction is checked, |
| /// and will raise an error if the Stream is not, in fact, a CUDA stream. |
  explicit CUDAStream(Stream stream) : stream_(stream) {
    TORCH_CHECK(
        stream_.device_type() == DeviceType::CUDA,
        "CUDAStream constructed from a stream whose device type is not CUDA");
  }
| |
| /// Construct a CUDAStream from a Stream with no error checking. |
| /// This constructor uses the "named" constructor idiom, and can |
| /// be invoked as: CUDAStream(CUDAStream::UNCHECKED, stream) |
| explicit CUDAStream(Unchecked, Stream stream) : stream_(stream) {} |
| |
| bool operator==(const CUDAStream& other) const noexcept { |
| return unwrap() == other.unwrap(); |
| } |
| |
| bool operator!=(const CUDAStream& other) const noexcept { |
| return unwrap() != other.unwrap(); |
| } |
| |
| /// Implicit conversion to cudaStream_t. |
| operator cudaStream_t() const { |
| return stream(); |
| } |
| |
| /// Implicit conversion to Stream (a.k.a., forget that the stream is a |
| /// CUDA stream). |
| operator Stream() const { |
| return unwrap(); |
| } |
| |
| /// Get the CUDA device index that this stream is associated with. |
| DeviceIndex device_index() const { |
| return stream_.device_index(); |
| } |
| |
| /// Get the full Device that this stream is associated with. The Device |
| /// is guaranteed to be a CUDA device. |
| Device device() const { |
| return Device(DeviceType::CUDA, device_index()); |
| } |
| |
| /// Return the stream ID corresponding to this particular stream. |
| StreamId id() const { |
| return stream_.id(); |
| } |
| |
  /// Return true if all work enqueued on this stream has completed, without
  /// blocking the calling thread.
  bool query() const {
| DeviceGuard guard{stream_.device()}; |
| cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream())); |
| |
| if (err == cudaSuccess) { |
| return true; |
| } else if (err != cudaErrorNotReady) { |
| C10_CUDA_CHECK(err); |
| } else { |
| // ignore and clear the error if not ready |
| (void)cudaGetLastError(); |
| } |
| |
| return false; |
| } |
| |
  /// Block the calling thread until all work enqueued on this stream has
  /// completed.
  void synchronize() const {
| DeviceGuard guard{stream_.device()}; |
| c10::cuda::stream_synchronize(stream()); |
| } |
| |
  /// Return this stream's CUDA priority. Lower numbers mean higher priority;
  /// see priority_range() below.
  int priority() const {
| DeviceGuard guard{stream_.device()}; |
| int priority = 0; |
| C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority)); |
| return priority; |
| } |
| |
| /// Explicit conversion to cudaStream_t. |
| cudaStream_t stream() const; |
| |
| /// Explicit conversion to Stream. |
| Stream unwrap() const { |
| return stream_; |
| } |
| |
  /// Reversibly pack a CUDAStream into a uint64_t representation. This may
  /// be helpful when storing a CUDAStream in a C struct, where you cannot
  /// conveniently place the CUDAStream object itself (which is morally
  /// equivalent, but unfortunately is not POD because it has constructors).
  ///
  /// The CUDAStream can be unpacked using unpack(). The format of
  /// the uint64_t is unspecified and may change.
| uint64_t pack() const noexcept { |
| return stream_.pack(); |
| } |
| |
  /// Unpack a CUDAStream from the uint64_t representation generated by
  /// pack().
| static CUDAStream unpack(uint64_t bits) { |
| return CUDAStream(Stream::unpack(bits)); |
| } |
| |
| static std::tuple<int, int> priority_range() { |
| // Note: this returns the range of priority **supported by PyTorch**, not |
| // the range of priority **supported by CUDA**. The former is a subset of |
| // the latter. Currently PyTorch only supports 0 and -1, which are "low" and |
| // "high" priority. |
| int least_priority, greatest_priority; |
| C10_CUDA_CHECK( |
| cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); |
| TORCH_INTERNAL_ASSERT( |
| least_priority >= 0, "Unexpected CUDA stream priority range"); |
| TORCH_INTERNAL_ASSERT( |
| greatest_priority <= -1, "Unexpected CUDA stream priority range"); |
| return std::make_tuple(0, -1); |
| } |
| |
| // Deleted for now; use CUDAEvent::block instead |
| // void synchronize_with(const CUDAEvent& event) const; |
| |
| private: |
| Stream stream_; |
| }; |
| |
| /** |
| * Get a new stream from the CUDA stream pool. You can think of this |
| * as "creating" a new stream, but no such creation actually happens; |
| * instead, streams are preallocated from the pool and returned in a |
| * round-robin fashion. |
| * |
 * You can request a stream from the high priority pool by setting
 * isHighPriority to true, or a stream for a specific device by setting
 * device (which defaults to the current device).
| */ |
C10_CUDA_API CUDAStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
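
// A minimal usage sketch (illustrative):
//
//   CUDAStream s = getStreamFromPool();        // current device, low priority
//   CUDAStream h = getStreamFromPool(true, 0); // device 0, high priority
//   // Work enqueued on s and h may overlap with work on the current stream,
//   // subject to the 32-stream aliasing described in the pool note above.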
| |
| /** |
| * Get a CUDAStream from a externally allocated one. |
| * |
| * This is mainly for interoperability with different libraries where we |
| * want to operate on a non-torch allocated stream for data exchange or similar |
| * purposes |
| */ |
C10_CUDA_API CUDAStream
getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
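
// An illustrative interop sketch. The returned CUDAStream does not take
// ownership of ext_stream; the caller remains responsible for keeping the
// raw stream alive and eventually destroying it:
//
//   cudaStream_t raw = nullptr;
//   C10_CUDA_CHECK(cudaStreamCreate(&raw));
//   CUDAStream wrapped = getStreamFromExternal(raw, /*device_index=*/0);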
| |
| /** |
| * Get the default CUDA stream, for the passed CUDA device, or for the |
| * current device if no device index is passed. The default stream is |
| * where most computation occurs when you aren't explicitly using |
| * streams. |
| */ |
C10_CUDA_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
| |
| /** |
| * Get the current CUDA stream, for the passed CUDA device, or for the |
| * current device if no device index is passed. The current CUDA stream |
| * will usually be the default CUDA stream for the device, but it may |
| * be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard' |
| * or 'CUDAStreamGuard'. |
| */ |
C10_CUDA_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
| |
| /** |
| * Set the current stream on the device of the passed in stream to be |
| * the passed in stream. Yes, you read that right: this function |
| * has *nothing* to do with the current device: it toggles the current |
| * stream of the device of the passed stream. |
| * |
 * Confused? Avoid using this function; prefer 'CUDAStreamGuard' instead
 * (which will switch both your current device and current stream in the way
 * you expect, and reset them back to their original state afterwards).
| */ |
C10_CUDA_API void setCurrentCUDAStream(CUDAStream stream);
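
// Prefer the guard-based alternative. An illustrative sketch (CUDAStreamGuard
// is declared in c10/cuda/CUDAGuard.h):
//
//   {
//     CUDAStreamGuard guard(getStreamFromPool());
//     // The stream's device is now the current device, and the stream is
//     // the current stream on that device; kernels launched here target it.
//   } // the previous device and stream are restored here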
| |
C10_CUDA_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);
| |
| } // namespace cuda |
| } // namespace c10 |
| |
| namespace std { |
| template <> |
| struct hash<c10::cuda::CUDAStream> { |
| size_t operator()(c10::cuda::CUDAStream s) const noexcept { |
| return std::hash<c10::Stream>{}(s.unwrap()); |
| } |
| }; |
| } // namespace std |
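
// With the specialization above, CUDAStreams can be used directly as keys in
// unordered containers, e.g. (illustrative):
//
//   std::unordered_set<c10::cuda::CUDAStream> in_flight;
//   in_flight.insert(c10::cuda::getCurrentCUDAStream());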