torch/csrc/autograd/profiler_kineto.h - platform/external/pytorch - Git at Google

 #pragma once

 #include <string>
 #include <vector>

 #include <torch/csrc/profiler/api.h>
 #include <torch/csrc/profiler/events.h>
 #include <torch/csrc/profiler/stubs/base.h>
 #include <torch/csrc/profiler/util.h>

 namespace torch {
 namespace profiler {
 namespace impl {
 struct Result;
 namespace kineto {
 struct ActivityTraceWrapper;
 } // namespace kineto
 } // namespace impl
 } // namespace profiler
 namespace autograd {
 namespace profiler {
 using experimental_event_t = std::shared_ptr<torch::profiler::impl::Result>;

 struct TORCH_API KinetoEvent {
   KinetoEvent(
       std::shared_ptr<const torch::profiler::impl::Result>,
       const bool verbose);

   uint64_t startThreadId() const;
   uint64_t endThreadId() const;
   uint8_t activityType() const;
   uint64_t fwdThreadId() const;
   bool hasShapes() const;
   const c10::ArrayRef<std::vector<int64_t>> shapes() const;
   bool hasTypes() const;
   const c10::ArrayRef<std::string> dtypes() const;
   uint64_t flops() const;
   int64_t sequenceNr() const;
   bool hasStack() const;
   const c10::ArrayRef<std::string> stack() const;
   uint8_t scope() const;
   bool hasModuleHierarchy() const;
   const c10::ArrayRef<std::string> moduleHierarchy() const;
   int64_t debugHandle() const;
   std::string name() const;
   c10::DeviceType deviceType() const;
   uint8_t deviceIndex() const;
   int64_t nBytes() const;
   uint64_t startUs() const;
   uint64_t durationUs() const;
   bool isAsync() const;
   uint64_t correlationId() const;
   uint64_t linkedCorrelationId() const;
   int64_t deviceResourceId() const;
   std::string backend() const;
   bool isPythonFunction() const;
   int64_t cudaElapsedUs() const;
   void getPerfEventCounters(torch::profiler::perf_counters_t&) const;

  private:
   torch::profiler::impl::ProfilerEventStub fallbackStart() const;
   torch::profiler::impl::ProfilerEventStub fallbackEnd() const;

   std::shared_ptr<const torch::profiler::impl::Result> result_;
   std::vector<std::string> python_stack_;

   // Copy fields from result so we can return ArrayRefs.
   std::vector<std::vector<int64_t>> shapes_;
   std::vector<std::string> dtypes_;
 };

 // Consolidating events returned directly from Kineto
 // with events manually created by us (e.g. start/stop marks,
 // memory allocation events)
 struct TORCH_API ProfilerResult {
   ProfilerResult();
   ProfilerResult(
       uint64_t start_time,
       std::vector<KinetoEvent> events,
       std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>&&
           trace,
       std::vector<experimental_event_t>&& event_tree);
   ~ProfilerResult();

   uint64_t trace_start_us() const {
     return trace_start_us_;
   }

   const std::vector<KinetoEvent>& events() const {
     return events_;
   }

   const std::vector<experimental_event_t>& event_tree() const {
     return event_tree_;
   }

   void save(const std::string& path);

  private:
   uint64_t trace_start_us_ = 0;
   std::vector<KinetoEvent> events_;
   std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper> trace_;
   std::vector<experimental_event_t> event_tree_;
 };

 /*
  * This API is used by backends to record latency of events that
  * happened in the backend but were not visible to pytorch runtime.
  * For example, if part of the model is lowered to a dsp backend, then
  * the execution of that part of the model is delegated to the backend.
  * When backend finishes execution it has an option to provide profiling
  * information (latency only at the moment) corresponding to different operators
  * that were executed in the backend.
  * When such events are recorded by backend using this API, the event
  * records will be collected by active kineto profiler. If no kineto profiler
  * is active then the event is ignored.
  * This provides us with a way to generate all the profiling information
  * for a model regardless of where model (or part of it) executed.
  * @param start_time_us: start time in us of the event
  * @param end_time_us: end time in us of the event
  * @param debug_handle: debug handle to correlate this event/op with
  * model level module/source information
  * @param scope: scope of the event, e.g. LITE_INTERPRETER, RECORD_FN etc.
  * @param event_name: name of the event, e.g. op name
  * @param backend_name: name of the backend where the event took place.
  */
 TORCH_API void reportBackendEventToActiveKinetoProfiler(
     const int64_t start_time_us,
     const int64_t end_time_us,
     const int64_t debug_handle,
     const at::RecordScope scope,
     const std::string& event_name,
     const std::string& backend_name);

 TORCH_API void enableProfiler(
     const torch::profiler::impl::ProfilerConfig& config,
     const std::set<torch::profiler::impl::ActivityType>& activities,
     const std::unordered_set<at::RecordScope>& scopes = {});

 /*
  * Same as enableProfiler but with callback to do post-processing of
  * KinetoEvents.
  * enableProfilerWithEventPostProcess enables profiler to capture
  * specified activities, with specified RecordFunction scope, if any.
  * Additionally, it takes a functor that does in-place post processing of
  * events, e.g. populate stack trace or module hierarchy information lazily
  * using debug_handle.
  * Example usage is with lite interpreter that has recording scope of
  * LITE_INTERPRETER. In this case lite interpreter runtime, records debug
  * handles in RecordFunction, along with other information. Debug handles are
  * eventually passed down to KinetoEvent and recorded as part of the event.
  * KinetoEdgeCPUProfiler, in torch/csrc/jit/mobile/profiler_edge.cpp, enables
  * profiler using post-processing callback, via
  * enableProfilerWithEventPostProcess, that takes these debug handles and
  * generates stack trace and module hierarchy information, once profiling is
  * done.
  */
 using post_process_t = std::function<void(
     /*debug_handle */ int64_t,
     /*jit_stack    */ std::vector<std::string>&,
     /*jit_modules  */ std::vector<std::string>&)>;
 TORCH_API void enableProfilerWithEventPostProcess(
     const torch::profiler::impl::ProfilerConfig& config,
     const std::set<torch::profiler::impl::ActivityType>& activities,
     post_process_t&& cb,
     const std::unordered_set<at::RecordScope>& scopes = {});

 TORCH_API std::unique_ptr<ProfilerResult> disableProfiler();

 TORCH_API void prepareProfiler(
     const torch::profiler::impl::ProfilerConfig& config,
     const std::set<torch::profiler::impl::ActivityType>& activities);

 } // namespace profiler
 } // namespace autograd

 namespace profiler {
 namespace impl {

 // Experimental.
 TORCH_API void _reportVulkanEventToProfiler(vulkan_id_t id);

 } // namespace impl
 } // namespace profiler

 } // namespace torch
	#pragma once

	#include <string>
	#include <vector>

	#include <torch/csrc/profiler/api.h>
	#include <torch/csrc/profiler/events.h>
	#include <torch/csrc/profiler/stubs/base.h>
	#include <torch/csrc/profiler/util.h>

	namespace torch {
	namespace profiler {
	namespace impl {
	struct Result;
	namespace kineto {
	struct ActivityTraceWrapper;
	} // namespace kineto
	} // namespace impl
	} // namespace profiler
	namespace autograd {
	namespace profiler {
	using experimental_event_t = std::shared_ptr<torch::profiler::impl::Result>;

	struct TORCH_API KinetoEvent {
	KinetoEvent(
	std::shared_ptr<const torch::profiler::impl::Result>,
	const bool verbose);

	uint64_t startThreadId() const;
	uint64_t endThreadId() const;
	uint8_t activityType() const;
	uint64_t fwdThreadId() const;
	bool hasShapes() const;
	const c10::ArrayRef<std::vector<int64_t>> shapes() const;
	bool hasTypes() const;
	const c10::ArrayRef<std::string> dtypes() const;
	uint64_t flops() const;
	int64_t sequenceNr() const;
	bool hasStack() const;
	const c10::ArrayRef<std::string> stack() const;
	uint8_t scope() const;
	bool hasModuleHierarchy() const;
	const c10::ArrayRef<std::string> moduleHierarchy() const;
	int64_t debugHandle() const;
	std::string name() const;
	c10::DeviceType deviceType() const;
	uint8_t deviceIndex() const;
	int64_t nBytes() const;
	uint64_t startUs() const;
	uint64_t durationUs() const;
	bool isAsync() const;
	uint64_t correlationId() const;
	uint64_t linkedCorrelationId() const;
	int64_t deviceResourceId() const;
	std::string backend() const;
	bool isPythonFunction() const;
	int64_t cudaElapsedUs() const;
	void getPerfEventCounters(torch::profiler::perf_counters_t&) const;

	private:
	torch::profiler::impl::ProfilerEventStub fallbackStart() const;
	torch::profiler::impl::ProfilerEventStub fallbackEnd() const;

	std::shared_ptr<const torch::profiler::impl::Result> result_;
	std::vector<std::string> python_stack_;

	// Copy fields from result so we can return ArrayRefs.
	std::vector<std::vector<int64_t>> shapes_;
	std::vector<std::string> dtypes_;
	};

	// Consolidating events returned directly from Kineto
	// with events manually created by us (e.g. start/stop marks,
	// memory allocation events)
	struct TORCH_API ProfilerResult {
	ProfilerResult();
	ProfilerResult(
	uint64_t start_time,
	std::vector<KinetoEvent> events,
	std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper>&&
	trace,
	std::vector<experimental_event_t>&& event_tree);
	~ProfilerResult();

	uint64_t trace_start_us() const {
	return trace_start_us_;
	}

	const std::vector<KinetoEvent>& events() const {
	return events_;
	}

	const std::vector<experimental_event_t>& event_tree() const {
	return event_tree_;
	}

	void save(const std::string& path);

	private:
	uint64_t trace_start_us_ = 0;
	std::vector<KinetoEvent> events_;
	std::unique_ptr<torch::profiler::impl::kineto::ActivityTraceWrapper> trace_;
	std::vector<experimental_event_t> event_tree_;
	};

	/*
	* This API is used by backends to record latency of events that
	* happened in the backend but were not visible to pytorch runtime.
	* For example, if part of the model is lowered to a dsp backend, then
	* the execution of that part of the model is delegated to the backend.
	* When backend finishes execution it has an option to provide profiling
	* information (latency only at the moment) corresponding to different operators
	* that were executed in the backend.
	* When such events are recorded by backend using this API, the event
	* records will be collected by active kineto profiler. If no kineto profiler
	* is active then the event is ignored.
	* This provides us with a way to generate all the profiling information
	* for a model regardless of where model (or part of it) executed.
	* @param start_time_us: start time in us of the event
	* @param end_time_us: end time in us of the event
	* @param debug_handle: debug handle to correlate this event/op with
	* model level module/source information
	* @param scope: scope of the event, e.g. LITE_INTERPRETER, RECORD_FN etc.
	* @param event_name: name of the event, e.g. op name
	* @param backend_name: name of the backend where the event took place.
	*/
	TORCH_API void reportBackendEventToActiveKinetoProfiler(
	const int64_t start_time_us,
	const int64_t end_time_us,
	const int64_t debug_handle,
	const at::RecordScope scope,
	const std::string& event_name,
	const std::string& backend_name);

	TORCH_API void enableProfiler(
	const torch::profiler::impl::ProfilerConfig& config,
	const std::set<torch::profiler::impl::ActivityType>& activities,
	const std::unordered_set<at::RecordScope>& scopes = {});

	/*
	* Same as enableProfiler but with callback to do post-processing of
	* KinetoEvents.
	* enableProfilerWithEventPostProcess enables profiler to capture
	* specified activities, with specified RecordFunction scope, if any.
	* Additionally, it takes a functor that does in-place post processing of
	* events, e.g. populate stack trace or module hierarchy information lazily
	* using debug_handle.
	* Example usage is with lite interpreter that has recording scope of
	* LITE_INTERPRETER. In this case lite interpreter runtime, records debug
	* handles in RecordFunction, along with other information. Debug handles are
	* eventually passed down to KinetoEvent and recorded as part of the event.
	* KinetoEdgeCPUProfiler, in torch/csrc/jit/mobile/profiler_edge.cpp, enables
	* profiler using post-processing callback, via
	* enableProfilerWithEventPostProcess, that takes these debug handles and
	* generates stack trace and module hierarchy information, once profiling is
	* done.
	*/
	using post_process_t = std::function<void(
	/debug_handle / int64_t,
	/jit_stack / std::vector<std::string>&,
	/jit_modules / std::vector<std::string>&)>;
	TORCH_API void enableProfilerWithEventPostProcess(
	const torch::profiler::impl::ProfilerConfig& config,
	const std::set<torch::profiler::impl::ActivityType>& activities,
	post_process_t&& cb,
	const std::unordered_set<at::RecordScope>& scopes = {});

	TORCH_API std::unique_ptr<ProfilerResult> disableProfiler();

	TORCH_API void prepareProfiler(
	const torch::profiler::impl::ProfilerConfig& config,
	const std::set<torch::profiler::impl::ActivityType>& activities);

	} // namespace profiler
	} // namespace autograd

	namespace profiler {
	namespace impl {

	// Experimental.
	TORCH_API void _reportVulkanEventToProfiler(vulkan_id_t id);

	} // namespace impl
	} // namespace profiler

	} // namespace torch