| import itertools |
| import torch |
| from torch.autograd import DeviceType |
| |
| from collections import defaultdict, namedtuple |
| from operator import attrgetter |
| |
| from typing import Dict, List, Tuple, Optional |
| |
import bisect
import math
import os
| |
| |
| class EventList(list): |
| """A list of Events (for pretty printing)""" |
| def __init__(self, *args, **kwargs): |
| use_cuda = kwargs.pop('use_cuda', True) |
| profile_memory = kwargs.pop('profile_memory', False) |
| with_flops = kwargs.pop('with_flops', False) |
| super(EventList, self).__init__(*args, **kwargs) |
| self._use_cuda = use_cuda |
| self._profile_memory = profile_memory |
| self._tree_built = False |
| self._with_flops = with_flops |
| |
| def _build_tree(self): |
| self._populate_cpu_children() |
| self._remove_dup_nodes() |
| self._set_backward_stacktraces() |
| self._tree_built = True |
| |
| def __str__(self): |
| return self.table() |
| |
| def _remove_dup_nodes(self): |
| while True: |
| to_delete = set() |
| for idx in range(len(self)): |
| if (self[idx].cpu_parent is not None and |
| self[idx].cpu_parent.name == self[idx].name and |
| len(self[idx].cpu_parent.cpu_children) == 1): |
| self[idx].cpu_parent.cpu_children = self[idx].cpu_children |
| self[idx].cpu_parent.kernels = self[idx].kernels # lift kernels up |
| for ch in self[idx].cpu_children: |
| ch.cpu_parent = self[idx].cpu_parent |
| to_delete.add(idx) |
| if len(to_delete) == 0: |
| break |
| new_evts = [ev for ind, ev in enumerate(self) if ind not in to_delete] |
| self.clear() |
| self.extend(new_evts) |
| |
| def _populate_cpu_children(self): |
| """Populates child events into each underlying FunctionEvent object. |
        One event is a child of another if [s1, e1) is inside [s2, e2), where
        s1 and e1 are the start and end of the child event's interval, and
        s2 and e2 are the start and end of the parent event's interval.

        Example: in the event list [[0, 10], [1, 3], [3, 4]], the interval
        [0, 10] becomes the parent of the two other intervals.
| |
        If two intervals intersect only partially, this function will not
        record a parent-child relationship between them.
| """ |
| |
| # Some events can be async (i.e. start and end on different threads), |
| # since it's generally undefined how to attribute children ranges to |
| # async ranges, we do not use them when calculating nested ranges and stats |
| sync_events = [evt for evt in self if not evt.is_async and evt.device_type == DeviceType.CPU] |
| events = sorted( |
| sync_events, |
| key=attrgetter("thread"), |
| ) |
| # Group by both thread and node_id, so that events that happen to have |
| # the same thread_id but are from different nodes aren't incorrectly |
| # grouped together. |
| threads = itertools.groupby( |
| events, key=lambda event: (event.thread, event.node_id) |
| ) |
| |
| # For each thread we keep a stack of current nested parents. |
| # We maintain the invariant that each interval is a subset of all other |
| # intervals lower in the stack. |
| # |
        # First we sort the intervals by their start time. Then we iterate over
        # them. Every time we see a new interval we pop parents from the top of
        # the stack until the invariant is restored. A parent-child relationship
        # is then recorded if the stack is not empty.
        # Finally we push the new interval onto the stack.
| # |
| # Algorithm has O(N * log(N)) complexity where N is number of |
| # intervals |
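        #
        # Worked trace of the docstring example [[0, 10], [1, 3], [3, 4]]:
        # [0, 10] is pushed first; [1, 3] nests under it and is pushed;
        # [3, 4] pops [1, 3] (its start 3 >= end 3) and then also nests
        # under [0, 10].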
| for thread_id, thread_events in threads: |
| thread_events_ = sorted( |
| thread_events, |
| key=lambda event: [event.time_range.start, -event.time_range.end], |
| ) |
| current_events: List[FunctionEvent] = [] |
| for event in thread_events_: |
| while len(current_events) > 0: |
| parent = current_events[-1] |
| if event.time_range.start >= parent.time_range.end or \ |
| event.time_range.end > parent.time_range.end: |
| # this can't be a parent |
| current_events.pop() |
| else: |
| parent.append_cpu_child(event) |
| assert ( |
| event.cpu_parent is None |
| ), "There is already a CPU parent event for {}".format( |
| event.key |
| ) |
| event.set_cpu_parent(parent) |
| break |
| |
| current_events.append(event) |
| |
| def _set_backward_stacktraces(self): |
| def bw_parent(evt): |
| if evt is None: |
| return None |
| elif evt.scope == 1: # BACKWARD_FUNCTION |
| return evt |
| else: |
| return bw_parent(evt.cpu_parent) |
| |
| fwd_stacks = {} |
| for evt in self: |
| if bw_parent(evt) is None and evt.stack is not None: |
| t = (evt.sequence_nr, evt.thread) |
| if t not in fwd_stacks: |
| fwd_stacks[t] = evt.stack |
| |
| for evt in self: |
| p = bw_parent(evt) |
| if p is not None: |
| assert p.fwd_thread is not None |
| t = (p.sequence_nr, p.fwd_thread) |
| if t in fwd_stacks: |
| evt.stack = fwd_stacks[t] |
| else: |
| evt.stack = [] |
| |
| @property |
| def self_cpu_time_total(self): |
| return sum([event.self_cpu_time_total for event in self]) |
| |
| def table( |
| self, |
| sort_by=None, |
| row_limit=100, |
| max_src_column_width=75, |
| max_name_column_width=55, |
| max_shapes_column_width=80, |
| header=None, |
| top_level_events_only=False |
| ): |
| """Prints an EventList as a nicely formatted table. |
| |
| Args: |
| sort_by (str, optional): Attribute used to sort entries. By default |
| they are printed in the same order as they were registered. |
| Valid keys include: ``cpu_time``, ``cuda_time``, ``cpu_time_total``, |
| ``cuda_time_total``, ``cpu_memory_usage``, ``cuda_memory_usage``, |
| ``self_cpu_memory_usage``, ``self_cuda_memory_usage``, ``count``. |
            top_level_events_only(bool, optional): Boolean flag to determine the
                selection of events to display. If True, only top-level events,
                such as the top-level invocations of python ``lstm``, ``add``
                or other functions, are displayed; nested events such as
                low-level cpu/cuda ops are omitted for readability.
| |
| Returns: |
| A string containing the table. |
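
        Example::

            # a minimal usage sketch; ``model`` and ``inputs`` are placeholders
            with torch.autograd.profiler.profile() as prof:
                model(inputs)
            print(prof.key_averages().table(
                sort_by="self_cpu_time_total", row_limit=10))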
| """ |
| return _build_table( |
| self, |
| sort_by=sort_by, |
| row_limit=row_limit, |
| max_src_column_width=max_src_column_width, |
| max_name_column_width=max_name_column_width, |
| max_shapes_column_width=max_shapes_column_width, |
| header=header, |
| profile_memory=self._profile_memory, |
| with_flops=self._with_flops, |
| top_level_events_only=top_level_events_only) |
| |
| def export_chrome_trace(self, path): |
| """Exports an EventList as a Chrome tracing tools file. |
| |
        The resulting trace can later be loaded and inspected under the
        ``chrome://tracing`` URL.
| |
| Args: |
| path (str): Path where the trace will be written. |
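
        Example::

            # a minimal usage sketch; ``model`` and ``inputs`` are placeholders
            with torch.autograd.profiler.profile() as prof:
                model(inputs)
            prof.export_chrome_trace("trace.json")
            # open chrome://tracing and load trace.json to inspect the timeline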
| """ |
| with open(path, 'w') as f: |
| chrome_events = [] |
| next_id = 0 |
            # Use raw file IO instead of json.dump, since JSON serialization
            # here is very slow; this technique gives roughly a 4x speedup.
| f.write("[") |
| for evt in self: |
| if evt.trace_name is None: |
| continue |
| f.write( |
| '{"name": "%s", ' |
| '"ph": "X", ' |
| '"ts": %s, ' |
| '"dur": %s, ' |
| '"tid": %s, ' |
| '"pid": "CPU functions", ' |
| '"args": {}}, ' |
| % ( |
| evt.trace_name, |
| evt.time_range.start, |
| evt.time_range.elapsed_us(), |
| evt.thread |
| if not evt.is_remote |
| else f'" node_id:{evt.node_id}, thread_id:{evt.thread} "', |
| ) |
| ) |
| for k in evt.kernels: |
| # 's' and 'f' draw Flow arrows from |
| # the CPU launch to the GPU kernel |
| f.write('{"name": "%s", ' |
| '"ph": "s", ' |
| '"ts": %s, ' |
| '"tid": %s, ' |
| '"pid": "CPU functions", ' |
| '"id": %s, ' |
| '"cat": "cpu_to_cuda", ' |
| '"args": {}}, ' % (evt.trace_name, evt.time_range.start, |
| evt.thread, next_id)) |
| # Note: use torch.profiler to get device kernel trace |
| next_id += 1 |
| if len(self) > 0: |
| # remove trailing whitespace and comma |
| f.seek(f.tell() - 2, os.SEEK_SET) |
| f.truncate() |
| f.write("]") |
| |
| def supported_export_stacks_metrics(self): |
| return ["self_cpu_time_total", "self_cuda_time_total"] |
| |
| def export_stacks(self, path: str, metric: str): |
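        """Saves stack traces to a file in a format suitable for visualization.

        Args:
            path (str): output file location.
            metric (str): one of ``self_cpu_time_total`` or ``self_cuda_time_total``.

        The output has one semicolon-separated stack plus the metric value per
        line (collapsed-stack format), which flamegraph tools such as Brendan
        Gregg's flamegraph.pl can consume. Note that stacks are only recorded
        when profiling with ``with_stack=True``.
        """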
| if metric not in self.supported_export_stacks_metrics(): |
| raise ValueError("metric should be one of: " + str(self.supported_export_stacks_metrics())) |
| translate_table = str.maketrans(" ;\t\n", "____") |
| with open(path, 'w') as f: |
| for evt in self: |
| if evt.stack and len(evt.stack) > 0: |
| metric_value = getattr(evt, metric) |
| if int(metric_value) > 0: |
| stack_str = "" |
| for entry in reversed(evt.stack): |
| stack_str += entry.translate(translate_table) |
| stack_str += ";" |
| stack_str = stack_str[:-1] + " " + str(int(metric_value)) |
| f.write(stack_str + "\n") |
| |
| def key_averages(self, group_by_input_shapes=False, group_by_stack_n=0): |
| """Averages all function events over their keys. |
| |
| Args: |
| group_by_input_shapes: group entries by |
| (event name, input shapes) rather than just event name. |
                This is useful for seeing which input shapes contribute most to
                the runtime, and may help with size-specific optimizations or
                with choosing the best candidates for quantization (i.e. fitting
                a roofline model).
| |
| group_by_stack_n: group by top n stack trace entries |
| |
| Returns: |
| An EventList containing FunctionEventAvg objects. |
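
        Example::

            # sketch: aggregate by (event name, input shapes) to spot
            # shape-dependent hotspots; ``prof`` is a finished profile
            averages = prof.key_averages(group_by_input_shapes=True)
            print(averages.table(sort_by="self_cpu_time_total"))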
| """ |
| assert self._tree_built |
| stats: Dict[Tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg) |
| |
| def get_key(event, group_by_input_shapes, group_by_stack_n) -> Tuple[str, ...]: |
| key = [str(event.key), str(event.node_id), str(event.device_type), str(event.is_legacy)] |
| if group_by_input_shapes: |
| key.append(str(event.input_shapes)) |
| if group_by_stack_n > 0: |
| key += event.stack[:group_by_stack_n] |
| return tuple(key) |
| for evt in self: |
| stats[get_key(evt, group_by_input_shapes, group_by_stack_n)].add(evt) |
| |
| avg_list = EventList( |
| stats.values(), |
| use_cuda=self._use_cuda, |
| profile_memory=self._profile_memory, |
| with_flops=self._with_flops) |
| for evt in avg_list: |
| evt.stack = evt.stack[:group_by_stack_n] |
| if not group_by_input_shapes: |
| evt.input_shapes = "" |
| return avg_list |
| |
| def total_average(self): |
| """Averages all events. |
| |
| Returns: |
| A FunctionEventAvg object. |
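
        Example::

            # sketch: one aggregate row over all recorded events
            # (``events`` is this EventList)
            avg = events.total_average()
            print(avg.key, avg.cpu_time_total_str)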
| """ |
| total_stat = FunctionEventAvg() |
| for evt in self: |
| total_stat += evt |
            total_stat.key = None
| total_stat.key = 'Total' |
| return total_stat |
| |
| |
| def _format_time(time_us): |
| """Defines how to format time in FunctionEvent""" |
| US_IN_SECOND = 1000.0 * 1000.0 |
| US_IN_MS = 1000.0 |
| if time_us >= US_IN_SECOND: |
| return '{:.3f}s'.format(time_us / US_IN_SECOND) |
| if time_us >= US_IN_MS: |
| return '{:.3f}ms'.format(time_us / US_IN_MS) |
| return '{:.3f}us'.format(time_us) |
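
# For reference: _format_time(1500000) == '1.500s', _format_time(2500) == '2.500ms',
# and _format_time(42) == '42.000us' (inputs are in microseconds).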
| |
| def _format_time_share(time_us, total_time_us): |
| """Defines how to format time in FunctionEvent""" |
| if total_time_us == 0: |
| assert time_us == 0, "Expected time_us == 0 but got {}".format(time_us) |
| return "NaN" |
| return '{:.2f}%'.format(time_us * 100.0 / total_time_us) |
| |
| def _format_memory(nbytes): |
| """Returns a formatted memory size string""" |
| KB = 1024 |
| MB = 1024 * KB |
| GB = 1024 * MB |
| if (abs(nbytes) >= GB): |
| return '{:.2f} Gb'.format(nbytes * 1.0 / GB) |
| elif (abs(nbytes) >= MB): |
| return '{:.2f} Mb'.format(nbytes * 1.0 / MB) |
| elif (abs(nbytes) >= KB): |
| return '{:.2f} Kb'.format(nbytes * 1.0 / KB) |
| else: |
| return str(nbytes) + ' b' |
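
# For reference: _format_memory(3 * 1024 ** 2) == '3.00 Mb' and
# _format_memory(512) == '512 b'; negative sizes keep their sign.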
| |
| def _attr_formatter(name): |
| return property(lambda self: _format_time(getattr(self, name))) |
| |
| |
| class FormattedTimesMixin(object): |
| """Helpers for FunctionEvent and FunctionEventAvg. |
| |
| The subclass should define `*_time_total` and `count` attributes. |
| """ |
| cpu_time_str = _attr_formatter('cpu_time') |
| cuda_time_str = _attr_formatter('cuda_time') |
| cpu_time_total_str = _attr_formatter('cpu_time_total') |
| cuda_time_total_str = _attr_formatter('cuda_time_total') |
| self_cpu_time_total_str = _attr_formatter('self_cpu_time_total') |
| self_cuda_time_total_str = _attr_formatter('self_cuda_time_total') |
| |
| @property |
| def cpu_time(self): |
| return 0.0 if self.count == 0 else 1.0 * self.cpu_time_total / self.count # type: ignore[attr-defined] |
| |
| @property |
| def cuda_time(self): |
| return 0.0 if self.count == 0 else 1.0 * self.cuda_time_total / self.count # type: ignore[attr-defined] |
| |
| |
| class Interval(object): |
| def __init__(self, start, end): |
| self.start = start |
| self.end = end |
| |
| def elapsed_us(self): |
| return self.end - self.start |
| |
| |
| Kernel = namedtuple('Kernel', ['name', 'device', 'duration']) |
| |
| |
| class FunctionEvent(FormattedTimesMixin): |
| """Profiling information about a single function.""" |
| def __init__( |
| self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, |
| stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, |
| is_remote=False, sequence_nr=-1, node_id=-1, device_type=DeviceType.CPU, device_index=0, |
| is_legacy=False, flops=None, trace_name=None): |
| self.id: int = id |
| self.node_id: int = node_id |
| self.name: str = name |
| self.trace_name: str = trace_name |
| self.time_range: Interval = Interval(start_us, end_us) |
| self.thread: int = thread |
| self.fwd_thread: Optional[int] = fwd_thread |
| self.kernels: List[Kernel] = [] |
| self.count: int = 1 |
| self.cpu_children: List[FunctionEvent] = [] |
| self.cpu_parent: Optional[FunctionEvent] = None |
| self.input_shapes: Tuple[int, ...] = input_shapes |
| self.stack: List = stack |
| self.scope: int = scope |
| self.cpu_memory_usage: int = cpu_memory_usage |
| self.cuda_memory_usage: int = cuda_memory_usage |
| self.is_async: bool = is_async |
| self.is_remote: bool = is_remote |
| self.sequence_nr: int = sequence_nr |
| self.device_type: DeviceType = device_type |
| self.device_index: int = device_index |
| self.is_legacy: bool = is_legacy |
| self.flops: Optional[int] = flops |
| |
| def append_kernel(self, name, device, duration): |
| assert self.device_type == DeviceType.CPU |
| self.kernels.append(Kernel(name, device, duration)) |
| |
| def append_cpu_child(self, child): |
| """Append a CPU child of type FunctionEvent. |
| |
        Only direct children should be appended to an event, so that the
        self CPU time is reported correctly.
| """ |
| assert(self.device_type == DeviceType.CPU) |
| assert(isinstance(child, FunctionEvent)) |
| assert(child.device_type == DeviceType.CPU) |
| self.cpu_children.append(child) |
| |
| def set_cpu_parent(self, parent): |
| """Set the immediate CPU parent of type FunctionEvent |
| |
| One profiling FunctionEvent should have only one CPU parent such that |
| the child's range interval is completely inside the parent's. We use |
| this connection to determine the event is from top-level op or not. |
| """ |
| assert(self.device_type == DeviceType.CPU) |
| assert(isinstance(parent, FunctionEvent)) |
| assert(parent.device_type == DeviceType.CPU) |
| self.cpu_parent = parent |
| |
    # Note: async events don't have children, are not used when computing
    # the 'self' metrics of other events, and only have a total cpu time
| @property |
| def self_cpu_memory_usage(self): |
| if self.is_async or self.device_type != DeviceType.CPU: |
| return 0 |
| return self.cpu_memory_usage - sum( |
| [child.cpu_memory_usage for child in self.cpu_children] |
| ) |
| |
| @property |
| def self_cuda_memory_usage(self): |
| if self.is_async or self.device_type != DeviceType.CPU: |
| return 0 |
| return self.cuda_memory_usage - sum( |
| [child.cuda_memory_usage for child in self.cpu_children] |
| ) |
| |
| @property |
| def self_cpu_time_total(self): |
| if self.is_async or self.device_type != DeviceType.CPU: |
| return 0 |
| return self.cpu_time_total - sum( |
| [child.cpu_time_total for child in self.cpu_children] |
| ) |
| |
| @property |
| def cuda_time_total(self): |
| if self.is_async: |
| return 0 |
| if self.device_type == DeviceType.CPU: |
| if not self.is_legacy: |
| # account for the kernels in the children ops |
| return (sum(kinfo.duration for kinfo in self.kernels) + |
| sum(ch.cuda_time_total for ch in self.cpu_children)) |
| else: |
                # each legacy cpu event has a single (fake) kernel
| return sum(kinfo.duration for kinfo in self.kernels) |
| else: |
| assert self.device_type == DeviceType.CUDA |
| return self.time_range.elapsed_us() |
| |
| @property |
| def self_cuda_time_total(self): |
| if self.is_async: |
| return 0 |
| if self.device_type == DeviceType.CPU: |
| return self.cuda_time_total - \ |
| sum([child.cuda_time_total for child in self.cpu_children]) |
| else: |
| assert(self.device_type == DeviceType.CUDA) |
| return self.cuda_time_total |
| |
| @property |
| def cpu_time_total(self): |
| if self.device_type == DeviceType.CPU: |
| return self.time_range.elapsed_us() |
| else: |
| return 0 |
| |
| @property |
| def key(self): |
| return self.name |
| |
| def __repr__(self): |
| return ( |
            '<FunctionEvent id={} name={} device_type={} node_id={} cpu_time={} start_us={} end_us={} '
            'cpu_children={} cuda_time={} thread={} input_shapes={} '
| 'cpu_memory_usage={} cuda_memory_usage={} is_async={} is_remote={} seq_nr={} is_legacy={}>'.format( |
| self.id, |
| self.name, |
| self.device_type, |
| self.node_id, |
| self.cpu_time_str, |
| self.time_range.start, |
| self.time_range.end, |
| str([child.id for child in self.cpu_children]), |
| self.cuda_time_str, |
| self.thread, |
| str(self.input_shapes), |
| self.cpu_memory_usage, |
| self.cuda_memory_usage, |
| self.is_async, |
| self.is_remote, |
| self.sequence_nr, |
| self.is_legacy, |
| ) |
| ) |
| |
| |
| class FunctionEventAvg(FormattedTimesMixin): |
| """Used to average stats over multiple FunctionEvent objects.""" |
| def __init__(self): |
| self.key: Optional[str] = None |
| self.count: int = 0 |
| self.node_id: int = 0 |
| self.is_async: bool = False |
| self.is_remote: bool = False |
| self.cpu_time_total: int = 0 |
| self.cuda_time_total: int = 0 |
| self.self_cpu_time_total: int = 0 |
| self.self_cuda_time_total: int = 0 |
| self.input_shapes: Optional[List[List[int]]] = None |
| self.stack: Optional[List] = None |
| self.scope: Optional[int] = None |
| self.cpu_memory_usage: int = 0 |
| self.cuda_memory_usage: int = 0 |
| self.self_cpu_memory_usage: int = 0 |
| self.self_cuda_memory_usage: int = 0 |
| self.cpu_children: Optional[List[FunctionEvent]] = None |
| self.cpu_parent: Optional[FunctionEvent] = None |
| self.device_type: DeviceType = DeviceType.CPU |
| self.is_legacy: bool = False |
| self.flops: int = 0 |
| |
| def add(self, other): |
| if self.key is None: |
| # First function being recorded as part of FunctionEventAvg, propagate |
| # fields. |
| self.key = other.key |
| self.node_id = other.node_id |
| self.is_async = other.is_async |
| self.is_remote = other.is_remote |
| self.cpu_parent = other.cpu_parent |
| self.cpu_children = other.cpu_children |
| |
| self.input_shapes = other.input_shapes |
| self.stack = other.stack |
| self.scope = other.scope |
| self.device_type = other.device_type |
| self.is_legacy = other.is_legacy |
| |
| assert isinstance(other, (FunctionEvent, FunctionEventAvg)) |
| assert other.key == self.key |
| self.cpu_time_total += other.cpu_time_total |
| self.cuda_time_total += other.cuda_time_total |
| self.self_cpu_time_total += other.self_cpu_time_total |
| self.self_cuda_time_total += other.self_cuda_time_total |
| self.cpu_memory_usage += other.cpu_memory_usage |
| self.cuda_memory_usage += other.cuda_memory_usage |
| self.self_cpu_memory_usage += other.self_cpu_memory_usage |
| self.self_cuda_memory_usage += other.self_cuda_memory_usage |
| self.count += other.count |
| if self.flops is None: |
| self.flops = other.flops |
| elif other.flops is not None: |
| self.flops += other.flops |
| return self |
| |
| def __iadd__(self, other): |
| return self.add(other) |
| |
| def __repr__(self): |
| return ( |
| '<FunctionEventAvg key={} self_cpu_time={} cpu_time={} ' |
| ' self_cuda_time={} cuda_time={} input_shapes={} ' |
| 'cpu_memory_usage={} cuda_memory_usage={}>'.format( |
| self.key, |
| self.self_cpu_time_total_str, |
| self.cpu_time_str, |
| self.self_cuda_time_total_str, |
| self.cuda_time_str, |
| str(self.input_shapes), |
| self.cpu_memory_usage, |
| self.cuda_memory_usage, |
| ) |
| ) |
| |
| |
| class StringTable(defaultdict): |
| def __missing__(self, key): |
        # handle cases like 't' (demangled to 'unsigned short') separately;
        # for now simply check the length to avoid unexpected results for
        # short sequences
| self[key] = torch._C._demangle(key) if len(key) > 1 else key |
| return self[key] |
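
# Usage sketch: StringTable lazily demangles C++ symbol names on first access,
# e.g. StringTable()["_Z3addii"] returns "add(int, int)" via torch._C._demangle.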
| |
| |
| class MemRecordsAcc: |
| """Acceleration structure for accessing mem_records in interval""" |
| |
| def __init__(self, mem_records): |
| self._mem_records = mem_records |
| self._start_uses = [] |
| self._indices = [] |
| if len(mem_records) > 0: |
| tmp = sorted([(r[0].start_us(), i) for i, r in enumerate(mem_records)]) |
| self._start_uses, self._indices = zip(*tmp) |
| |
| def in_interval(self, start_us, end_us): |
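        # binary-search the pre-sorted start timestamps: yields every record
        # whose start time lies within [start_us, end_us] (both inclusive)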
| start_idx = bisect.bisect_left(self._start_uses, start_us) |
| end_idx = bisect.bisect_right(self._start_uses, end_us) |
| for i in range(start_idx, end_idx): |
| yield self._mem_records[self._indices[i]] |
| |
| |
| def _filter_stack_entry(entry): |
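    # returns True if the stack entry should be kept, i.e. it matches none of
    # the profiler-internal (filename, function) pairs below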
| filtered_entries = [ |
| ("autograd/__init__", "_make_grads"), |
| ("autograd/__init__", "backward"), |
| ("torch/tensor", "backward"), |
| ("_internal/common_utils", "prof_callable"), |
| ("_internal/common_utils", "prof_func_call"), |
| ("_internal/common_utils", "prof_meth_call"), |
| ] |
| return all([not (f[0] in entry and f[1] in entry) for f in filtered_entries]) |
| |
| MEMORY_EVENT_NAME = "[memory]" |
| OUT_OF_MEMORY_EVENT_NAME = "[OutOfMemory]" |
| |
| def _filter_name(name): |
| # ignoring the following utility ops |
| filtered_out_names = [ |
| MEMORY_EVENT_NAME, # used only for the top-level memory events |
| OUT_OF_MEMORY_EVENT_NAME, |
| "profiler::_record_function_enter", |
| "profiler::_record_function_enter_new", |
| "profiler::_record_function_exit", |
| "aten::is_leaf", |
| "aten::output_nr", |
| "aten::_version", |
| ] |
| return name in filtered_out_names |
| |
# Demangles and optionally rewrites the provided event name.
# with_wildcard - whether to replace certain numbered event names with a
# wildcard name, so that they are aggregated together in the profiler table
# output
| def _rewrite_name(name, with_wildcard=False): |
| string_table = StringTable() |
| name = string_table[name] |
| if with_wildcard: |
| if name.startswith("ProfilerStep#"): |
| name = "ProfilerStep*" |
| return name |
| |
| def _build_table( |
| events, |
| sort_by=None, |
| header=None, |
| row_limit=100, |
| max_src_column_width=75, |
| max_name_column_width=55, |
| max_shapes_column_width=80, |
| with_flops=False, |
| profile_memory=False, |
| top_level_events_only=False): |
| """Prints a summary of events (which can be a list of FunctionEvent or FunctionEventAvg).""" |
| if len(events) == 0: |
| return "" |
| |
| has_cuda_time = any([event.self_cuda_time_total > 0 for event in events]) |
| has_cuda_mem = any([event.self_cuda_memory_usage > 0 for event in events]) |
| has_input_shapes = any( |
| [(event.input_shapes is not None and len(event.input_shapes) > 0) for event in events]) |
| |
| if sort_by is not None: |
| events = EventList(sorted( |
| events, key=lambda evt: getattr(evt, sort_by), reverse=True |
| ), use_cuda=has_cuda_time, profile_memory=profile_memory, with_flops=with_flops) |
| |
| name_column_width = max([len(evt.key) for evt in events]) + 4 |
| if max_name_column_width is not None: |
| name_column_width = min(name_column_width, max_name_column_width) |
| |
| shapes_column_width = max([len(str(evt.input_shapes)) for evt in events]) + 4 |
| if max_shapes_column_width is not None: |
| shapes_column_width = min(shapes_column_width, max_shapes_column_width) |
| |
| DEFAULT_COLUMN_WIDTH = 12 |
| flops_column_width = DEFAULT_COLUMN_WIDTH |
| |
| src_column_width = None |
| stacks = [] |
| for evt in events: |
| if evt.stack is not None and len(evt.stack) > 0: |
| stacks.append(evt.stack) |
| has_stack = len(stacks) > 0 |
| if has_stack: |
| src_column_width = max([max([len(entry) for entry in stack]) for stack in stacks]) + 4 |
| if max_src_column_width is not None: |
| src_column_width = min(src_column_width, max_src_column_width) |
| |
| headers = [ |
| 'Name', |
| 'Self CPU %', |
| 'Self CPU', |
| 'CPU total %', |
| 'CPU total', |
| 'CPU time avg', |
| ] |
| if has_cuda_time: |
| headers.extend([ |
| 'Self CUDA', |
| 'Self CUDA %', |
| 'CUDA total', |
| 'CUDA time avg', |
| ]) |
| if profile_memory: |
| headers.extend([ |
| 'CPU Mem', |
| 'Self CPU Mem', |
| ]) |
| if has_cuda_mem: |
| headers.extend([ |
| 'CUDA Mem', |
| 'Self CUDA Mem', |
| ]) |
| headers.append( |
| '# of Calls' |
| ) |
| # Only append Node ID if any event has a valid (>= 0) Node ID |
| append_node_id = any([evt.node_id != -1 for evt in events]) |
| if append_node_id: |
| headers.append('Node ID') |
| |
    # Use single-element lists so the nested add_column closure below can
    # mutate these values.
| SPACING_SIZE = 2 |
| row_format_lst = [""] |
| header_sep_lst = [""] |
| line_length_lst = [-SPACING_SIZE] |
| MAX_STACK_ENTRY = 5 |
| |
| def add_column(padding, text_dir='>'): |
| row_format_lst[0] += '{: ' + text_dir + str(padding) + '}' + (' ' * SPACING_SIZE) |
| header_sep_lst[0] += '-' * padding + (' ' * SPACING_SIZE) |
| line_length_lst[0] += padding + SPACING_SIZE |
| |
| def auto_scale_flops(flops): |
| flop_headers = [ |
| 'FLOPs', |
| 'KFLOPs', |
| 'MFLOPs', |
| 'GFLOPs', |
| 'TFLOPs', |
| 'PFLOPs', |
| ] |
| assert flops > 0 |
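        # log10(flops) / 3 selects the unit: every factor of 1000 moves one
        # entry up in flop_headers, clamped to the valid index range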
| log_flops = max(0, min(math.log10(flops) / 3, float(len(flop_headers) - 1))) |
| assert log_flops >= 0 and log_flops < len(flop_headers) |
| return (pow(10, (math.floor(log_flops) * -3.0)), flop_headers[int(log_flops)]) |
| |
| add_column(name_column_width) |
| for _ in headers[1:]: |
| add_column(DEFAULT_COLUMN_WIDTH) |
| |
| if has_input_shapes: |
| headers.append('Input Shapes') |
| add_column(shapes_column_width) |
| |
| if has_stack: |
| headers.append('Source Location') |
| add_column(src_column_width, text_dir='<') |
| |
| if with_flops: |
| # Auto-scaling of flops header |
| raw_flops = [] |
| for evt in events: |
| if evt.flops > 0: |
| raw_flops.append(evt.flops) |
| if len(raw_flops) != 0: |
| (flops_scale, flops_header) = auto_scale_flops(min(raw_flops)) |
| headers.append('Total {}'.format(flops_header)) |
| add_column(flops_column_width) |
| else: |
| with_flops = False # can't find any valid flops |
| |
| row_format = row_format_lst[0] |
| header_sep = header_sep_lst[0] |
| line_length = line_length_lst[0] |
| add_column = None # type: ignore[assignment] |
| |
    # Collect output chunks in a list and join them once at the end.
| result = [] |
| |
| def append(s): |
| result.append(s) |
| result.append('\n') # Yes, newline after the end as well |
| |
| sum_self_cpu_time_total = sum([event.self_cpu_time_total for event in events]) |
| sum_self_cuda_time_total = 0 |
| for evt in events: |
| if evt.device_type == DeviceType.CPU: |
| # in legacy profiler, kernel info is stored in cpu events |
| if evt.is_legacy: |
| sum_self_cuda_time_total += evt.self_cuda_time_total |
| elif evt.device_type == DeviceType.CUDA: |
            # in the kineto profiler, there are events with the correct device type (e.g. CUDA)
| sum_self_cuda_time_total += evt.self_cuda_time_total |
| |
| # Actual printing |
| if header is not None: |
| append('=' * line_length) |
| append(header) |
| if top_level_events_only: |
| append('=' * line_length) |
        append('This report only displays top-level op statistics')
| append(header_sep) |
| append(row_format.format(*headers)) |
| |
| append(header_sep) |
| |
| def trim_path(path, src_column_width): |
| if len(path) > src_column_width: |
| offset = len(path) - src_column_width |
| path = path[offset:] |
| if len(path) > 3: |
| path = "..." + path[3:] |
| return path |
| |
| event_limit = 0 |
| for evt in events: |
| if event_limit == row_limit: |
| break |
| if top_level_events_only and evt.cpu_parent is not None: |
| continue |
| else: |
| event_limit += 1 |
| name = evt.key |
| if max_name_column_width is not None and len(name) >= max_name_column_width - 3: |
| name = name[:(max_name_column_width - 3)] + "..." |
| row_values = [ |
| name, |
| # Self CPU total %, 0 for async events. |
| _format_time_share(evt.self_cpu_time_total, sum_self_cpu_time_total), |
| evt.self_cpu_time_total_str, # Self CPU total |
| # CPU total %, 0 for async events. |
| _format_time_share(evt.cpu_time_total, sum_self_cpu_time_total) if not evt.is_async else 0, |
| evt.cpu_time_total_str, # CPU total |
| evt.cpu_time_str, # CPU time avg |
| ] |
| if has_cuda_time: |
| row_values.extend([ |
| evt.self_cuda_time_total_str, |
| # CUDA time total % |
| _format_time_share(evt.self_cuda_time_total, sum_self_cuda_time_total), |
| evt.cuda_time_total_str, |
| evt.cuda_time_str, # Cuda time avg |
| ]) |
| if profile_memory: |
| row_values.extend([ |
| # CPU Mem Total |
| _format_memory(evt.cpu_memory_usage), |
| # Self CPU Mem Total |
| _format_memory(evt.self_cpu_memory_usage), |
| ]) |
| if has_cuda_mem: |
| row_values.extend([ |
| # CUDA Mem Total |
| _format_memory(evt.cuda_memory_usage), |
| # Self CUDA Mem Total |
| _format_memory(evt.self_cuda_memory_usage), |
| ]) |
| row_values.append( |
| evt.count, # Number of calls |
| ) |
| |
| if append_node_id: |
| row_values.append(evt.node_id) |
| if has_input_shapes: |
| row_values.append(str(evt.input_shapes)[:shapes_column_width]) |
| if with_flops: |
| if evt.flops <= 0: |
| row_values.append("--") |
| else: |
| row_values.append('{0:8.3f}'.format(evt.flops * flops_scale)) |
| if has_stack: |
| src_field = "" |
| if len(evt.stack) > 0: |
| src_field = trim_path(evt.stack[0], src_column_width) |
| row_values.append(src_field) |
| append(row_format.format(*row_values)) |
| |
| if has_stack: |
| empty_headers = [""] * (len(headers) - 1) |
| for entry in evt.stack[1:MAX_STACK_ENTRY]: |
| append(row_format.format(*(empty_headers + [trim_path(entry, src_column_width)]))) |
| empty_headers.append("") |
| append(row_format.format(*empty_headers)) |
| |
| append(header_sep) |
| append("Self CPU time total: {}".format(_format_time(sum_self_cpu_time_total))) |
| if has_cuda_time: |
| append("Self CUDA time total: {}".format(_format_time(sum_self_cuda_time_total))) |
| return ''.join(result) |