# Owner(s): ["oncall: profiler"]
import collections
import gc
import io
import json
import os
import pickle
import unittest
import torch
import torch.nn as nn
import torch.optim
import torch.utils.data
import torch.utils.data.datapipes as dp
from torch.testing._internal.common_cuda import TEST_MULTIGPU
from torch.testing._internal.common_utils import (
TestCase, run_tests, TEST_WITH_ASAN, TEST_WITH_ROCM, IS_WINDOWS,
TemporaryFileName, TemporaryDirectoryName)
from torch.autograd import (_record_function_with_args_enter, _record_function_with_args_exit)
from torch.autograd.profiler import profile as _profile
from torch.autograd.profiler_legacy import profile as _profile_legacy
from torch.profiler import (
kineto_available, profile, record_function, supported_activities,
DeviceType, ProfilerAction, ProfilerActivity
)
from torch.testing._internal.common_device_type import skipCUDAVersionIn
try:
import psutil
HAS_PSUTIL = True
except ImportError:
HAS_PSUTIL = False
@unittest.skipIf(not HAS_PSUTIL, "Requires psutil to run")
@unittest.skipIf(TEST_WITH_ASAN, "Cannot test with ASAN")
@unittest.skipIf(IS_WINDOWS, "Test is flaky on Windows")
@unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
class TestProfilerCUDA(TestCase):
@skipCUDAVersionIn([(11, 5)]) # https://github.com/pytorch/pytorch/issues/69023
def test_mem_leak(self):
"""Checks that there's no memory leak when using profiler with CUDA
"""
t = torch.rand(1, 1).cuda()
p = psutil.Process()
last_rss = collections.deque(maxlen=5)
for outer_idx in range(10):
with _profile(use_cuda=True):
for _ in range(1024):
t = torch.mm(t, t)
gc.collect()
torch.cuda.empty_cache()
last_rss.append(p.memory_info().rss)
        # When CUDA events leaked, memory grew by ~7 MB between the
        # profiler invocations above.
        is_increasing = all(
            last_rss[idx] > last_rss[idx - 1] for idx in range(1, len(last_rss)))
        max_diff = -1
        for idx in range(1, len(last_rss)):
            max_diff = max(max_diff, last_rss[idx] - last_rss[idx - 1])
        self.assertFalse(is_increasing and max_diff > 100 * 1024,
                         msg=f'memory usage is increasing, {last_rss}')
def test_custom_module_input_op_ids(self):
class MyFunc(torch.autograd.Function):
@staticmethod
def forward(ctx, x):
ctx.save_for_backward(x)
return x
@staticmethod
def backward(ctx, gO):
x, = ctx.saved_tensors
return x
def custom_layer(input_ten):
return MyFunc.apply(input_ten)
        # Only tests that emit_nvtx runs when the record_shapes option is
        # enabled; the emitted NVTX ranges themselves are not inspected here.
        with torch.autograd.profiler.emit_nvtx(record_shapes=True):
x = torch.randn(10, 10, requires_grad=True)
y = torch.randn(10, 10, requires_grad=True)
z = x + y
s = custom_layer(z)
q = s.sum()
q.backward()
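# The tests below exercise torch.profiler.record_function and the internal
# _record_function_with_args_enter/_record_function_with_args_exit pair.
# A minimal sketch of the user-facing pattern, for orientation only:
#
#     with record_function("my_label"):
#         ...  # ops executed here are attributed to "my_label" in the trace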
class TestRecordFunction(TestCase):
def _record_function_with_param(self):
u = torch.randn(3, 4, 5, requires_grad=True)
with _profile(with_stack=True, use_kineto=kineto_available(), record_shapes=True) as prof:
with record_function("## TEST 1 ##", "1, 2, 3"):
rf_handle = _record_function_with_args_enter("## TEST 2 ##", 1, False, 2.5, [u, u], "hello", u)
_record_function_with_args_exit(rf_handle)
return prof
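    # In the events recorded above, non-tensor arguments (ints, bools, floats,
    # strings, lists) are expected to show up with empty input shapes, while
    # the tensor argument u contributes its real shape [3, 4, 5]; the
    # assertions in test_record_function below rely on exactly that ordering.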
def test_record_function(self):
prof_result = self._record_function_with_param()
found_test_1 = False
found_test_2 = False
        for e in prof_result.function_events:
            if "## TEST 1 ##" == e.name:
                found_test_1 = True
                self.assertEqual(e.input_shapes, [[]])
            elif "## TEST 2 ##" == e.name:
                found_test_2 = True
                self.assertEqual(e.input_shapes, [[], [], [], [], [], [3, 4, 5]])
self.assertTrue(found_test_1)
self.assertTrue(found_test_2)
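    # Iterating a DataPipe under the profiler is expected to emit events named
    # "enumerate(DataPipe)#<DataPipeClassName>", one per pipe type; the
    # datapipe tests below scan function_events for those names.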
def test_datapipe_with_record_function(self):
with _profile(with_stack=True, use_kineto=kineto_available(), record_shapes=True) as prof:
input_dp1 = dp.iter.IterableWrapper(range(4))
input_dp2 = dp.iter.IterableWrapper(range(4, 8))
input_dp3 = dp.iter.IterableWrapper(range(8, 12))
output_dp = input_dp1.mux(input_dp2, input_dp3)
output = list(output_dp)
has_iter = False
has_mux = False
for e in prof.function_events:
if has_iter and has_mux:
break
if not has_iter and e.name == "enumerate(DataPipe)#IterableWrapperIterDataPipe":
has_iter = True
if not has_mux and e.name == "enumerate(DataPipe)#MultiplexerIterDataPipe":
has_mux = True
self.assertTrue(has_iter)
self.assertTrue(has_mux)
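    # The profiler wraps DataPipe iterators; the delegation test below checks
    # that attribute access (e.g. get_value) still reaches the underlying
    # DataPipe through the wrapper, and that iteration results are unchanged.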
def test_datapipe_delegation_with_profiler(self):
class IDPIterator(torch.utils.data.IterDataPipe):
def __init__(self):
self.data = list(range(10))
self._idx = 0
def __iter__(self):
return self
def __next__(self):
if self._idx >= 10:
self._idx = 0
raise StopIteration
self._idx += 1
return self.data[self._idx - 1]
def get_value(self, idx):
return self.data[idx]
dp1 = IDPIterator()
self.assertEqual(5, dp1.get_value(5))
it_dp1 = iter(dp1)
self.assertEqual(5, it_dp1.get_value(5))
self.assertEqual(list(range(10)), list(it_dp1))
class IDPDelegator(torch.utils.data.IterDataPipe):
def __init__(self, datapipe):
self.datapipe = datapipe
def __iter__(self):
return iter(self.datapipe)
dp2 = IDPDelegator(dp1)
it_dp2 = iter(dp2)
self.assertEqual(5, it_dp2.get_value(5))
self.assertEqual(list(range(10)), list(it_dp2))
def test_datapipe_with_record_function_fork(self):
with _profile(with_stack=True, use_kineto=kineto_available(), record_shapes=True) as prof:
input_dp = dp.iter.IterableWrapper(range(10))
dp1, dp2, dp3 = input_dp.fork(num_instances=3)
output1 = list(dp1)
has_iter = False
has_child = False
for e in prof.function_events:
if has_iter and has_child:
break
if not has_iter and e.name == "enumerate(DataPipe)#IterableWrapperIterDataPipe":
has_iter = True
if not has_child and e.name == "enumerate(DataPipe)#_ChildDataPipe":
has_child = True
self.assertTrue(has_iter)
self.assertTrue(has_child)
class TestProfiler(TestCase):
def test_source(self):
"""Checks that source code attribution works for eager, TS and autograd mode
"""
# avoid automatic inlining
prev_opt = torch._C._get_graph_executor_optimize()
torch._C._set_graph_executor_optimize(False)
@torch.jit.script
def ts_method_2(x, y):
return torch.matmul(x, y)
@torch.jit.script
def ts_method_1(x, y, z):
a = x + z
w = ts_method_2(x, y) + a
return w.sum()
class DummyModule(nn.Module):
def __init__(self):
super(DummyModule, self).__init__()
self.conv = torch.nn.Conv2d(3, 2, kernel_size=1, stride=2, padding=3, bias=False)
def forward(self, x):
return self.conv(x)
mod = DummyModule()
with _profile(with_stack=True, use_kineto=kineto_available()) as p:
x = torch.randn(10, 10, requires_grad=True)
y = torch.randn(10, 10, requires_grad=True)
z = x + y
w = ts_method_1(x, y, z)
v = 2 * w
v.backward()
a = torch.randn(2, 3, 2, 2, requires_grad=True)
b = mod(a)
c = b.sum()
c.backward()
for e in p.function_events:
if "aten::add" in e.name or "AddBackward" in e.name:
                self.assertTrue(any("test_profiler" in entry for entry in e.stack))
                self.assertTrue(any(
                    "test_source" in entry or
                    "ts_method_1" in entry or
                    "ts_method_2" in entry
                    for entry in e.stack))
torch._C._set_graph_executor_optimize(prev_opt)
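    # Shared workload used by several tests below: two randn allocations, a
    # matmul and an add, optionally moved to CUDA so that both kernel and
    # memcpy activity show up in the trace.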
def payload(self, use_cuda=False):
x = torch.randn(10, 10)
if use_cuda:
x = x.cuda()
y = torch.randn(10, 10)
if use_cuda:
y = y.cuda()
z = torch.mm(x, y)
z = z + y
if use_cuda:
z = z.cpu()
@unittest.skipIf(not kineto_available(), "Kineto is required")
def test_kineto(self):
use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
with _profile(use_cuda=use_cuda, use_kineto=True):
self.payload(use_cuda=use_cuda)
# rerun to avoid initial start overhead
with _profile(use_cuda=use_cuda, use_kineto=True) as p:
self.payload(use_cuda=use_cuda)
output = p.key_averages().table(
sort_by="self_cuda_time_total" if use_cuda else "self_cpu_time_total", row_limit=-1)
# print(output)
found_gemm = False
found_memcpy = False
found_mm = False
for e in p.function_events:
if "aten::mm" in e.name:
found_mm = True
if "gemm" in e.name:
found_gemm = True
if "Memcpy" in e.name or "memcpy" in e.name:
found_memcpy = True
if use_cuda:
self.assertTrue(found_gemm)
self.assertTrue(found_memcpy)
else:
self.assertTrue(found_mm)
# p.export_chrome_trace("/tmp/test_trace.json")
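    # With two GPUs, each matmul below should produce a gemm kernel event
    # whose device_index matches the GPU it ran on, plus CPU-side events
    # mentioning "cuda" (e.g. the kernel launches).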
@unittest.skipIf(not kineto_available(), "Kineto is required")
@unittest.skipIf(not TEST_MULTIGPU, "Multiple GPUs needed")
@unittest.skipIf(TEST_WITH_ROCM, "Not supported on ROCm")
def test_kineto_multigpu(self):
with profile(
activities=[
ProfilerActivity.CPU,
ProfilerActivity.CUDA]) as prof:
for gpu_id in [0, 1]:
x = torch.randn(10, 10).cuda(gpu_id)
y = torch.randn(10, 10).cuda(gpu_id)
z = x.matmul(y)
found_gemm_0 = False
found_gemm_1 = False
found_cuda = False
for evt in prof.events():
if "gemm" in evt.name.lower() and evt.device_type == DeviceType.CUDA:
if evt.device_index == 0:
found_gemm_0 = True
elif evt.device_index == 1:
found_gemm_1 = True
if "cuda" in evt.name.lower() and evt.device_type == DeviceType.CPU:
found_cuda = True
self.assertTrue(found_gemm_0)
self.assertTrue(found_gemm_1)
self.assertTrue(found_cuda)
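    # Memory profiling convention checked below: with profile_memory=True each
    # operator event carries a signed memory delta, so allocating ops report a
    # positive cpu/cuda_memory_usage and the events covering frees report a
    # negative one.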
def test_memory_profiler(self):
def run_profiler(tensor_creation_fn):
# collecting allocs / deallocs
with _profile(profile_memory=True, record_shapes=True, use_kineto=kineto_available()) as prof:
x = None
with record_function("test_user_scope_alloc"):
x = tensor_creation_fn()
with record_function("test_user_scope_dealloc"):
del x
return prof.key_averages(group_by_input_shape=True)
        def check_metrics(stats, metric, allocs=None, deallocs=None):
            stat_metrics = {}
            for stat in stats:
                stat_metrics[stat.key] = getattr(stat, metric)
            if allocs is not None:
                for alloc_fn in allocs:
                    self.assertIn(alloc_fn, stat_metrics)
                    self.assertGreater(stat_metrics[alloc_fn], 0)
            if deallocs is not None:
                for dealloc_fn in deallocs:
                    self.assertIn(dealloc_fn, stat_metrics)
                    self.assertLess(stat_metrics[dealloc_fn], 0)
def create_cpu_tensor():
return torch.rand(10, 10)
def create_cuda_tensor():
return torch.rand(10, 10).cuda()
def create_mkldnn_tensor():
return torch.rand(10, 10, dtype=torch.float32).to_mkldnn()
stats = run_profiler(create_cpu_tensor)
check_metrics(
stats,
"cpu_memory_usage",
allocs=[
"aten::empty",
"aten::rand",
"test_user_scope_alloc",
],
deallocs=[
"test_user_scope_dealloc",
]
)
if kineto_available():
with TemporaryFileName(mode="w+") as fname:
with profile(profile_memory=True) as prof:
x = None
with record_function("test_user_scope_alloc"):
x = create_cpu_tensor()
with record_function("test_user_scope_dealloc"):
del x
prof.export_chrome_trace(fname)
with io.open(fname, 'r') as f:
trace = json.load(f)
assert "traceEvents" in trace
events = trace["traceEvents"]
found_memory_events = False
for evt in events:
assert "name" in evt
if evt["name"] == "[memory]":
found_memory_events = True
assert "args" in evt
assert "Addr" in evt["args"]
assert "Device Type" in evt["args"]
assert "Device Id" in evt["args"]
assert "Bytes" in evt["args"]
# Memory should be an instantaneous event.
assert "dur" not in evt["args"]
assert "cat" not in evt["args"]
assert found_memory_events
        if torch.cuda.is_available():
            # Warm-up call so CUDA context initialization is not attributed
            # to the profiled ops below.
            create_cuda_tensor()
stats = run_profiler(create_cuda_tensor)
check_metrics(
stats,
"cuda_memory_usage",
allocs=[
"test_user_scope_alloc",
"aten::to",
"aten::empty_strided",
],
deallocs=[
"test_user_scope_dealloc",
]
)
check_metrics(
stats,
"cpu_memory_usage",
allocs=[
"aten::rand",
"aten::empty",
]
)
        if torch._C.has_mkldnn:
            create_mkldnn_tensor()  # warm-up call outside the profiled region
stats = run_profiler(create_mkldnn_tensor)
check_metrics(
stats,
"cpu_memory_usage",
allocs=[
"test_user_scope_alloc",
"aten::rand",
"aten::empty",
"aten::to_mkldnn",
],
deallocs=[
"test_user_scope_dealloc",
]
)
# check top-level memory events
with _profile(profile_memory=True, use_kineto=kineto_available()) as prof:
x = torch.rand(10, 10)
del x
if torch.cuda.is_available():
y = torch.rand(10, 10).cuda()
del y
gc.collect()
stats = prof.key_averages(group_by_input_shape=True)
check_metrics(
stats,
"cpu_memory_usage",
allocs=[
"aten::rand",
"aten::empty"
],
deallocs=[
"[memory]"
]
)
if torch.cuda.is_available():
check_metrics(
stats,
"cuda_memory_usage",
deallocs=[
"[memory]"
]
)
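    # Module hierarchy strings have the form
    # "TOP(<RootModule>)::forward.<attr>(<Submodule>)::<method>.", where each
    # dot-separated segment is one frame of the scripted call stack; the
    # expected values below spell this out for aten::sub, aten::mul and
    # aten::add.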
@unittest.skipIf(not kineto_available(), "Kineto is required")
def test_module_hierarchy(self):
class A(nn.Module):
def __init__(self):
super(A, self).__init__()
def my_new_method(self, x):
return x * 3
def forward_impl_(self, x, y):
return self.my_new_method(x) + y
def forward(self, x, y):
y = y - 2
return self.forward_impl_(x, y)
class B(nn.Module):
def __init__(self):
super(B, self).__init__()
def forward(self, x):
return x + 2
class C(nn.Module):
def __init__(self):
super(C, self).__init__()
self.A0 = A()
self.B0 = B()
def call_b(self, x):
return self.B0.forward(x)
def forward(self, x, y):
return self.A0.forward(x, y) + self.call_b(x)
model = C()
model = torch.jit.script(model)
input_a = torch.rand(128, 128)
input_b = torch.rand(128, 128)
        op_to_module_hierarchy = {
            "aten::sub": ["TOP(C)::forward.A0(A)::forward."],
            "aten::mul": [
                "TOP(C)::forward.A0(A)::forward.SELF(A)::forward_impl_.SELF(A)::my_new_method."],
            "aten::add": [
                "TOP(C)::forward.A0(A)::forward.SELF(A)::forward_impl_.",
                "TOP(C)::forward.SELF(C)::call_b.B0(B)::forward.",
                "TOP(C)::forward.",
            ],
        }
with TemporaryFileName(mode="w+") as fname:
with profile(activities=[torch.profiler.ProfilerActivity.CPU], with_modules=True,) as prof:
model(input_a, input_b)
prof.export_chrome_trace(fname)
with io.open(fname, 'r') as f:
trace = json.load(f)
assert "traceEvents" in trace
events = trace["traceEvents"]
for evt in events:
assert "name" in evt
if "args" in evt:
op_name = evt["name"]
if "Module Hierarchy" in evt["args"]:
hierarchy = evt["args"]["Module Hierarchy"]
if op_name in op_to_module_hierarchy:
assert hierarchy in op_to_module_hierarchy[op_name]
def test_high_level_trace(self):
"""Checks that python side high level events are recorded.
"""
class RepeatedDataset(torch.utils.data.Dataset):
def __init__(self, N, D_in, D_out):
self.N = N
self.x = torch.randn(N, D_in)
self.y = torch.randn(N, D_out)
def __len__(self):
return self.N
def __getitem__(self, idx):
return self.x, self.y
class TwoLayerNet(torch.nn.Module):
def __init__(self, D_in, H, D_out):
super(TwoLayerNet, self).__init__()
self.linear1 = torch.nn.Linear(D_in, H)
self.linear2 = torch.nn.Linear(H, D_out)
def forward(self, x):
h_relu = self.linear1(x).clamp(min=0)
y_pred = self.linear2(h_relu)
return y_pred
class CustomSGD(torch.optim.SGD):
def __init__(self, *args, **kwargs):
super(CustomSGD, self).__init__(*args, **kwargs)
def train():
for _, data in enumerate(dataloader):
x, y = data[0], data[1]
y_pred = model(x)
loss = criterion(y_pred, y)
optimizer.zero_grad()
loss.backward()
optimizer.step()
N, D_in, H, D_out = 8, 10, 5, 2
model = TwoLayerNet(D_in, H, D_out)
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
ds = RepeatedDataset(N, D_in, D_out)
dataloader = torch.utils.data.DataLoader(ds, batch_size=1)
try:
train()
except Exception:
            self.fail("Expected no exception without profiling.")
        # Create multiple instances; expect each function to be hooked only once.
        # Nested wrappers (from repeated patching) would make the following test fail.
optimizer_duplicate = torch.optim.SGD(model.parameters(), lr=1e-4)
dataloader_duplicate = torch.utils.data.DataLoader(ds, batch_size=1)
        def judge(expected_event_count, prof):
            actual_event_count = {}
            for e in prof.function_events:
                if "#" in e.name:
                    key = e.name
                    if key in expected_event_count:
                        actual_event_count[key] = actual_event_count.setdefault(key, 0) + 1
            for key, count in expected_event_count.items():
                self.assertTrue((key in actual_event_count) and (count == actual_event_count[key]))
with _profile(use_kineto=kineto_available()) as prof:
train()
expected_event_count = {
# "+1" because the final iteration will enter __next__ but skip the loop body.
"enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__": (N + 1),
"Optimizer.step#SGD.step": N,
"Optimizer.zero_grad#SGD.zero_grad": N
}
judge(expected_event_count, prof)
# Test on pickle/unpickle. Expect to work in multi-processing.
optimizer = pickle.loads(pickle.dumps(optimizer))
with _profile(use_kineto=kineto_available()) as prof:
train()
judge(expected_event_count, prof)
# Test on customized optimizer.
optimizer = CustomSGD(model.parameters(), lr=1e-4)
with _profile(use_kineto=kineto_available()) as prof:
train()
expected_event_count = {
"enumerate(DataLoader)#_SingleProcessDataLoaderIter.__next__": (N + 1),
"Optimizer.step#CustomSGD.step": N,
"Optimizer.zero_grad#CustomSGD.zero_grad": N
}
judge(expected_event_count, prof)
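    # FLOPs reporting below relies on record_shapes=True: the profiler
    # estimates FLOPs for supported ops (e.g. conv and matmul-like ops) from
    # the recorded input shapes and surfaces them in the "Total MFLOPs" column
    # of the summary table.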
def test_flops(self):
model = torch.nn.Sequential(
nn.Conv2d(16, 33, 18),
nn.ReLU(),
nn.Linear(243, 243),
nn.ReLU(),
)
inputs = torch.randn(40, 16, 18, 260)
with _profile(record_shapes=True, with_flops=True, use_kineto=kineto_available()) as prof:
model(inputs)
profiler_output = prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_time_total", row_limit=10)
self.assertIn("Total MFLOPs", profiler_output)
if not (kineto_available() and torch.cuda.is_available()):
return
with profile(activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA],
record_shapes=True,
with_flops=True,
) as kineto_profiler:
model(inputs)
profiler_output = kineto_profiler.key_averages().table(
sort_by="self_cuda_time_total", row_limit=-1)
self.assertIn("Total MFLOPs", profiler_output)
def test_kineto_profiler_api(self):
called_num = [0]
use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
with profile(activities=supported_activities()):
self.payload(use_cuda=use_cuda)
def trace_handler(p):
output = p.key_averages().table(
sort_by="self_cuda_time_total" if use_cuda else "self_cpu_time_total", row_limit=-1)
# print(output)
# p.export_chrome_trace("/tmp/test_trace_" + str(called_num[0]) + ".json")
called_num[0] += 1
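        # With wait=1, warmup=1, active=2 the schedule cycles every 4 steps
        # and hands a trace to on_trace_ready at the end of each active
        # window, so the 8 steps below are expected to invoke trace_handler
        # twice (checked via called_num).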
with profile(
activities=supported_activities(),
schedule=torch.profiler.schedule(
wait=1,
warmup=1,
active=2),
on_trace_ready=trace_handler
) as p:
for idx in range(8):
self.payload(use_cuda=use_cuda)
p.step()
self.assertEqual(called_num[0], 2)
# case without schedule
with profile(
activities=supported_activities()
) as p:
self.payload(use_cuda=use_cuda)
self.payload(use_cuda=use_cuda)
output = p.key_averages().table(
sort_by="self_cuda_time_total" if use_cuda else "self_cpu_time_total", row_limit=-1)
# print(output)
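        # Expected schedule semantics, spelled out for the assertions below:
        # skip_first=2 yields two NONE steps; each cycle is then wait=1 (NONE),
        # warmup=1 (WARMUP), active=2 (RECORD, then RECORD_AND_SAVE);
        # repeat=2 runs exactly two such cycles, after which every step is NONE.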
test_schedule = torch.profiler.schedule(
skip_first=2,
wait=1,
warmup=1,
active=2,
repeat=2)
test_schedule_expected_outputs = [
ProfilerAction.NONE,
ProfilerAction.NONE,
ProfilerAction.NONE,
ProfilerAction.WARMUP,
ProfilerAction.RECORD,
ProfilerAction.RECORD_AND_SAVE,
ProfilerAction.NONE,
ProfilerAction.WARMUP,
ProfilerAction.RECORD,
ProfilerAction.RECORD_AND_SAVE,
ProfilerAction.NONE,
ProfilerAction.NONE,
ProfilerAction.NONE,
ProfilerAction.NONE,
]
        for step, expected in enumerate(test_schedule_expected_outputs):
            self.assertEqual(test_schedule(step), expected)
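    # export_stacks writes one line per recorded stack; the last
    # space-separated token on each line is a positive integer metric value,
    # which is what the parsing in test_export_stacks relies on. This
    # collapsed-stack layout is commonly fed to flame graph tooling.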
def test_export_stacks(self):
with _profile(with_stack=True, use_kineto=kineto_available()) as p:
x = torch.randn(10, 10)
y = torch.randn(10, 10)
z = torch.mm(x, y)
z = z + y
with TemporaryFileName(mode="w+") as fname:
p.export_stacks(fname)
with io.open(fname, 'r') as f:
lines = f.readlines()
assert len(lines) > 0, "Empty stacks file"
for line in lines:
is_int = False
try:
assert int(line.split(" ")[-1]) > 0, "Invalid stacks record"
is_int = True
except ValueError:
pass
assert is_int, "Invalid stacks record"
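    # tensorboard_trace_handler writes one trace file per RECORD_AND_SAVE
    # step. With wait=1, warmup=1, active=2, repeat=3, the 4-step cycle saves
    # 3 traces over the 18 steps below, and file names end in ".pt.trace.json"
    # (or ".pt.trace.json.gz" with use_gzip=True) after a numeric timestamp
    # component.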
@unittest.skipIf(not kineto_available(), "Kineto is required")
@unittest.skipIf(IS_WINDOWS, "Test is flaky on Windows")
def test_tensorboard_trace_handler(self):
use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
with _profile(use_cuda=use_cuda, use_kineto=True):
self.payload(use_cuda=use_cuda)
with TemporaryDirectoryName() as dname:
with profile(
activities=[
torch.profiler.ProfilerActivity.CPU
] + ([
torch.profiler.ProfilerActivity.CUDA
] if use_cuda else []),
schedule=torch.profiler.schedule(
wait=1,
warmup=1,
active=2,
repeat=3),
on_trace_ready=torch.profiler.tensorboard_trace_handler(dname)
) as p:
for _ in range(18):
self.payload(use_cuda=use_cuda)
p.step()
self.assertTrue(os.path.exists(dname))
file_num = 0
for file_name in os.listdir(dname):
parts = file_name.split('.')
self.assertTrue(len(parts) > 4)
self.assertTrue(parts[-4].isdigit() and int(parts[-4]) > 0, "Wrong tracing file name pattern")
self.assertEqual(parts[-3:], ['pt', 'trace', 'json'])
file_num += 1
self.assertEqual(file_num, 3)
# test case for gzip file format
with TemporaryDirectoryName() as dname:
p = profile(
activities=[
torch.profiler.ProfilerActivity.CPU
] + ([
torch.profiler.ProfilerActivity.CUDA
] if use_cuda else []),
schedule=torch.profiler.schedule(
wait=1,
warmup=1,
active=2,
repeat=3),
on_trace_ready=torch.profiler.tensorboard_trace_handler(dname, use_gzip=True)
)
p.start()
for _ in range(18):
self.payload(use_cuda=use_cuda)
p.step()
p.stop()
self.assertTrue(os.path.exists(dname))
file_num = 0
for file_name in os.listdir(dname):
parts = file_name.split('.')
self.assertTrue(len(parts) > 4)
self.assertTrue(parts[-5].isdigit() and int(parts[-5]) > 0, "Wrong tracing file name pattern")
self.assertEqual(parts[-4:], ['pt', 'trace', 'json', 'gz'])
file_num += 1
self.assertEqual(file_num, 3)
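    # add_metadata stores its value as a quoted string in the trace, while
    # add_metadata_json splices the value in as raw JSON; that is why
    # "test_key2" below round-trips to the list [1, 2, 3] rather than a string.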
@unittest.skipIf(not kineto_available(), "Kineto is required")
def test_profiler_metadata(self):
t1, t2 = torch.ones(1), torch.ones(1)
with profile() as prof:
torch.add(t1, t2)
prof.add_metadata("test_key1", "test_value1")
prof.add_metadata_json("test_key2", "[1,2,3]")
with TemporaryFileName(mode="w+") as fname:
prof.export_chrome_trace(fname)
with io.open(fname, 'r') as f:
trace = json.load(f)
assert "test_key1" in trace
assert trace["test_key1"] == "test_value1"
assert "test_key2" in trace
assert trace["test_key2"] == [1, 2, 3]
def _test_profiler_tracing(self, use_kineto):
with _profile(use_kineto=use_kineto) as prof:
t1, t2 = torch.ones(1), torch.ones(1)
torch.add(t1, t2)
with TemporaryFileName(mode="w+") as fname:
prof.export_chrome_trace(fname)
# read the trace and expect valid json
# if the JSON generated by export_chrome_trace is not valid, this will throw and fail the test.
with io.open(fname, 'r') as f:
json.load(f)
# test empty trace
with _profile(use_kineto=use_kineto) as prof:
pass
# saving an empty trace
with TemporaryFileName(mode="w+") as fname:
prof.export_chrome_trace(fname)
        # Same test, but for CUDA.
use_cuda = torch.profiler.ProfilerActivity.CUDA in supported_activities()
if not use_cuda:
return
device = torch.device("cuda:0")
with _profile(use_cuda=True, use_kineto=use_kineto) as prof:
t1, t2 = torch.ones(1, device=device), torch.ones(1, device=device)
torch.add(t1, t2)
with TemporaryFileName(mode="w+") as fname:
prof.export_chrome_trace(fname)
# Now validate the json
with io.open(fname, 'r') as f:
json.load(f)
def test_profiler_tracing(self):
self._test_profiler_tracing(False)
if kineto_available():
self._test_profiler_tracing(True)
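    # Forward/backward links are encoded as Chrome trace flow events: an "s"
    # (start) and an "f" (finish) event share an id, with cat
    # "forward_backward" and name "fwd_bwd"; the test below pairs them up by
    # id and checks that the starts land on the forward ops.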
@unittest.skip("Disable forward->backward link to workaround profiler crash")
def test_profiler_fwd_bwd_link(self):
with _profile(use_kineto=True) as prof:
t1, t2 = torch.ones(1, requires_grad=True), torch.ones(1, requires_grad=True)
z = torch.add(t1, t2)
y = torch.ones(1)
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)
loss.backward()
with TemporaryFileName(mode="w+") as fname:
prof.export_chrome_trace(fname)
with io.open(fname, 'r') as f:
j = json.load(f)
events = j["traceEvents"]
ts_to_name = {}
flow_s_to_ts = {}
flow_f_to_ts = {}
for e in events:
if e["ph"] == "X":
ts_to_name[e["ts"]] = e["name"]
if "cat" in e and "name" in e and e["cat"] == "forward_backward" and e["name"] == "fwd_bwd":
if e["ph"] == "s":
flow_s_to_ts[e["id"]] = e["ts"]
elif e["ph"] == "f":
flow_f_to_ts[e["id"]] = e["ts"]
                self.assertEqual(len(flow_s_to_ts), 2)
                self.assertEqual(len(flow_f_to_ts), 2)
                self.assertIn(1, flow_s_to_ts)
                self.assertIn(1, flow_f_to_ts)
                self.assertIn(2, flow_s_to_ts)
                self.assertIn(2, flow_f_to_ts)
                s_ts_1 = flow_s_to_ts[1]
                f_ts_1 = flow_f_to_ts[1]
                s_ts_2 = flow_s_to_ts[2]
                f_ts_2 = flow_f_to_ts[2]
                self.assertTrue(all(ts in ts_to_name for ts in [s_ts_1, f_ts_1, s_ts_2, f_ts_2]))
                self.assertEqual(ts_to_name[s_ts_1], "aten::binary_cross_entropy_with_logits")
                self.assertEqual(ts_to_name[s_ts_2], "aten::add")
def test_profiler_type(self):
profiler_type = torch._C._autograd._profiler_type
ActiveProfilerType = torch._C._autograd.ActiveProfilerType
self.assertEqual(profiler_type(), ActiveProfilerType.NONE)
# Autograd profiler
with _profile_legacy():
self.assertEqual(profiler_type(), ActiveProfilerType.LEGACY)
# Kineto profiler
with profile():
self.assertEqual(profiler_type(), ActiveProfilerType.KINETO)
if __name__ == '__main__':
run_tests()