#include <torch/csrc/lazy/python/init.h>

#include <ATen/FunctionalTensorWrapper.h>
#include <c10/core/Device.h>
#include <torch/csrc/jit/python/pybind.h>
#include <torch/csrc/lazy/backend/backend_device.h>
#include <torch/csrc/lazy/backend/backend_interface.h>
#include <torch/csrc/lazy/core/config.h>
#include <torch/csrc/lazy/core/debug_util.h>
#include <torch/csrc/lazy/core/internal_ops/ltc_ops.h>
#include <torch/csrc/lazy/core/ir_dump_util.h>
#include <torch/csrc/lazy/core/lazy_graph_executor.h>
#include <torch/csrc/lazy/core/metrics.h>
#include <torch/csrc/lazy/core/trie.h>
#include <torch/csrc/lazy/python/python_util.h>
#if !(defined(FBCODE_CAFFE2) || defined(OVRSOURCE))
#include <torch/csrc/lazy/ts_backend/ts_backend_impl.h>
#include <torch/csrc/lazy/ts_backend/ts_lowering_context.h>
#endif // FBCODE_CAFFE2 || OVRSOURCE
#include <cstddef>
#include <functional>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

namespace torch {
namespace lazy {

// TODO(whc) backend 'device' related APIs are not very clear, this code could
// be simplified but it should probably be done together with
// designing/refactoring the overall approach to get/set of default eager/lazy
// device types
torch::lazy::BackendDevice GetDeviceOrCurrent(const std::string& device_str) {
  if (device_str.empty()) {
    getBackend()->GetDefaultDeviceType();
    return torch::lazy::BackendDevice();
  }
  return torch::lazy::atenDeviceToBackendDevice(c10::Device(device_str));
}

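// Returns the unique id of the lazy tensor backing `tensor`; raises if the
// input is not a lazy tensor.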
std::ptrdiff_t GetTensorId(const at::Tensor& tensor) {
  torch::lazy::LazyTensorPtr lazy_tensor = torch::lazy::TryGetLtcTensor(tensor);
  TORCH_CHECK(lazy_tensor, "Input tensor is not a lazy tensor");
  return lazy_tensor->GetUniqueId();
}

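// Renders the IR graphs rooted at the given tensors with `converter`
// (e.g. the text or dot dumper). `values` keeps the IR values alive while
// `nodes` holds the raw node pointers handed to the converter; tensors are
// unwrapped from their functional wrappers first.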
std::string GetTensorsDump(
    const std::vector<at::Tensor>& tensors,
    const std::function<std::string(c10::ArrayRef<const torch::lazy::Node*>)>&
        converter) {
  std::vector<const torch::lazy::Node*> nodes;
  std::vector<torch::lazy::Value> values;
  for (auto& tensor : tensors) {
    auto inner = at::functionalization::impl::from_functional_tensor(tensor);
    torch::lazy::LazyTensorPtr lazy_tensor =
        torch::lazy::TryGetLtcTensor(inner);
    values.push_back(lazy_tensor->GetIrValue());
    nodes.push_back(values.back().node.get());
  }
  return converter(nodes);
}

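// Collects the lazy tensors behind the given at::Tensors. With want_all=true
// every input contributes an entry (which may be null for non-lazy tensors);
// otherwise non-lazy tensors are skipped.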
std::vector<torch::lazy::LazyTensorPtr> GetLtcTensors(
    const std::vector<at::Tensor>& tensors,
    bool want_all) {
  std::vector<torch::lazy::LazyTensorPtr> lazy_tensors;
  lazy_tensors.reserve(tensors.size());
  if (want_all) {
    for (auto& tensor : tensors) {
      lazy_tensors.push_back(torch::lazy::TryGetLtcTensor(tensor));
    }
  } else {
    for (auto& tensor : tensors) {
      auto lazy_tensor = torch::lazy::TryGetLtcTensor(tensor);
      if (lazy_tensor) {
        lazy_tensors.push_back(lazy_tensor);
      }
    }
  }
  return lazy_tensors;
}

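// Dumps the backend computation that would be built for the lazy tensors
// among `tensors`.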
std::string GetTensorsBackendGraph(const std::vector<at::Tensor>& tensors) {
  std::vector<torch::lazy::LazyTensorPtr> lazy_tensors =
      GetLtcTensors(tensors, /*want_all=*/false);
  return torch::lazy::LazyGraphExecutor::Get()->DumpBackendComputation(
      lazy_tensors);
}

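// Schedules execution of the pending graph for the lazy tensors among
// `tensors`, optionally blocking until it completes (`wait`); `sync_ltc_data`
// is forwarded to SyncTensorsGraph.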
void SyncTensors(
    const std::vector<at::Tensor>& tensors,
    const std::vector<std::string>& devices,
    bool wait,
    bool sync_ltc_data) {
  std::vector<torch::lazy::LazyTensorPtr> lazy_tensors =
      GetLtcTensors(tensors, /*want_all=*/false);
  torch::lazy::LazyGraphExecutor::Get()->SyncTensorsGraph(
      &lazy_tensors, devices, wait, sync_ltc_data);
}

void initLazyBindings(PyObject* module) {
  auto m = py::handle(module).cast<py::module>();
  auto lazy = m.def_submodule("_lazy");
  auto lazy_ts_backend = m.def_submodule("_lazy_ts_backend");

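  // _mark_step flushes all live lazy tensors on `device` and begins a new
  // step. Rough Python usage (assuming the bindings are mounted on torch._C,
  // as in a standard build): torch._C._lazy._mark_step("", [], wait=True)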
  lazy.def(
      "_mark_step",
      // TODO(whc) this API should probably change from
      // std::vector<std::string> to std::vector<c10::Device>, but in a
      // separate PR
      [](const std::string& device_str,
         const std::vector<std::string>& devices,
         bool wait) {
        pybind11::gil_scoped_release no_gil;
        auto backend_device = GetDeviceOrCurrent(device_str);
        torch::lazy::LazyGraphExecutor::Get()->SyncLiveTensorsGraph(
            &backend_device, devices, wait);
        torch::lazy::LazyGraphExecutor::Get()->MarkStep(backend_device);
      },
      py::arg("device") = "",
      py::arg("devices"),
      py::arg("wait") = true);
  lazy.def(
      "_wait_device_ops",
      [](const std::vector<std::string>& devices) {
        pybind11::gil_scoped_release no_gil;
        // TODO: Add support for non-empty devices.
        if (!devices.empty()) {
          LOG(ERROR) << "Non-empty devices are not supported.";
        }
        torch::lazy::LazyGraphExecutor::Get()->WaitDeviceOps({});
      },
      py::arg("devices"));
  lazy.def("_reset_metrics", []() {
    torch::lazy::MetricsArena::Get()->ResetCounters();
    torch::lazy::MetricsArena::Get()->ResetMetrics();
  });
  lazy.def("_counter_names", []() { return torch::lazy::GetCounterNames(); });
  lazy.def(
      "_metrics_report", []() { return torch::lazy::CreateMetricReport(); });
  lazy.def("_counter_value", [](const std::string& name) -> py::object {
    torch::lazy::CounterData* data = torch::lazy::GetCounter(name);
    return data != nullptr ? py::cast<int64_t>(data->Value()) : py::none();
  });
  lazy.def("_get_tensor_id", [](const at::Tensor& tensor) {
    return GetTensorId(tensor);
  });

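  // Human-readable dumps of the lazy IR for debugging: _get_tensors_text
  // renders the traced graph as text, _get_tensors_dot as a Graphviz dot
  // graph, and _get_tensors_backend as the lowered backend computation.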
  lazy.def(
      "_get_tensors_text",
      [](const std::vector<at::Tensor>& tensors) -> std::string {
        auto converter = [](c10::ArrayRef<const torch::lazy::Node*> nodes) {
          return torch::lazy::DumpUtil::ToText(nodes);
        };
        return GetTensorsDump(tensors, converter);
      });
  lazy.def(
      "_get_tensors_dot",
      [](const std::vector<at::Tensor>& tensors) -> std::string {
        auto converter = [](c10::ArrayRef<const torch::lazy::Node*> nodes) {
          return torch::lazy::DumpUtil::ToDot(nodes);
        };
        return GetTensorsDump(tensors, converter);
      });
  lazy.def(
      "_get_tensors_backend",
      [](const std::vector<at::Tensor>& tensors) -> std::string {
        return GetTensorsBackendGraph(tensors);
      });
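  // Returns the hash of the graph rooted at the given tensors as a raw byte
  // string; _lazy_ts_backend._run_cached_graph below accepts the same bytes
  // to look up the compiled computation.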
| lazy.def("_get_graph_hash", [](const std::vector<at::Tensor>& tensors) { |
| std::vector<LazyTensorPtr> xtensors; |
| xtensors.reserve(tensors.size()); |
| for (auto& tensor : tensors) { |
| xtensors.emplace_back(TryGetLtcTensor(tensor)); |
| } |
| auto hash = LazyGraphExecutor::Get()->GetGraphHash(xtensors); |
| std::string bin((const char*)&hash, sizeof(hash)); |
| return py::bytes(bin); |
| }); |
| lazy.def( |
| "_sync_multi", |
| [](const std::vector<at::Tensor>& tensors, |
| const std::vector<std::string>& devices, |
| bool wait, |
| bool sync_ltc_data) { |
| pybind11::gil_scoped_release no_gil; |
| SyncTensors(tensors, devices, wait, sync_ltc_data); |
| }, |
| py::arg("tensors"), |
| py::arg("devices"), |
| py::arg("wait") = true, |
| py::arg("sync_ltc_data") = true); |
| |
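  // Debugging / experimentation knobs: get or set the LTC force-fallback
  // flag, clear or dump the IR trie cache, toggle IR reuse and symbolic
  // shape support, and query the default device type.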
| lazy.def("_get_force_fallback", []() { |
| return torch::lazy::getLTCForceFallback(); |
| }); |
| lazy.def("_set_force_fallback", [](std::string newval) { |
| torch::lazy::getLTCForceFallback() = newval; |
| }); |
| lazy.def("_clear_ir_cache", []() { TrieCache::Get()->Clear(); }); |
| lazy.def("_dump_ir_cache", [](std::string filename) { |
| TrieCache::Get()->DumpToDotFile(filename); |
| }); |
| lazy.def("_set_reuse_ir", [](bool val) { FLAGS_torch_lazy_reuse_ir = val; }); |
| lazy.def("_set_symbolic_shape_mode", [](bool val) { |
| FLAGS_ltc_enable_symbolic_shapes = val; |
| }); |
| lazy.def("_get_symbolic_shape_mode", []() { |
| return FLAGS_ltc_enable_symbolic_shapes; |
| }); |
| lazy.def("_get_default_device_type", []() { |
| return getBackend()->GetDefaultDeviceType()->toString(); |
| }); |
| |
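  // TorchScript reference backend. _init registers the TS backend as the
  // active lazy backend; it is compiled out of FBCODE/OVRSOURCE builds.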
| lazy_ts_backend.def("_init", []() { |
| #if !(defined(FBCODE_CAFFE2) || defined(OVRSOURCE)) |
| torch::lazy::InitTorchScriptBackend(); |
| #else |
| TORCH_CHECK(false, "TorchScript backend not yet supported in FBCODE/OVRSOURCE builds"); |
| #endif // !(defined(FBCODE_CAFFE2) || defined(OVRSOURCE)) |
| }); |

  /*
   * Return tensor ids and tensors for DeviceData nodes.
   * TODO(shunting) revisit this API for XLA
   */
  lazy_ts_backend.def(
      "_get_tensors_ts_device_data_node",
      [](const std::vector<at::Tensor>& tensors)
          -> std::pair<std::vector<int64_t>, std::vector<at::IValue>> {
#if !(defined(FBCODE_CAFFE2) || defined(OVRSOURCE))
        std::vector<const Node*> roots;
        for (auto& tensor : tensors) {
          auto xtensor = TryGetLtcTensor(tensor);
          roots.push_back(xtensor->GetIrValue().node.get());
        }
        auto post_order = Util::ComputePostOrder(roots);
        std::vector<int64_t> tensor_ids;
        std::vector<at::IValue> ivalues;

        std::unordered_set<BackendData::Handle> data_handles;
        for (auto nodeptr : post_order) {
          if (nodeptr->op() == *torch::lazy::ltc_device_data) {
            const auto backend_data =
                getBackend()->GetComputationDataFromNode(nodeptr);

            auto infoptr = backend_data->info();
            auto deviceDataInfoPtr =
                (torch::lazy::LazyGraphExecutor::DeviceDataInfo*)infoptr;
            auto* tsDataPtr = (torch::lazy::TSData*)backend_data.get();

            // dedup DeviceData by handle
            auto handle = tsDataPtr->GetHandle();
            if (!data_handles.insert(handle).second) {
              continue;
            }
            tensor_ids.push_back(deviceDataInfoPtr->tensor_id);
            /*
             * If the TSData contains a tensor, then the tensor id will
             * uniquely identify the tensor. We use that tensor id to find the
             * tensor in other places: e.g. in the python forward method
             * parameters.
             *
             * If the TSData contains a scalar, the tensor id itself is not
             * important. We reuse the scalar value in future calls.
             */
            if (tsDataPtr->HasValue()) {
              ivalues.emplace_back(tsDataPtr->data());
            } else {
              TORCH_CHECK(tsDataPtr->scalar.has_value());
              ivalues.emplace_back(tsDataPtr->scalar.value());
            }
          }
        }
        return std::make_pair(tensor_ids, ivalues);
#else
        TORCH_CHECK(
            false,
            "TorchScript backend not yet supported in FBCODE/OVRSOURCE builds");
        return std::make_pair(
            std::vector<int64_t>(), std::vector<at::IValue>());
#endif // !(defined(FBCODE_CAFFE2) || defined(OVRSOURCE))
      });
  // TODO(shunting) revisit this part for XLA
  lazy_ts_backend.def(
      "_run_cached_graph",
      [](const std::string& hash_str,
         const std::vector<at::IValue>& graph_inputs) {
        std::vector<at::Tensor> result;
#if !(defined(FBCODE_CAFFE2) || defined(OVRSOURCE))
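        // `hash_str` is the raw byte string produced by _lazy._get_graph_hash
        // above; reinterpret it back into a hash_t to look up the cached
        // computation.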
        TORCH_CHECK(hash_str.size() == sizeof(hash_t));
        hash_t hash = *(hash_t*)(hash_str.c_str());
        auto cachedComputation =
            LazyGraphExecutor::Get()->GetComputationCache()->Get(hash);
        // TODO: implement a fallback mechanism, or make sure those entries
        // never get kicked out
        TORCH_CHECK(
            cachedComputation,
            "Failed to get computation by hash. Maybe the entry got kicked "
            "out of the LRU cache");
        auto computationPtr =
            (torch::lazy::TSComputation*)cachedComputation->computation.get();

        std::vector<torch::jit::IValue> stack;
        stack.reserve(graph_inputs.size());
        for (const auto& arg : graph_inputs) {
          stack.emplace_back(arg);
        }
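        // run() consumes the inputs on the stack and leaves the graph
        // outputs in their place.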
        computationPtr->graph_executor().run(stack);
        result.reserve(stack.size());
        for (const torch::jit::IValue& elem : stack) {
          result.push_back(elem.toTensor());
        }
#else
        TORCH_CHECK(
            false,
            "TorchScript backend not yet supported in FBCODE/OVRSOURCE builds");
#endif // !(defined(FBCODE_CAFFE2) || defined(OVRSOURCE))
        return result;
      });

  // GetPythonFramesFunction() has never worked with torchdeploy/multipy,
  // possibly because GetPythonFrames resolves to the external CPython rather
  // than the embedded CPython. So far this problem has only been observed
  // internally, so we just block it off there.

#if !(defined(USE_DEPLOY))

  // When libtorch_python is loaded, we register the Python frame getter;
  // otherwise, the debug util simply omits Python frames.
  GetPythonFramesFunction() = GetPythonFrames;

#endif // USE_DEPLOY
}

} // namespace lazy
} // namespace torch