#include <c10/core/impl/alloc_cpu.h>
#include <c10/core/Allocator.h>

#include <torch/csrc/Device.h>
#include <torch/extension.h>

#include <ATen/native/cpu/Loops.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/EmptyTensor.h>

#include <cstring> // for std::memcpy in custom__copy_from below

static uint64_t add_counter = 0;
static uint64_t last_saved_value = 0;
// Basic dummy add function: it only counts invocations.
at::Tensor custom_add_Tensor(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
  add_counter += 1;
  // Since this custom device is just for testing, we don't bother implementing a real kernel;
  // the returned tensor has the right sizes/dtype but uninitialized contents.
  return at::empty(self.sizes(), self.options());
}
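
// Note: because the allocator below secretly hands out CPU memory, a working
// add could in principle wrap the raw storages in CPU tensors and reuse the
// real CPU kernel. A hedged sketch (contiguous tensors only; not exercised by
// anything in this file):
//
//   auto self_cpu = at::from_blob(
//       self.storage().data_ptr().get(), self.sizes(), self.options().device(at::kCPU));
//   auto other_cpu = at::from_blob(
//       other.storage().data_ptr().get(), other.sizes(), other.options().device(at::kCPU));
//   auto out_cpu = at::add(self_cpu, other_cpu, alpha);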

// A dummy allocator for our custom device, that secretly uses the CPU.
struct DummyCustomAllocator final : at::Allocator {
  DummyCustomAllocator() = default;
  at::DataPtr allocate(size_t nbytes) const override {
    void* data = c10::alloc_cpu(nbytes);
    // Tag the allocation with the PrivateUse1 device, so tensors built on top
    // of it report our custom device rather than the CPU.
    return {data, data, &ReportAndDelete, at::Device(at::DeviceType::PrivateUse1, 0)};
  }

  // The deleter attached to every allocation; it just frees the underlying CPU buffer.
  static void ReportAndDelete(void* ptr) {
    if (!ptr) {
      return;
    }
    c10::free_cpu(ptr);
  }

  at::DeleterFnPtr raw_deleter() const override {
    return &ReportAndDelete;
  }
};

// Register our dummy allocator for the PrivateUse1 device type.
static DummyCustomAllocator global_custom_alloc;
REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &global_custom_alloc);

// Basic dummy empty function, so we can directly construct tensors on the custom device.
// This dummy test device just uses the CPU allocator, and ignores pinned memory.
at::Tensor custom_empty_memory_format(at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1);
  return at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, c10::dtype_or_default(dtype), memory_format);
}
// The `empty.memory_format` schema declares `size` as SymInt[], but the dispatcher
// also accepts kernels written against plain IntArrayRef, so this overload just
// delegates rather than duplicating the body above.
at::Tensor custom_empty_symint(c10::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
  return custom_empty_memory_format(size, dtype, layout, device, pin_memory, memory_format);
}

// Dummy fill_ kernel, registered so that Tensor.fill_ doesn't error out on the custom device.
at::Tensor & custom_fill__scalar(at::Tensor & self, const at::Scalar & value) {
  // Not bothering to implement; the tensor's contents are left untouched.
  return self;
}

// Basic dummy copy_() function, so we can copy between the custom device and the CPU.
at::Tensor custom__copy_from(const at::Tensor& self, const at::Tensor& dst, bool non_blocking) {
  TORCH_CHECK(self.is_cpu() || self.device().type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copies between the cpu and the dummy device.");
  TORCH_CHECK(dst.is_cpu() || dst.device().type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copies between the cpu and the dummy device.");

  // Some dummy asserts for the basic use case: inputs are the same size / dtype, all contiguous.
  TORCH_CHECK(self.sizes() == dst.sizes());
  TORCH_CHECK(self.scalar_type() == dst.scalar_type());
  TORCH_CHECK(self.is_contiguous() && dst.is_contiguous());

  // Both storages actually live in CPU memory (see the allocator above), so a plain memcpy suffices.
  std::memcpy(dst.storage().data_ptr().get(), self.storage().data_ptr().get(), self.storage().nbytes());
  return dst;
}


// This macro does the heavy lifting.
// With TORCH_LIBRARY_IMPL, you can register custom kernels for your backend.
// For open registration, we register all of our kernels to the PrivateUse1 dispatch key.
// Later in this file, we map a custom device to the PrivateUse1 device type,
// which allows user code that puts a tensor on your custom device to eventually get plumbed
// into the kernels registered here.
//
// This macro registers your kernels with the PyTorch dispatcher.
// More details on the dispatcher can be found at http://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/.
TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
  m.impl("add.Tensor", &custom_add_Tensor);
  m.impl("empty.memory_format", &custom_empty_symint);
  m.impl("fill_.Scalar", &custom_fill__scalar);
  m.impl("_copy_from", &custom__copy_from);
}
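
// Any op without a PrivateUse1 kernel will raise a NotImplementedError at
// dispatch time. A real backend could instead register a boxed fallback that
// redispatches to CPU. A hedged sketch, left disabled here because it would
// change the behavior this dummy extension is meant to test:
//
//   #include <ATen/native/CPUFallback.h>
//
//   void custom_cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
//     at::native::cpu_fallback(op, stack);
//   }
//   TORCH_LIBRARY_IMPL(_, PrivateUse1, m) {
//     m.fallback(torch::CppFunction::makeFromBoxedFunction<&custom_cpu_fallback>());
//   }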

// This basic implementation doesn't bother dealing with different device indices
// (e.g. custom_device:0 vs. custom_device:1).
// We could do that by letting the user pass in a device index in our exposed device function.
// Note that if you do that, you'll also need to register a device guard to core.
// See `c10/core/impl/DeviceGuardImplInterface.h:C10_REGISTER_GUARD_IMPL`.
c10::Device get_custom_device() {
  return c10::Device(c10::DeviceType::PrivateUse1, 0);
}
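
// A hedged sketch of that guard registration (not needed for this single-index
// test; it assumes c10::impl::NoOpDeviceGuardImpl, a trivial guard for backends
// with no real device/stream state):
//
//   #include <c10/core/impl/DeviceGuardImplInterface.h>
//
//   C10_REGISTER_GUARD_IMPL(
//       PrivateUse1, c10::impl::NoOpDeviceGuardImpl<c10::DeviceType::PrivateUse1>);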

// Reports whether our custom add kernel has run since the last time this
// function was queried (exposed to Python below).
bool custom_add_called() {
  bool called = false;
  if (add_counter > last_saved_value) {
    called = true;
    last_saved_value = add_counter;
  }
  return called;
}

// Here, we're exposing a custom device object that corresponds to our custom backend.
// We do this using pybind11: exposing an "extension_name.custom_device()" function in Python,
// implemented in C++.
// The implementation in this file maps directly to the `PrivateUse1` device type.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("custom_device", &get_custom_device, "get custom device object");
  m.def("custom_add_called", &custom_add_called, "check if our custom add function was called");
}
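
// Example usage from Python, assuming the extension is importable under the
// (hypothetical) module name `custom_device_extension`:
//
//   import torch
//   import custom_device_extension as ext
//
//   device = ext.custom_device()            # privateuseone:0
//   x = torch.empty(4, 4, device=device)    # routed to custom_empty_symint
//   y = x + x                               # routed to custom_add_Tensor
//   assert ext.custom_add_called()
//   x_cpu = x.to(device='cpu')              # routed through custom__copy_from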