test/cpp_extensions/open_registration_extension.cpp - platform/external/pytorch - Git at Google

 #include <c10/core/impl/alloc_cpu.h>
 #include <c10/core/Allocator.h>

 #include <torch/csrc/Device.h>
 #include <torch/extension.h>

 #include <ATen/native/cpu/Loops.h>
 #include <ATen/native/DispatchStub.h>
 #include <ATen/EmptyTensor.h>


 static uint64_t add_counter = 0;
 static uint64_t last_saved_value = 0;

 // basic dummy add function
 at::Tensor custom_add_Tensor(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
   add_counter += 1;
   // Since this custom device is just for testing, not bothering to implement kernels.
   return at::empty(self.sizes(), self.options());
 }

 // A dummy allocator for our custom device, that secretly uses the CPU
 struct DummyCustomAllocator final : at::Allocator {
   DummyCustomAllocator() = default;
   at::DataPtr allocate(size_t nbytes) const override {
     void* data = c10::alloc_cpu(nbytes);
     return {data, data, &ReportAndDelete, at::Device(at::DeviceType::PrivateUse1, 0)};
   }

   static void ReportAndDelete(void* ptr) {
     if (!ptr) {
       return;
     }
     c10::free_cpu(ptr);
   }

   at::DeleterFnPtr raw_deleter() const override {
     return &ReportAndDelete;
   }
 };

 // Register our dummy allocator
 static DummyCustomAllocator global_custom_alloc;
 REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &global_custom_alloc);

 // basic dummy empty function, so we can directly construct tensors on the custom device
 // This dummy test device will just use the CPU allocator, and ignores pinned memory.
 at::Tensor custom_empty_memory_format(at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
   constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1);
   return at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, c10::dtype_or_default(dtype), memory_format);
 }
 at::Tensor custom_empty_symint(c10::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
   constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1);
   return at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, c10::dtype_or_default(dtype), memory_format);
 }

 at::Tensor & custom_fill__scalar(at::Tensor & self, const at::Scalar & value) {
   // Not bothering to implement.
   return self;
 }

 // basic dummy copy_() function, so we can copy from the custom device to/from CPU
 at::Tensor custom__copy_from(const at::Tensor& self, const at::Tensor& dst, bool non_blocking) {
   TORCH_CHECK(self.is_cpu() || self.device().type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copy from cpu -> dummy device.");
   TORCH_CHECK(dst.is_cpu() || dst.device().type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copy from cpu -> dummy device.");

   // Some dummy asserts for the basic use case: inputs are the same size / dtype, all contiguous.
   TORCH_CHECK(self.sizes() == dst.sizes());
   TORCH_CHECK(self.scalar_type() == dst.scalar_type());
   TORCH_CHECK(self.is_contiguous() && dst.is_contiguous());

   std::memcpy(dst.storage().data_ptr().get(), self.storage().data_ptr().get(), self.storage().nbytes());
   return dst;
 }


 // This macro does the heavy lifting.
 // With TORCH_LIBRARY_IMPL, you can register custom kernels for your backend.
 // For open registration, we're registering all of our kernels to the PrivateUse1 dispatch key.
 // Later in this file, we map a custom device to the PrivateUse1 device type,
 // which allows user code that puts a tensor on your custom_device to eventually get plumbed
 // into the kernels registered here.
 //
 // This macro registers your kernels to the PyTorch Dispatcher.
 // More details on the dispatcher can be found at http://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/.
 TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
   m.impl("add.Tensor", &custom_add_Tensor);
   m.impl("empty.memory_format", &custom_empty_symint);
   m.impl("fill_.Scalar", &custom_fill__scalar);
   m.impl("_copy_from", &custom__copy_from);
 }

 // This basic implementation doesn't bother dealing with different device indices
 // (e.g. custom_device:0 vs. custom_device:1).
 // We could do that by letting the user pass in a device index in our exposed device function.
 // Note that if you do that, you'll also need to register a device guard to core.
 // See `c10/core/impl/DeviceGuardImplInterface.h:C10_REGISTER_GUARD_IMPL`.
 c10::Device get_custom_device() {
   return c10::Device(c10::DeviceType::PrivateUse1, 0);
 }

 bool custom_add_called() {
   bool called = false;
   if (add_counter > last_saved_value) {
     called = true;
     last_saved_value = add_counter;
   }
   return called;
 }

 // Here, we're exposing a custom device object that corresponds to our custom backend.
 // We do this using pybind: exposing an "extension_name.custom_device()" function in python,
 // that's implemented in C++.
 // The implementation in this file maps directly to the `PrivateUse1` device type.
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
     m.def("custom_device", &get_custom_device, "get custom device object");
     m.def("custom_add_called", &custom_add_called, "check if our custom add function was called");
 }
	#include <c10/core/impl/alloc_cpu.h>
	#include <c10/core/Allocator.h>

	#include <torch/csrc/Device.h>
	#include <torch/extension.h>

	#include <ATen/native/cpu/Loops.h>
	#include <ATen/native/DispatchStub.h>
	#include <ATen/EmptyTensor.h>


	static uint64_t add_counter = 0;
	static uint64_t last_saved_value = 0;

	// basic dummy add function
	at::Tensor custom_add_Tensor(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha) {
	add_counter += 1;
	// Since this custom device is just for testing, not bothering to implement kernels.
	return at::empty(self.sizes(), self.options());
	}

	// A dummy allocator for our custom device, that secretly uses the CPU
	struct DummyCustomAllocator final : at::Allocator {
	DummyCustomAllocator() = default;
	at::DataPtr allocate(size_t nbytes) const override {
	void* data = c10::alloc_cpu(nbytes);
	return {data, data, &ReportAndDelete, at::Device(at::DeviceType::PrivateUse1, 0)};
	}

	static void ReportAndDelete(void* ptr) {
	if (!ptr) {
	return;
	}
	c10::free_cpu(ptr);
	}

	at::DeleterFnPtr raw_deleter() const override {
	return &ReportAndDelete;
	}
	};

	// Register our dummy allocator
	static DummyCustomAllocator global_custom_alloc;
	REGISTER_ALLOCATOR(c10::DeviceType::PrivateUse1, &global_custom_alloc);

	// basic dummy empty function, so we can directly construct tensors on the custom device
	// This dummy test device will just use the CPU allocator, and ignores pinned memory.
	at::Tensor custom_empty_memory_format(at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
	constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1);
	return at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, c10::dtype_or_default(dtype), memory_format);
	}
	at::Tensor custom_empty_symint(c10::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
	constexpr c10::DispatchKeySet private_use_ks(c10::DispatchKey::PrivateUse1);
	return at::detail::empty_generic(size, &global_custom_alloc, private_use_ks, c10::dtype_or_default(dtype), memory_format);
	}

	at::Tensor & custom_fill__scalar(at::Tensor & self, const at::Scalar & value) {
	// Not bothering to implement.
	return self;
	}

	// basic dummy copy_() function, so we can copy from the custom device to/from CPU
	at::Tensor custom__copy_from(const at::Tensor& self, const at::Tensor& dst, bool non_blocking) {
	TORCH_CHECK(self.is_cpu() \|\| self.device().type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copy from cpu -> dummy device.");
	TORCH_CHECK(dst.is_cpu() \|\| dst.device().type() == c10::DeviceType::PrivateUse1, "Dummy test only allows copy from cpu -> dummy device.");

	// Some dummy asserts for the basic use case: inputs are the same size / dtype, all contiguous.
	TORCH_CHECK(self.sizes() == dst.sizes());
	TORCH_CHECK(self.scalar_type() == dst.scalar_type());
	TORCH_CHECK(self.is_contiguous() && dst.is_contiguous());

	std::memcpy(dst.storage().data_ptr().get(), self.storage().data_ptr().get(), self.storage().nbytes());
	return dst;
	}


	// This macro does the heavy lifting.
	// With TORCH_LIBRARY_IMPL, you can register custom kernels for your backend.
	// For open registration, we're registering all of our kernels to the PrivateUse1 dispatch key.
	// Later in this file, we map a custom device to the PrivateUse1 device type,
	// which allows user code that puts a tensor on your custom_device to eventually get plumbed
	// into the kernels registered here.
	//
	// This macro registers your kernels to the PyTorch Dispatcher.
	// More details on the dispatcher can be found at http://blog.ezyang.com/2020/09/lets-talk-about-the-pytorch-dispatcher/.
	TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) {
	m.impl("add.Tensor", &custom_add_Tensor);
	m.impl("empty.memory_format", &custom_empty_symint);
	m.impl("fill_.Scalar", &custom_fill__scalar);
	m.impl("_copy_from", &custom__copy_from);
	}

	// This basic implementation doesn't bother dealing with different device indices
	// (e.g. custom_device:0 vs. custom_device:1).
	// We could do that by letting the user pass in a device index in our exposed device function.
	// Note that if you do that, you'll also need to register a device guard to core.
	// See `c10/core/impl/DeviceGuardImplInterface.h:C10_REGISTER_GUARD_IMPL`.
	c10::Device get_custom_device() {
	return c10::Device(c10::DeviceType::PrivateUse1, 0);
	}

	bool custom_add_called() {
	bool called = false;
	if (add_counter > last_saved_value) {
	called = true;
	last_saved_value = add_counter;
	}
	return called;
	}

	// Here, we're exposing a custom device object that corresponds to our custom backend.
	// We do this using pybind: exposing an "extension_name.custom_device()" function in python,
	// that's implemented in C++.
	// The implementation in this file maps directly to the `PrivateUse1` device type.
	PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
	m.def("custom_device", &get_custom_device, "get custom device object");
	m.def("custom_add_called", &custom_add_called, "check if our custom add function was called");
	}