#include <ATen/ATen.h>
#include <ATen/xpu/XPUContext.h>
#include <c10/util/CallOnce.h>
#include <c10/xpu/XPUFunctions.h>
#include <torch/csrc/Module.h>
#include <torch/csrc/THP.h>
#include <torch/csrc/utils/device_lazy_init.h>
#include <torch/csrc/utils/pycfunction_helpers.h>
#include <torch/csrc/utils/python_numbers.h>
#include <torch/csrc/utils/python_strings.h>
#include <pthread.h>
using namespace torch;
static bool in_bad_fork = false; // True for children forked after xpu init
// Called in the forked child if xpu has already been initialized
static void forked_child() {
  in_bad_fork = true;
  torch::utils::set_requires_device_init(at::kXPU, true);
}
// Should be called before the first xpu call. It is mainly called in lazy_init.
// Note: This is distinct from initExtension because a stub xpu implementation
// has some working functions (e.g. device_count) but cannot fully initialize.
static void poison_fork() {
  static c10::once_flag flag;
  c10::call_once(flag, [] { pthread_atfork(nullptr, nullptr, forked_child); });
}
// XPU management methods
static PyObject* THXPModule_isInBadFork_wrap(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
  return PyBool_FromLong(in_bad_fork);
  END_HANDLE_TH_ERRORS
}
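// Sets the current XPU device to the given device index.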
PyObject* THXPModule_setDevice_wrap(PyObject* self, PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to set_device");
  int device = THPUtils_unpackInt(arg);
  c10::xpu::set_device(static_cast<c10::DeviceIndex>(device));
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
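// Makes the given device current and returns the previous device index; a
// negative index is a no-op that returns -1.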
PyObject* THXPModule_exchangeDevice_wrap(PyObject* self, PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to exchange_device");
  int device = THPUtils_unpackInt(arg);
  if (device < 0) {
    return THPUtils_packInt32(-1);
  }
  torch::utils::device_lazy_init(at::kXPU);
  int current_device = c10::xpu::exchange_device(device);
  return THPUtils_packInt32(current_device);
  END_HANDLE_TH_ERRORS
}
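// Variant of _xpu_exchangeDevice that goes through
// c10::xpu::maybe_exchange_device, which may skip the switch when it is not
// needed; returns the previous device index, or -1 for a negative input.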
PyObject* THXPModule_maybeExchangeDevice_wrap(PyObject* self, PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      THPUtils_checkLong(arg), "invalid argument to maybe_exchange_device");
  int device = THPUtils_unpackInt(arg);
  if (device < 0) {
    return THPUtils_packInt32(-1);
  }
  torch::utils::device_lazy_init(at::kXPU);
  int current_device = c10::xpu::maybe_exchange_device(device);
  return THPUtils_packInt32(current_device);
  END_HANDLE_TH_ERRORS
}
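// Returns the index of the currently selected XPU device.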
PyObject* THXPModule_getDevice_wrap(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
  // NOLINTNEXTLINE(bugprone-signed-char-misuse)
  auto device = static_cast<int32_t>(c10::xpu::current_device());
  return THPUtils_packInt32(device);
  END_HANDLE_TH_ERRORS
}
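// Returns the number of available XPU devices. Installs the fork poison
// handler first, since this can be called before full initialization.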
PyObject* THXPModule_getDeviceCount_wrap(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
  poison_fork();
  return THPUtils_packUInt64(at::xpu::device_count());
  END_HANDLE_TH_ERRORS
}
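// Returns the current XPU stream of the given device as a
// (stream_id, device_index, device_type) tuple.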
PyObject* THXPModule_getCurrentStream_wrap(
    PyObject* self,
    PyObject* device_index) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      THPUtils_checkLong(device_index), "invalid argument to current_stream");
  int64_t device = THPUtils_unpackLong(device_index);
  auto stream = at::xpu::getCurrentXPUStream(device);
  PyObject* output_tuple = PyTuple_New(3);
  PyTuple_SetItem(
      output_tuple, 0, THPUtils_packInt64(static_cast<int64_t>(stream.id())));
  PyTuple_SetItem(
      output_tuple,
      1,
      THPUtils_packInt64(static_cast<int64_t>(stream.device_index())));
  PyTuple_SetItem(
      output_tuple,
      2,
      THPUtils_packInt64(static_cast<int64_t>(stream.device_type())));
  return output_tuple;
  END_HANDLE_TH_ERRORS
}
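// Returns the address of the sycl::queue backing the current XPU stream on
// the given device.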
PyObject* THXPModule_getCurrentStream_raw(
    PyObject* self,
    PyObject* device_index) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(
      THPUtils_checkLong(device_index),
      "invalid argument to getCurrentRawStream");
  int64_t device = THPUtils_unpackLong(device_index);
  return PyLong_FromVoidPtr(&at::xpu::getCurrentXPUStream(device).queue());
  END_HANDLE_TH_ERRORS
}
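// Sets the current XPU stream from (stream_id, device_index, device_type)
// keyword arguments, switching the current device if the stream belongs to a
// different one.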
PyObject* THXPModule_setStream_wrap(
    PyObject* self,
    PyObject* args,
    PyObject* kwargs) {
  HANDLE_TH_ERRORS
  int64_t stream_id = 0;
  int64_t device_index = 0;
  int64_t device_type = 0;
  // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays)
  constexpr const char* kwlist[] = {
      "stream_id", "device_index", "device_type", nullptr};
  if (!PyArg_ParseTupleAndKeywords(
          args,
          kwargs,
          "|LLL",
          const_cast<char**>(kwlist),
          &stream_id,
          &device_index,
          &device_type)) {
    // Propagate the parsing error set by PyArg_ParseTupleAndKeywords.
    return nullptr;
  }
  auto stream = at::xpu::XPUStream::unpack3(
      stream_id, device_index, static_cast<c10::DeviceType>(device_type));
  auto device = c10::xpu::current_device();
  if (device != stream.device_index()) {
    c10::xpu::set_device(stream.device_index());
  }
  at::xpu::setCurrentXPUStream(stream);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
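// Waits for all reserved SYCL queues on the given device to finish, releasing
// the GIL while blocking.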
PyObject* THXPModule_xpuSynchronize(PyObject* self, PyObject* arg) {
  HANDLE_TH_ERRORS
  TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to synchronize");
  int device = THPUtils_unpackInt(arg);
  {
    pybind11::gil_scoped_release no_gil;
    // Only the SYCL queues we have reserved will be synchronized, see Note
    // [Synchronize Streams on Device].
    c10::xpu::syncStreamsOnDevice(static_cast<c10::DeviceIndex>(device));
  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
// XPU module initialization
static void registerXpuDeviceProperties(PyObject* module) {
  // Add _XpuDeviceProperties class to torch._C
  using namespace c10::xpu;
  auto get_device_type = [](const DeviceProp& prop) {
    std::ostringstream stream;
    using namespace sycl::info;
    switch (prop.device_type) {
      case device_type::cpu:
        stream << "cpu";
        break;
      case device_type::gpu:
        stream << "gpu";
        break;
      case device_type::accelerator:
        stream << "accelerator";
        break;
      case device_type::host:
        stream << "host";
        break;
      default:
        stream << "unknown device type:"
               << static_cast<typename std::underlying_type<device_type>::type>(
                      prop.device_type);
        break;
    }
    return stream.str();
  };
  auto gpu_subslice_count = [](const DeviceProp& prop) {
    return (prop.gpu_eu_count / prop.gpu_eu_count_per_subslice);
  };
  auto m = py::handle(module).cast<py::module>();
  py::class_<DeviceProp>(m, "_XpuDeviceProperties")
      .def_readonly("name", &DeviceProp::name)
      .def_readonly("platform_name", &DeviceProp::platform_name)
      .def_readonly("total_memory", &DeviceProp::global_mem_size)
      .def_readonly("max_compute_units", &DeviceProp::max_compute_units)
      .def_readonly("gpu_eu_count", &DeviceProp::gpu_eu_count)
      .def_property_readonly("gpu_subslice_count", gpu_subslice_count)
      .def_readonly("max_work_group_size", &DeviceProp::max_work_group_size)
      .def_readonly("max_num_sub_groups", &DeviceProp::max_num_sub_groups)
      .def_readonly("sub_group_sizes", &DeviceProp::sub_group_sizes)
      .def_property_readonly("type", get_device_type)
      .def(
          "__repr__",
          [&get_device_type, &gpu_subslice_count](const DeviceProp& prop) {
            std::ostringstream stream;
            stream << "_XpuDeviceProperties(name='" << prop.name
                   << "', platform_name='" << prop.platform_name << "', type='"
                   << get_device_type(prop) << "', total_memory="
                   << prop.global_mem_size / (1024 * 1024)
                   << "MB, max_compute_units=" << prop.max_compute_units
                   << ", gpu_eu_count=" << prop.gpu_eu_count
                   << ", gpu_subslice_count=" << gpu_subslice_count(prop)
                   << ", max_work_group_size=" << prop.max_work_group_size
                   << ", max_num_sub_groups=" << prop.max_num_sub_groups
                   << ", sub_group_sizes=[" << prop.sub_group_sizes << "])";
            return stream.str();
          });
}
static void bindGetDeviceProperties(PyObject* module) {
  // Add method to torch.xpu
  auto m = py::handle(module).cast<py::module>();
  m.def(
      "_get_device_properties",
      [](int device) -> c10::xpu::DeviceProp* {
        return at::xpu::getDeviceProperties(device);
      },
      py::return_value_policy::reference);
}
// Callback for python part. Used for additional initialization of python
// classes
static PyObject* THXPModule_initExtension(PyObject* self, PyObject* noargs) {
  HANDLE_TH_ERRORS
  TORCH_INTERNAL_ASSERT(!in_bad_fork); // Handled at python level
  poison_fork();
  auto m = THPObjectPtr(PyImport_ImportModule("torch.xpu"));
  if (!m)
    throw python_error();
  bindGetDeviceProperties(m);
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
}
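// Method table for the XPU Python bindings.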
// NOLINTNEXTLINE(modernize-avoid-c-arrays,
// cppcoreguidelines-avoid-non-const-global-variables,
// cppcoreguidelines-avoid-c-arrays)
static struct PyMethodDef _THXPModule_methods[] = {
    {"_xpu_init", THXPModule_initExtension, METH_NOARGS, nullptr},
    {"_xpu_setDevice", THXPModule_setDevice_wrap, METH_O, nullptr},
    {"_xpu_exchangeDevice", THXPModule_exchangeDevice_wrap, METH_O, nullptr},
    {"_xpu_maybeExchangeDevice",
     THXPModule_maybeExchangeDevice_wrap,
     METH_O,
     nullptr},
    {"_xpu_getDevice", THXPModule_getDevice_wrap, METH_NOARGS, nullptr},
    {"_xpu_getDeviceCount",
     THXPModule_getDeviceCount_wrap,
     METH_NOARGS,
     nullptr},
    {"_xpu_isInBadFork", THXPModule_isInBadFork_wrap, METH_NOARGS, nullptr},
    {"_xpu_getCurrentStream",
     THXPModule_getCurrentStream_wrap,
     METH_O,
     nullptr},
    {"_xpu_getCurrentRawStream",
     THXPModule_getCurrentStream_raw,
     METH_O,
     nullptr},
    {"_xpu_setStream",
     castPyCFunctionWithKeywords(THXPModule_setStream_wrap),
     METH_VARARGS | METH_KEYWORDS,
     nullptr},
    {"_xpu_synchronize", THXPModule_xpuSynchronize, METH_O, nullptr},
    {nullptr}};
PyMethodDef* THXPModule_methods() {
  return _THXPModule_methods;
}
namespace torch::xpu {
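// Registers the XPU device property bindings; expected to be called during
// extension module initialization.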
void initModule(PyObject* module) {
  registerXpuDeviceProperties(module);
}
} // namespace torch::xpu