from __future__ import annotations

from dataclasses import dataclass

import torchgen.api.types as api_types
from torchgen.api import cpp, structured
from torchgen.api.types import (
    ArgName,
    BaseCppType,
    BaseCType,
    Binding,
    ConstRefCType,
    CType,
    NamedCType,
    scalarT,
)
from torchgen.model import (
    Argument,
    BaseTy,
    BaseType,
    DispatchKey,
    FunctionSchema,
    NativeFunctionsGroup,
    Type,
)


def schema_kernel_name(func: FunctionSchema, dispatch_key: DispatchKey) -> str:
    assert func.is_out_fn(), "ufunc.kernel_name should only be invoked on out schemas"
    return f"ufunc_{func.name.name}_{dispatch_key}"


def kernel_name(g: NativeFunctionsGroup, dispatch_key: DispatchKey) -> str:
    return schema_kernel_name(g.out.func, dispatch_key)
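
# For example, an add out-overload dispatched to DispatchKey.CPU gets a kernel
# name along the lines of "ufunc_add_CPU" (illustrative; the exact string is
# whatever the f-string above produces for the schema's base name).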


# Tensors are omitted (as they are stored in TensorIterator), everything else is
# passed along (technically, we can pass tensors along too, it just wastes
# argument registers)
#
# NB: used for CPU only
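#
# Illustrative example (hypothetical add-like schema): a `Scalar alpha`
# argument comes out as `const Scalar& alpha` in the stub signature, while
# Tensor arguments map to None and are dropped, since they travel through the
# TensorIterator instead.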
def dispatchstub_type(t: Type, *, binds: ArgName) -> NamedCType | None:
    # Dispatch stubs take plain ints, never SymInt (hence symint=False)
    r = cpp.valuetype_type(t, binds=binds, symint=False)
    if r is not None:
        return r

    if t == BaseType(BaseTy.Scalar):
        return NamedCType(binds, ConstRefCType(BaseCType(scalarT)))
    elif t == BaseType(BaseTy.Tensor):
        return None
    else:
        raise AssertionError(f"unrecognized type {repr(t)}")


def opmath_type(scalar_t: BaseCppType) -> BaseCppType:
    if scalar_t == api_types.scalar_t:
        return api_types.opmath_t
    raise NotImplementedError


# NB: Tensors in constructor are stored in opmath_t, not scalar_t, because a
# Tensor in the constructor is a scalar tensor partially applied: it can be
# higher precision and we want to compute in that higher precision
#
# NB: CUDA only
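#
# Illustrative example: in the CUDAFunctorOnSelf_add functor shown further
# below, both the captured `other` tensor and the `alpha` scalar end up as
# opmath_t members, which is exactly the type this function assigns them.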
def ufunctor_ctor_type(t: Type, *, binds: ArgName, scalar_t: BaseCppType) -> NamedCType:
    r = cpp.valuetype_type(t, binds=binds, symint=False)
    if r is not None:
        return r

    if t == BaseType(BaseTy.Scalar):
        return NamedCType(binds, BaseCType(opmath_type(scalar_t)))
    elif t == BaseType(BaseTy.Tensor):
        return NamedCType(binds, BaseCType(opmath_type(scalar_t)))
    else:
        raise AssertionError(f"unrecognized type {repr(t)}")


# Only Tensors ever get passed directly to operator()
#
# NB: CUDA only
# (Actually, this works for CPU too)
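#
# Illustrative example: operator() in CUDAFunctorOnSelf_add below takes
# `scalar_t self`, which is what this function produces for Tensor arguments.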
def ufunctor_apply_type(
    t: Type, *, binds: ArgName, scalar_t: BaseCppType
) -> NamedCType:
    if t == BaseType(BaseTy.Tensor):
        return NamedCType(binds, BaseCType(scalar_t))
    else:
        raise AssertionError(f"unrecognized type {repr(t)}")


# The actual ufunc template function the user writes. Everything here
# is done in the computation type. compute_t is opmath_t on CUDA and scalar_t
# on CPU
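#
# Illustrative example: in the ufunc::add template further below, self, other
# and alpha are all typed as T, i.e. whatever compute_t the caller binds
# (opmath_t on CUDA, scalar_t on CPU).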
def ufunc_type(t: Type, *, binds: ArgName, compute_t: CType) -> NamedCType:
    r = cpp.valuetype_type(t, binds=binds, symint=False)
    if r is not None:
        return r

    if t == BaseType(BaseTy.Scalar):
        return NamedCType(binds, compute_t)
    elif t == BaseType(BaseTy.Tensor):
        return NamedCType(binds, compute_t)
    else:
        raise AssertionError(f"unrecognized type {repr(t)}")


def ufunctor_ctor_argument(a: Argument, scalar_t: BaseCppType) -> Binding:
    return Binding(
        nctype=ufunctor_ctor_type(a.type, binds=a.name, scalar_t=scalar_t),
        name=a.name,
        default=None,
        argument=a,
    )


def ufunctor_apply_argument(a: Argument, scalar_t: BaseCppType) -> Binding:
    return Binding(
        nctype=ufunctor_apply_type(a.type, binds=a.name, scalar_t=scalar_t),
        name=a.name,
        default=None,
        argument=a,
    )


def ufunc_argument(a: Argument, compute_t: CType) -> Binding:
    return Binding(
        nctype=ufunc_type(a.type, binds=a.name, compute_t=compute_t),
        name=a.name,
        default=None,
        argument=a,
    )


@dataclass(frozen=True)
class UfunctorBindings:
    ctor: list[Binding]
    apply: list[Binding]


# ufunctors are a CUDA-only concept representing functors that take some of
# their arguments on a host-side constructor, and the rest in the device-side
# apply. E.g.,
#
# template <typename scalar_t>
# struct CUDAFunctorOnSelf_add {
#   using opmath_t = at::opmath_type<scalar_t>;
#   opmath_t other_;
#   opmath_t alpha_;
#   CUDAFunctorOnSelf_add(opmath_t other, opmath_t alpha)
#       : other_(other), alpha_(alpha) {}
#   __device__ scalar_t operator()(scalar_t self) {
#     return ufunc::add(static_cast<opmath_t>(self), other_, alpha_);
#   }
# };
#
# The ctor refers to the constructor CUDAFunctorOnSelf_add, while apply refers
# to the operator() definition
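#
# Illustrative example (hypothetical add-like schema
# `(Tensor self, Tensor other, Scalar alpha)`): with scalar_tensor_idx=1,
# `other` is treated as the scalar tensor, giving ctor=[other, alpha] and
# apply=[self], matching CUDAFunctorOnSelf_add above; with
# scalar_tensor_idx=None, both tensors go to apply and only alpha goes to the
# ctor.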
def ufunctor_arguments(
    g: NativeFunctionsGroup, *, scalar_tensor_idx: int | None, scalar_t: BaseCppType
) -> UfunctorBindings:
    ctor: list[Binding] = []
    apply: list[Binding] = []
    for a in g.functional.func.arguments.flat_non_out:
        if a.type.is_tensor_like():
            if scalar_tensor_idx == 0:
                # this is the designated scalar tensor: capture it in the ctor
                # (as opmath_t) instead of passing it to apply
                ctor.append(ufunctor_ctor_argument(a, scalar_t=scalar_t))
                scalar_tensor_idx = None
            else:
                if scalar_tensor_idx is not None:
                    scalar_tensor_idx -= 1
                apply.append(ufunctor_apply_argument(a, scalar_t=scalar_t))
        else:
            ctor.append(ufunctor_ctor_argument(a, scalar_t=scalar_t))
    assert scalar_tensor_idx is None
    return UfunctorBindings(ctor=ctor, apply=apply)


# ufuncs are the inner loop template functions that you wrote in ufunc/add.h
# which do the actual computation in question. E.g.,
#
# template <typename T>
# C10_HOST_DEVICE T add(T self, T other, T alpha) __ubsan_ignore_undefined__ {
#   return self + alpha * other;
# }
#
# In this file, we refer to T as compute_t which is bound by the caller
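#
# Illustrative example: for an add-like schema this yields bindings roughly
# equivalent to the parameter list `T self, T other, T alpha` of ufunc::add
# above, with T = compute_t.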
def ufunc_arguments(g: NativeFunctionsGroup, *, compute_t: CType) -> list[Binding]:
    return [
        ufunc_argument(a, compute_t=compute_t)
        for a in g.functional.func.arguments.flat_non_out
    ]


# Stubs are the DispatchStub trampolines that CPU kernels use to get to their
# vectorized versions. E.g.,
#
# using structured_binary_fn_alpha = void(*)(TensorIteratorBase&, const Scalar& alpha);
# DECLARE_DISPATCH(structured_binary_fn_alpha, add_stub);
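#
# Illustrative example: for an add-like out schema this keeps only
# `const Scalar& alpha`, matching structured_binary_fn_alpha above; the Tensor
# arguments are carried by the TensorIteratorBase& parameter instead.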
def stub_arguments(g: NativeFunctionsGroup) -> list[Binding]:
    # stubs drop all tensor arguments (they are implicit in the TensorIterator
    # argument) and keep everything else
    return [
        r
        for a in g.out.func.arguments.flat_non_out
        if not a.type.is_tensor_like()
        for r in structured.argument(a)
    ]