# Module for defining "primitive" operations executable by nvFuser. This
# module exists to decouple the main set of primitives from the ones that
# provide a lowering of the op to nvFuser's Python interface. For the most
# part, torch.ops.nvprims is a subset of the primitives in torch.ops.prims,
# but additional primitives can be added in the future for the corresponding
# higher-level torch/aten functions.
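# For example, most of the ops listed in nvprim_names below reuse the schema
# and reference implementation of the identically named torch.ops.prims op
# (see register_nvprims at the bottom of this file), while ops such as
# nvprims.var_mean, nvprims.view, nvprims.native_batch_norm, nvprims.rand_like,
# and nvprims.full are defined directly in this module by their register_*
# functions.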
from typing import Any, Dict, Optional, Tuple
import torch
import torch._prims_common as utils
from torch._prims_common import (
DimsSequenceType,
elementwise_dtypes,
ELEMENTWISE_TYPE_PROMOTION_KIND,
getnvFuserDtype,
make_contiguous_strides_for,
NumberType,
ShapeType,
TensorLikeType,
)
from torch._prims_common.wrappers import (
_maybe_convert_to_dtype,
backwards_not_supported,
elementwise_type_promotion_wrapper,
)
nvprim_namespace = "nvprims"
nvprim = torch.library.Library(nvprim_namespace, "DEF")
nvprim_impl = torch.library.Library(
nvprim_namespace, "IMPL", "CompositeExplicitAutograd"
)
nvprim_implicit_impl = torch.library.Library(
nvprim_namespace, "IMPL", "CompositeImplicitAutograd"
)
nvprim_autograd_impl = torch.library.Library(nvprim_namespace, "IMPL", "Autograd")
nvprim_meta_impl = torch.library.Library(nvprim_namespace, "IMPL", "Meta")
nvprim_names = [
"abs",
"acos",
"asin",
"atan",
"atanh",
"cos",
"cosh",
"clone",
"bitwise_not",
"ceil",
"erf",
"erfc",
"exp",
"expm1",
"floor",
"imag",
"isfinite",
"lgamma",
"log",
"log1p",
"log2",
"log10",
"real",
"reciprocal",
"neg",
"round",
"rsqrt",
"sign",
"sin",
"sinh",
"sqrt",
"tan",
"tanh",
"transpose",
"trunc",
"add",
"atan2",
"bitwise_and",
"bitwise_or",
"bitwise_xor",
"div",
"eq",
"fmod",
"ge",
"gt",
"le",
"lt",
"mul",
"ne",
"pow",
"remainder",
"sub",
"squeeze",
"view_of",
"broadcast_in_dim",
"where",
"convert_element_type",
"sum",
"var",
"amax",
"amin",
]
_nvfuser_impls: Dict[str, Any] = {}
_nvfuser_unary_ops = {
"abs",
"acos",
"asin",
"atan",
"atanh",
"cos",
"cosh",
"bitwise_not",
"ceil",
"erf",
"erfc",
"exp",
"expm1",
"floor",
"imag",
"isfinite",
"lgamma",
"log",
"log1p",
"log2",
"log10",
"reciprocal",
"neg",
"real",
"round",
"rsqrt",
"sign",
"sin",
"sinh",
"sqrt",
"tan",
"tanh",
"trunc",
}
def _assert_nvfuser_op_exists(fname: str):
try:
try:
from nvfuser import ( # type: ignore[import, attr-defined]
FusionDefinition as fd,
)
except ImportError:
from nvfuser._C import FusionDefinition as fd # type: ignore[import]
assert getattr(fd.Operators, fname)
except ImportError:
# Not all PyTorch builds have nvfuser
pass
for fname in _nvfuser_unary_ops:
exec(
f"""
# Ensure that the nvfuser implementation exists
_assert_nvfuser_op_exists("{fname}")
def _{fname}_nvfuser(fd, a):
return fd.ops.{fname}(a) # type: ignore[attr-defined]
_nvfuser_impls["{fname}"] = _{fname}_nvfuser
"""
)
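# After this loop, each entry of _nvfuser_impls for a unary op is a thin
# adapter from the nvprim to the corresponding nvFuser FusionDefinition op.
# For example, _nvfuser_impls["abs"] is equivalent to:
#
#     def _abs_nvfuser(fd, a):
#         return fd.ops.abs(a)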
_nvfuser_binary_ops = {
"add",
"atan2",
"bitwise_and",
"bitwise_or",
"bitwise_xor",
"div",
"eq",
"fmod",
"ge",
"gt",
"le",
"lt",
"mul",
"ne",
"pow",
"remainder",
"sub",
}
for fname in _nvfuser_binary_ops:
exec(
f"""
# Ensure that the nvfuser implementation exists
_assert_nvfuser_op_exists("{fname}")
def _{fname}_nvfuser(fd, a, b):
return fd.ops.{fname}(a, b) # type: ignore[attr-defined]
_nvfuser_impls["{fname}"] = _{fname}_nvfuser
"""
)
_nvfuser_ternary_ops = {
"where",
}
for fname in _nvfuser_ternary_ops:
exec(
f"""
# Ensure that the nvfuser implementation exists
_assert_nvfuser_op_exists("{fname}")
def _{fname}_nvfuser(fd, a, b, c):
return fd.ops.{fname}(a, b, c) # type: ignore[attr-defined]
_nvfuser_impls["{fname}"] = _{fname}_nvfuser
"""
)
def _native_batch_norm_nvfuser(
fd, input, weight, bias, running_mean, running_var, training, momentum, eps
):
"""
if weight is None:
weight = fd.define_null_tensor()
if bias is None:
bias = fd.define_null_tensor()
if running_mean is None:
running_mean = fd.define_null_tensor()
if running_var is None:
running_var = fd.define_null_tensor()
"""
return fd.ops.batch_norm(
input,
weight,
bias,
running_mean,
running_var,
momentum,
eps,
training,
)
def _broadcast_in_dim_nvfuser(
fd: Any,
a: TensorLikeType,
shape: ShapeType,
broadcast_dimensions: ShapeType,
):
return fd.ops.broadcast_in_dim(a, shape, broadcast_dimensions) # type: ignore[attr-defined]
def _convert_element_type_nvfuser(fd: Any, a: TensorLikeType, dtype: torch.dtype):
nvfuser_dtype = getnvFuserDtype(dtype)
return fd.ops.cast(a, nvfuser_dtype) # type: ignore[attr-defined]
def _transpose_nvfuser(fd, a, dims):
return fd.ops.permute(a, dims) # type: ignore[attr-defined]
def _squeeze_nvfuser(fd, a, a_shape, dimensions):
for idx in sorted(dimensions, reverse=True):
a = fd.ops.squeeze(a, a_shape, idx)
a_shape = a_shape[:idx] + a_shape[idx + 1 :]
return a
def _view_of_nvfuser(fd, a):
return fd.ops.set(a)
def _view_nvfuser(
fd,
a,
a_shape,
new_shape,
):
try:
return fd.ops.view(a, a_shape, new_shape)
except AttributeError:
return fd.ops.reshape(a, a_shape, new_shape)
def _sum_nvfuser(
fd: Any,
a: TensorLikeType,
dims: DimsSequenceType,
):
keep_dims = False
try:
from nvfuser import DataType # type: ignore[import, attr-defined]
except ImportError:
from nvfuser._C import DataType # type: ignore[import]
output_dtype = DataType.Null
return fd.ops.sum(a, dims, keep_dims, output_dtype)
def _var_nvfuser(
fd: Any,
a: TensorLikeType,
dims: DimsSequenceType,
*,
correction: float,
):
keep_dims = False
return fd.ops.var(a, dims, correction, keep_dims)
def _var_mean_nvfuser(
fd: Any,
a: TensorLikeType,
dims: DimsSequenceType,
unbiased: Optional[bool] = None,
keepdim: bool = False,
*,
correction: float,
):
# Unbiased arg shouldn't be set when this function is called
assert unbiased is None
# Ignore the keepdim arg here, because currently it is automatically converted
# into an nvFuser symbolic scalar; keepdim is instead handled by the reference
# implementation.
keepdim = False
return fd.ops.var_mean(a, dims, correction, keepdim)
def _rand_like_nvfuser(fd: Any, a: TensorLikeType):
return fd.ops.rand_like(a)
def _amax_nvfuser(
fd: Any,
a: TensorLikeType,
dims: DimsSequenceType,
):
keep_dims = False
return fd.ops.max(a, dims, keep_dims)
def _amin_nvfuser(
fd: Any,
a: TensorLikeType,
dims: DimsSequenceType,
):
keep_dims = False
return fd.ops.min(a, dims, keep_dims)
def _clone_nvfuser(fd: Any, input: TensorLikeType, *, memory_format=None):
return fd.ops.set(input)
def _full_nvfuser(
fd: Any,
shape: ShapeType,
fill_value: NumberType,
*,
dtype: Optional[torch.dtype] = None,
layout: Optional[torch.layout] = None,
device: Optional[torch.device] = None,
pin_memory: bool = False,
requires_grad: bool = False,
):
assert device != torch.device("cpu")
assert layout is None or layout is torch.strided
assert pin_memory is False
assert requires_grad is False
dtype = dtype if dtype is not None else utils.type_to_dtype(type(fill_value))
nvfuser_dtype = getnvFuserDtype(dtype)
return fd.ops.full(shape, fill_value, nvfuser_dtype)
_nvfuser_impls["native_batch_norm"] = _native_batch_norm_nvfuser
_nvfuser_impls["broadcast_in_dim"] = _broadcast_in_dim_nvfuser
_nvfuser_impls["convert_element_type"] = _convert_element_type_nvfuser
_nvfuser_impls["clone"] = _clone_nvfuser
_nvfuser_impls["transpose"] = _transpose_nvfuser
_nvfuser_impls["squeeze"] = _squeeze_nvfuser
_nvfuser_impls["view_of"] = _view_of_nvfuser
_nvfuser_impls["view"] = _view_nvfuser
_nvfuser_impls["rand_like"] = _rand_like_nvfuser
_nvfuser_impls["sum"] = _sum_nvfuser
_nvfuser_impls["var"] = _var_nvfuser
_nvfuser_impls["var_mean"] = _var_mean_nvfuser
_nvfuser_impls["amax"] = _amax_nvfuser
_nvfuser_impls["amin"] = _amin_nvfuser
_nvfuser_impls["full"] = _full_nvfuser
def register_full():
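    """This function is used to register the full function in torch.ops.nvprims module."""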
name = "full"
nvprim.define(
"full(SymInt[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, "
+ "bool? pin_memory=None, bool? requires_grad=None) -> Tensor"
)
def _meta_impl(
size,
fill_value,
*,
out=None,
dtype=None,
layout=None,
device=None,
pin_memory=False,
requires_grad=False,
):
strides = make_contiguous_strides_for(size)
return torch._prims.TensorMeta(
None,
shape=size,
strides=strides,
dtype=dtype,
device=device,
)
def _prim_impl(
size,
fill_value,
*,
out=None,
dtype=None,
layout=None,
device=None,
pin_memory=False,
requires_grad=False,
):
return torch.full(
size,
fill_value,
out=out,
dtype=dtype,
layout=layout,
device=device,
pin_memory=pin_memory,
requires_grad=requires_grad,
)
nvprim_impl.impl(name, _prim_impl)
nvprim_meta_impl.impl(name, _meta_impl)
prim_packet = getattr(torch._ops.ops.nvprims, name)
prim = prim_packet.default
nvprim_autograd_impl.impl(name, backwards_not_supported(prim))
for p in (prim_packet, prim):
p.__doc__ = "Create a tensor with given size and filled with value"
p.impl_nvfuser = _nvfuser_impls["full"]
p.is_recomputable = _nvfuser_is_recomputable["full"]
p.return_type = torch._prims_common.RETURN_TYPE.NEW # type: ignore[attr-defined]
# functorch.compile.min_cut_rematerialization_partition accepts a list of
# operators that may be recomputed in the backward pass. The table below
# records, for each nvprim, whether recomputation is allowed; operators missing
# from the table are treated as not recomputable (see the .get(name, False)
# default in register_nvprims). A usage sketch follows the table.
_nvfuser_is_recomputable: Dict[str, bool] = {
# Reductions are not allowed to be recomputed
"amax": False,
"amin": False,
"sum": False,
"var": False,
"var_mean": False,
# Normalizations are not allowed to be recomputed
"native_batch_norm": False,
# Random ops are not allowed to be recomputed
"rand_like": False,
# Everything else is allowed to be recomputed
"abs": True,
"acos": True,
"add": True,
"asin": True,
"atan": True,
"atan2": True,
"atanh": True,
"bitwise_and": True,
"bitwise_not": True,
"bitwise_or": True,
"bitwise_xor": True,
"broadcast_in_dim": True,
"ceil": True,
"clone": True,
"convert_element_type": True,
"cos": True,
"cosh": True,
"div": True,
"eq": True,
"erf": True,
"erfc": True,
"exp": True,
"expm1": True,
"floor": True,
"fmod": True,
"full": True,
"ge": True,
"gt": True,
"imag": True,
"isfinite": True,
"le": True,
"lgamma": True,
"log": True,
"log10": True,
"log1p": True,
"log2": True,
"lt": True,
"mul": True,
"ne": True,
"neg": True,
"pow": True,
"real": True,
"reciprocal": True,
"remainder": True,
"round": True,
"rsqrt": True,
"sign": True,
"sin": True,
"sinh": True,
"sqrt": True,
"squeeze": True,
"sub": True,
"tan": True,
"tanh": True,
"transpose": True,
"trunc": True,
"view": True,
"view_of": True,
"where": True,
}
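# A minimal sketch, not used anywhere in this module, of how the table above
# could be consumed when partitioning a joint forward/backward graph. It
# assumes that min_cut_rematerialization_partition accepts a recomputable_ops
# keyword argument and that register_nvprims() has already been called; both
# may differ across PyTorch versions, so treat this as an illustration only.
def _example_recomputable_partition_fn():
    from functools import partial

    from functorch.compile import min_cut_rematerialization_partition

    # Collect the op packets whose recomputation is allowed by the table above.
    recomputable_ops = {
        getattr(torch._ops.ops.nvprims, name)
        for name, allowed in _nvfuser_is_recomputable.items()
        if allowed and hasattr(torch._ops.ops.nvprims, name)
    }
    # The returned callable can be passed as the partition_fn of an AOTAutograd
    # entry point such as functorch.compile.aot_function.
    return partial(
        min_cut_rematerialization_partition, recomputable_ops=recomputable_ops
    )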
def register_native_batch_norm():
"""This function is used to register the native_batch_norm function in torch.ops.nvprims module."""
name = "native_batch_norm"
nvprim.define(
f"{name}(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, "
+ "bool training, float momentum, float eps)"
+ " -> (Tensor, Tensor, Tensor)"
)
def _prim_impl(
input, weight, bias, running_mean, running_var, training, momentum, eps
):
return torch.native_batch_norm(
input, weight, bias, running_mean, running_var, training, momentum, eps
)
nvprim_impl.impl(name, _prim_impl)
prim_packet = torch._ops.ops.nvprims.native_batch_norm
prim = prim_packet.default
def _native_batch_norm_ref(
input: torch.Tensor,
weight: Optional[torch.Tensor],
bias: Optional[torch.Tensor],
running_mean: Optional[torch.Tensor],
running_var: Optional[torch.Tensor],
training: bool,
momentum: float,
eps: float,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
if torch._prims_common.is_complex_dtype(input.dtype):
raise NotImplementedError("Complex tensors are not supported")
# note: BN only promotes input to dtype of weight/bias, but keeps the same output dtype
result_dtype = input.dtype
computation_dtype, _ = elementwise_dtypes(
input,
weight,
bias,
type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.NO_OPMATH,
)
input_ = _maybe_convert_to_dtype(input, computation_dtype)
output, mean, rstd = prim(
input_, weight, bias, running_mean, running_var, training, momentum, eps
)
output_ = _maybe_convert_to_dtype(output, result_dtype) # type: ignore[arg-type]
return (output_, mean, rstd) # type: ignore[return-value]
def _native_batch_norm_autograd(
input: torch.Tensor,
weight: Optional[torch.Tensor],
bias: Optional[torch.Tensor],
running_mean: Optional[torch.Tensor],
running_var: Optional[torch.Tensor],
training: bool,
momentum: float,
eps: float,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
# This wrapper is needed to convert prims calls inside
# _native_batch_norm_ref to nvprims calls
from torch._prims.context import NvfuserPrimsMode
with NvfuserPrimsMode():
return backwards_not_supported(_native_batch_norm_ref)(
input, weight, bias, running_mean, running_var, training, momentum, eps
)
nvprim_autograd_impl.impl(name, _native_batch_norm_autograd)
for p in (prim_packet, prim):
p.__doc__ = "Computes batch normalization."
p.impl_nvfuser = _nvfuser_impls["native_batch_norm"]
p.is_recomputable = _nvfuser_is_recomputable["native_batch_norm"]
p.return_type = torch._prims_common.RETURN_TYPE.NEW # type: ignore[attr-defined]
def register_rand_like():
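    """This function is used to register the rand_like function in torch.ops.nvprims module."""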
name = "rand_like"
nvprim.define(
"rand_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, "
+ "Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor"
)
def _meta_rand_like(
self,
*,
dtype=None,
layout=None,
device=None,
pin_memory=None,
memory_format=None,
):
strides = make_contiguous_strides_for(self.shape)
return torch._prims.TensorMeta(
self,
shape=self.shape,
strides=strides,
dtype=dtype,
device=device,
)
def _prim_impl(
self,
*,
dtype=None,
layout=None,
device=None,
pin_memory=None,
memory_format=None,
):
return torch.rand_like(
self,
dtype=dtype,
layout=layout,
device=device,
pin_memory=pin_memory,
memory_format=memory_format,
)
nvprim_impl.impl(name, _prim_impl)
nvprim_meta_impl.impl(name, _meta_rand_like)
prim_packet = getattr(torch._ops.ops.nvprims, name)
prim = prim_packet.default
nvprim_autograd_impl.impl(name, backwards_not_supported(prim))
for p in (prim_packet, prim):
p.__doc__ = "Computes rand_like"
p.impl_nvfuser = _nvfuser_impls["rand_like"]
p.is_recomputable = _nvfuser_is_recomputable["rand_like"]
p.return_type = torch._prims_common.RETURN_TYPE.NEW # type: ignore[attr-defined]
def register_var_mean():
"""This function is used to register the var_mean function in torch.ops.nvprims module."""
name = "var_mean.main"
# This overload must be default for correct dispatching of var_mean(Tensor, bool)
nvprim.define("var_mean(Tensor inp, bool unbiased) -> (Tensor, Tensor)")
# This signature tries to combine several overloads of the torch.var_mean function into one overload.
nvprim.define(
f"{name}(Tensor inp, int[1]? dim=None, bool? unbiased=None, bool keepdim=False, *, float? correction=None)"
+ " -> (Tensor, Tensor)"
)
# This function is used for device="meta" Tensors.
def _meta_var_mean(inp, dim=None, unbiased=None, keepdim=False, *, correction=None):
if torch._prims_common.is_complex_dtype(inp.dtype):
output_dtype = torch._prims_common.corresponding_real_dtype(inp.dtype)
else:
output_dtype = inp.dtype
var = torch._prims._reduction_meta(inp, dim, output_dtype=output_dtype)
mean = torch._prims._reduction_meta(inp, dim, output_dtype=inp.dtype)
if keepdim:
output_shape = [
inp.shape[i] if i not in dim else 1 for i in range(inp.ndim)
]
broadcast_dims = [i for i in range(inp.ndim) if i not in dim]
var = torch._ops.ops.nvprims.broadcast_in_dim(
var, output_shape, broadcast_dims
)
mean = torch._ops.ops.nvprims.broadcast_in_dim(
mean, output_shape, broadcast_dims
)
return (var, mean)
# This function is used under _AutoDispatchBelowAutograd context
def _prim_impl(inp, dim=None, unbiased=None, keepdim=False, *, correction=None):
correction = torch._prims_common.set_correction(unbiased, correction)
return torch.var_mean(inp, dim, correction=correction, keepdim=keepdim)
nvprim_impl.impl(name, _prim_impl)
nvprim_meta_impl.impl(name, _meta_var_mean)
prim_packet = torch._ops.ops.nvprims.var_mean
prim = prim_packet.main
def _unbiased_overload_impl(inp, unbiased):
return prim(inp, dim=None, unbiased=unbiased)
nvprim_implicit_impl.impl("var_mean", _unbiased_overload_impl)
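    # For example, a call like torch.ops.nvprims.var_mean(t, True) resolves to
    # the default var_mean(Tensor, bool) overload and is forwarded here to the
    # "main" overload as prim(t, dim=None, unbiased=True).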
@elementwise_type_promotion_wrapper(
type_promoting_args=("a",),
type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.COMPLEX_TO_FLOAT,
)
def _var_mean_ref(a, dim=None, unbiased=None, keepdim=False, *, correction=None):
correction = torch._prims_common.set_correction(unbiased, correction)
# reduces over all dimensions if dim=() is passed
if dim == () or dim == []:
dim = None
dim = torch._prims_common.reduction_dims(a.shape, dim)
# For complex tensors eager computes the variance as the sum of variances of
# the real and imaginary parts
# TODO: Creating a complex tensor from real and imaginary parts is not supported
if torch._prims_common.is_complex_dtype(a.dtype):
raise NotImplementedError("Complex tensors are not supported")
var_mean = prim(a, dim, correction=correction)
if keepdim:
output_shape = [a.shape[i] if i not in dim else 1 for i in range(a.ndim)]
broadcast_dims = [i for i in range(a.ndim) if i not in dim]
var, mean = var_mean
var = torch._ops.ops.nvprims.broadcast_in_dim(
var, output_shape, broadcast_dims
)
mean = torch._ops.ops.nvprims.broadcast_in_dim(
mean, output_shape, broadcast_dims
)
var_mean = (var, mean)
return var_mean
def _var_mean_autograd(
a, dim=None, unbiased=None, keepdim=False, *, correction=None
):
# This wrapper is needed to convert prims calls inside
# elementwise_type_promotion_wrapper to nvprims calls
from torch._prims.context import NvfuserPrimsMode
with NvfuserPrimsMode():
return backwards_not_supported(_var_mean_ref)(
a, dim, unbiased, keepdim, correction=correction
)
nvprim_autograd_impl.impl(name, _var_mean_autograd)
for p in (prim_packet, prim):
p.__doc__ = "Computes the variance and mean of x over the list of dimensions specified in the dim argument"
p.impl_nvfuser = _nvfuser_impls["var_mean"]
p.is_recomputable = _nvfuser_is_recomputable["var_mean"]
p.return_type = torch._prims_common.RETURN_TYPE.NEW # type: ignore[attr-defined]
def _nvprims_view_impl_aten(a, original_shape, new_shape):
return a.reshape(new_shape)
def register_view():
"""This function is used to register the view function in torch.ops.view module."""
# View is implemented as a decomposition into prims.split_dim,
# prims.collapse_dim, and prims.reshape, but we would like to intercept
# non-decomposed view for now
name = "view"
nvprim.define("view(Tensor inp, SymInt[] original_shape, SymInt[] shape) -> Tensor")
nvprim.define("view.shape(Tensor inp, SymInt[] shape) -> Tensor")
# This function is used under _AutoDispatchBelowAutograd context
def _prim_impl(a, original_shape, new_shape):
return a.reshape(new_shape)
nvprim_impl.impl(name, _prim_impl)
prim_packet = torch._ops.ops.nvprims.view
prim = prim_packet.default
def _view_no_original_shape_overload_impl(a, shape):
if list(a.shape) == list(shape):
return torch.ops.nvprims.view_of(a)
return torch.ops.nvprims.view.default(a, a.shape, shape)
nvprim_implicit_impl.impl("view.shape", _view_no_original_shape_overload_impl)
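    # For example, a call to the shape-only overload, torch.ops.nvprims.view(t, shape),
    # resolves either to nvprims.view_of (when the shape is unchanged) or to
    # nvprims.view.default(t, t.shape, shape).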
nvprim_autograd_impl.impl(name, backwards_not_supported(prim))
for p in (prim_packet, prim):
p.__doc__ = "Creates a tensor with the specified shape containing a copy of the data in a."
p.impl_nvfuser = _nvfuser_impls["view"]
p.is_recomputable = _nvfuser_is_recomputable["view"]
p.return_type = torch._prims_common.RETURN_TYPE.VIEW # type: ignore[attr-defined]
p.impl_aten = _nvprims_view_impl_aten
def register_nvprims():
"""Registers all nvFuser primitives in the torch.ops.nvprims module."""
register_var_mean()
register_view()
register_native_batch_norm()
register_rand_like()
register_full()
for name in nvprim_names:
main_prim = getattr(torch._ops.ops.prims, name)
nvprim.define(main_prim.schema)
nvprim_impl.impl(name, main_prim.prim_impl)
nvprim_meta_impl.impl(name, main_prim.prim_meta_impl)
prim_packet = getattr(torch._ops.ops.nvprims, name)
prim = prim_packet.default
nvprim_autograd_impl.impl(name, backwards_not_supported(prim))
for p in (prim_packet, prim):
p.__doc__ = main_prim.__doc__
p.impl_nvfuser = _nvfuser_impls[name]
p.is_recomputable = _nvfuser_is_recomputable.get(name, False)
p.return_type = main_prim.return_type # type: ignore[attr-defined]
p.impl_aten = main_prim.impl_aten
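# A minimal usage sketch, not called anywhere in this module. It assumes that
# register_nvprims() has already been run and that a CUDA device is available
# (nvFuser only fuses operations on CUDA tensors).
def _example_nvprims_usage():
    t = torch.randn(4, device="cuda")
    # nvprims overloads dispatch like ordinary torch.ops calls...
    out = torch.ops.nvprims.sin(t)
    # ...and additionally carry the metadata attached during registration.
    sin_prim = torch.ops.nvprims.sin.default
    assert callable(sin_prim.impl_nvfuser)  # nvFuser lowering from _nvfuser_impls
    assert sin_prim.is_recomputable  # from _nvfuser_is_recomputable
    return out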