| #pragma once |
| |
| #include <ATen/TensorMeta.h> |
| #include <ATen/core/Dimname.h> |
| #include <ATen/core/Range.h> |
| #include <ATen/core/TensorBase.h> |
| #include <c10/core/DynamicCast.h> |
| #include <c10/util/FunctionRef.h> |
| #include <c10/util/MaybeOwned.h> |
| #include <c10/util/SmallVector.h> |
| #include <c10/util/TypeCast.h> |
| #include <c10/util/irange.h> |
| |
| #include <array> |
| #include <bitset> |
| |
| C10_CLANG_DIAGNOSTIC_PUSH() |
| #if C10_CLANG_HAS_WARNING("-Wshorten-64-to-32") |
| C10_CLANG_DIAGNOSTIC_IGNORE("-Wshorten-64-to-32") |
| #endif |
| #if C10_CLANG_HAS_WARNING("-Wdeprecated-copy-dtor") |
| C10_CLANG_DIAGNOSTIC_IGNORE("-Wdeprecated-copy-dtor") |
| #endif |
| |
| namespace at { |
| class Tensor; |
| class OptionalTensorRef; |
| using NameVector = SmallVector<Dimname, kDimVectorStaticSize>; |
| } // namespace at |
| |
| // TensorIterator is a helper class for element-wise operations, such as |
| // arithmetic, comparisons, and trigonometric functions. It handles |
| // broadcasting and type conversions of operands. |
| // |
| // This is inspired by NumPy's Array Iterator API (NpyIter). |
| // |
| // The files Loops.h and Loops.cuh provide functions to build kernels that |
| // use TensorIterator. |
| // |
| // Example: |
| // |
| //   auto iter = TensorIteratorConfig() |
| //     .add_output(output) |
| //     .add_input(input1) |
| //     .add_input(input2) |
| //     .build(); |
| // |
| // [MyKernel.cpp / MyKernel.cu] |
| // cpu_kernel(iter, [](float a, float b) { |
| // return a + b; |
| // }); |
| // |
| // gpu_kernel(iter, []GPU_LAMBDA(float a, float b) -> float { |
| // return a + b; |
| // }); |
| // |
| // Note [Order of Construction] |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| // When setting up the tensor iterator configuration, the output Tensors |
| // have to be added first via |
| // TensorIteratorConfig::add_owned_output(at::Tensor). After adding all outputs, |
| // the inputs can be added via |
| // TensorIteratorConfig::add_owned_input(at::Tensor). |
| // Adding another output after inputs have been added will raise an exception. |
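| // |
| // For example (a sketch; `out`, `a`, and `b` are hypothetical tensors): |
| // |
| //   auto iter = TensorIteratorConfig() |
| //       .add_owned_output(out)   // outputs first |
| //       .add_owned_input(a)      // then inputs |
| //       .add_owned_input(b) |
| //       .build(); |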
| // |
| // Note [Common Dtype Computation] |
| // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
| // Some operations have a natural notion of a "common dtype" or |
| // "computation dtype" where all inputs are cast to one dtype, the |
| // operation is performed, and then the results are cast to all outputs. |
| // |
| // TensorIterator infers a common dtype if all inputs have the same dtype, |
| // and it computes one using type promotion rules on its inputs if |
| // promote_inputs_to_common_dtype_ is true. Attempting to query |
| // a common dtype otherwise will throw an exception. |
| // |
| // Note that the outputs are not considered when computing a common dtype. |
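| // |
| // For example (a sketch; `out`, `a`, and `b` are hypothetical tensors, with |
| // `a` of dtype kInt and `b` of dtype kFloat): |
| // |
| //   auto iter = TensorIteratorConfig() |
| //       .add_owned_output(out) |
| //       .add_owned_input(a) |
| //       .add_owned_input(b) |
| //       .promote_inputs_to_common_dtype(true) |
| //       .build(); |
| //   // iter.common_dtype() == kFloat, per the usual type promotion rules. |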
| |
| namespace at { |
| |
| namespace internal { |
| // This parameter is heuristically chosen to determine the minimum amount of |
| // work that warrants parallelism. For example, when summing an array, it is |
| // deemed inefficient to parallelize over arrays shorter than 32768. Further, |
| // no parallel algorithm (such as parallel_reduce) should split work into |
| // smaller than GRAIN_SIZE chunks. |
| constexpr int64_t GRAIN_SIZE = 32768; |
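| // |
| // For example, a caller might pass GRAIN_SIZE as the grain-size argument to |
| // at::parallel_for (a sketch; `n` and `buf` are hypothetical): |
| // |
| //   at::parallel_for(0, n, at::internal::GRAIN_SIZE, |
| //       [&](int64_t begin, int64_t end) { |
| //         for (const auto i : c10::irange(begin, end)) { |
| //           buf[i] += 1; |
| //         } |
| //       }); |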
| |
| // Storage for a non-owning Tensor, without needing to include Tensor.h |
| class TORCH_API OpaqueOptionalTensorRef { |
| alignas(alignof(TensorBase)) std::array<char, sizeof(TensorBase)> data_; |
| |
| public: |
| OpaqueOptionalTensorRef(); |
| ~OpaqueOptionalTensorRef(); |
| |
| OptionalTensorRef* get() { |
| return reinterpret_cast<OptionalTensorRef*>(data_.data()); |
| } |
| const OptionalTensorRef* get() const { |
| return reinterpret_cast<const OptionalTensorRef*>(data_.data()); |
| } |
| |
| OptionalTensorRef& operator*() { |
| return *get(); |
| } |
| const OptionalTensorRef& operator*() const { |
| return *get(); |
| } |
| OptionalTensorRef* operator->() { |
| return get(); |
| } |
| const OptionalTensorRef* operator->() const { |
| return get(); |
| } |
| |
| const Tensor& getTensor() const; |
| }; |
| } // namespace internal |
| |
| struct TORCH_API OperandInfo { |
| using StrideVector = SmallVector<int64_t, 6>; |
| OperandInfo() = default; |
| C10_ALWAYS_INLINE explicit OperandInfo(c10::MaybeOwned<TensorBase>&& t) { |
| if (t->defined()) { |
| device = t->device(); |
| target_dtype = t->scalar_type(); |
| current_dtype = target_dtype; |
| } |
| tensor(std::move(t)); |
| validate(); |
| } |
| |
| C10_ALWAYS_INLINE ~OperandInfo() = default; |
| |
| /// Stride after broadcasting. The stride is in bytes, not number of elements. |
| StrideVector stride_bytes; |
| |
| /// The desired device and type for the operand. For inputs, this specifies |
| /// that the input should be converted to this type if necessary. For outputs, |
| /// this specifies which type to allocate. target_dtype and device are |
| /// initialized with the dtype and device of the tensor, but during type |
| /// promotion target_dtype may become different from the tensor's dtype. |
| /// Also, during type promotion, target_dtype and device can be set for an |
| /// undefined tensor so that the tensor can be properly constructed later. |
| c10::optional<Device> device = c10::nullopt; |
| ScalarType target_dtype = ScalarType::Undefined; |
| // Caches the dtype of the tensor, because scalar_type() is an expensive |
| // operation. If the dtype of the tensor is changed (e.g. as a result of type |
| // promotion or in allocate_outputs), this value should be changed too. |
| ScalarType current_dtype = ScalarType::Undefined; |
| |
| bool is_device_defined() const { |
| return device.has_value(); |
| } |
| bool is_type_defined() const { |
| return target_dtype != ScalarType::Undefined; |
| } |
| TensorOptions options() const { |
| return TensorOptions(target_dtype).device(device); |
| } |
| |
| /// The data pointer. This may be different from tensor->data_ptr() if the |
| /// iterator is split. |
| void* data = nullptr; |
| |
| bool is_output = false; |
| |
| bool will_resize = false; |
| |
| bool is_read_write = false; |
| |
| void validate() { |
| TORCH_CHECK( |
| !tensor_base_->defined() || tensor_base_->layout() == kStrided, |
| "unsupported tensor layout: ", |
| tensor_base_->layout()); |
| } |
| |
| /// The tensor operand. Note that the strides, data pointer, and |
| /// other attributes may differ due to dimension reordering and |
| /// coalescing. |
| const Tensor& tensor() const { |
| return tensor_storage_.getTensor(); |
| } |
| const TensorBase& tensor_base() const { |
| return *tensor_base_; |
| } |
| void tensor(c10::MaybeOwned<TensorBase>&& tensor); |
| |
| // Save the original tensor operand in cases when an output is modified |
| // (e.g. if dtype is changed) |
| const Tensor& original_tensor() const { |
| return original_tensor_storage_.getTensor(); |
| } |
| const TensorBase& original_tensor_base() const { |
| return *original_tensor_base_; |
| } |
| |
| // Set tensor to a new value, and store the old tensor value in |
| // original_tensor. Should only ever be called once for the lifetime of an |
| // operand. |
| void exchange_tensor(c10::MaybeOwned<TensorBase>&& new_tensor); |
| |
| // Move original_tensor back into tensor; exchange_tensor must have been |
| // called before. |
| void restore_original_tensor(); |
| |
| private: |
| c10::MaybeOwned<TensorBase> tensor_base_; |
| c10::MaybeOwned<TensorBase> original_tensor_base_ = |
| c10::MaybeOwned<TensorBase>::owned(c10::in_place); |
| |
| // We store TensorBase visibly in the header to allow inline access. |
| // However, we sometimes need a genuine `const Tensor &` for the |
| // TensorIterator API. So, we also store a non-owning `Tensor` |
| // object in these `_storage_` variables. |
| internal::OpaqueOptionalTensorRef tensor_storage_; |
| internal::OpaqueOptionalTensorRef original_tensor_storage_; |
| }; |
| |
| struct SplitUntil32Bit; |
| |
| enum class FastSetupType : uint8_t { |
| NONE, |
| CONTIGUOUS, |
| CHANNELS_LAST, |
| NON_OVERLAPPING_DENSE |
| }; |
| |
| class TensorIteratorConfig; |
| struct TensorIterator; |
| |
| struct TORCH_API TensorIteratorBase : public impl::MetaBase { |
| using DimMask = std::bitset<64>; |
| using PtrVector = SmallVector<char*, 4>; |
| using StrideVector = SmallVector<int64_t, 6>; |
| |
| TensorIteratorBase(); |
| void build(TensorIteratorConfig&); |
| |
| // The inner-loop function operates on the fastest moving dimension. It |
| // implements element-wise operations in terms of 1-d strided tensors. |
| // |
| // Arguments (1-d form): |
| //  data: data pointers for each operand (length `ntensors`) |
| //  strides: byte strides for each operand (length `ntensors`) |
| //  size: size of inner loop |
| // |
| // The `size` often matches shape[0], but may be smaller due to |
| // parallelization of the inner loop. |
| // |
| // loop2d_t is the 2-d generalization: `strides` holds the inner strides |
| // followed by the outer strides (length `2 * ntensors`), and the loop |
| // processes `size1` rows of `size0` elements each. A 1-d loop can be |
| // adapted with loop_2d_from_1d (see for_each below). |
| using loop2d_t = c10::function_ref< |
| void(char** data, const int64_t* strides, int64_t size0, int64_t size1)>; |
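| // |
| // For example, a 1-d loop adding 1.0f to a float input might look like this |
| // (a sketch; `iter` is a hypothetical, already-built TensorIterator whose |
| // operands are all kFloat): |
| // |
| //   iter.for_each([](char** data, const int64_t* strides, int64_t size) { |
| //     for (const auto i : c10::irange(size)) { |
| //       auto* out = reinterpret_cast<float*>(data[0] + i * strides[0]); |
| //       auto* in = reinterpret_cast<float*>(data[1] + i * strides[1]); |
| //       *out = *in + 1.0f; |
| //     } |
| //   }); |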
| |
| using loop_subiter_t = c10::function_ref<void(TensorIteratorBase& subiter)>; |
| |
| void foreach_reduced_elt(loop_subiter_t loop, bool parallelize = true); |
| |
| int ndim() const { |
| return shape_.size(); |
| } |
| IntArrayRef shape() const { |
| return shape_; |
| } |
| int64_t numel() const; |
| int ntensors() const { |
| return operands_.size(); |
| } |
| int noutputs() const { |
| return num_outputs_; |
| } |
| int ninputs() const { |
| return ntensors() - noutputs(); |
| } |
| IntArrayRef view_offsets() const { |
| return view_offsets_; |
| } |
| |
| /// Number of elements in the output operand. This is the same as numel() for |
| /// operations that are not reductions. |
| int64_t num_output_elements() const; |
| |
| /// number of reduced dimensions in a reduction operation |
| int num_reduce_dims() const; |
| |
| /// 1-dimensional iteration and no buffering or type conversion |
| bool is_trivial_1d() const; |
| /// Reducible to 1-dimensional and all operands are contiguous |
| bool is_contiguous() const; |
| bool is_dim_reduced(int dim) const; |
| |
| /// Accessors for each operand |
| IntArrayRef strides(int arg) const { |
| return operands_[arg].stride_bytes; |
| } |
| void* data_ptr(int arg) const; |
| ScalarType dtype(int arg = 0) const { |
| return operands_[arg].current_dtype; |
| } |
| ScalarType common_dtype() const { |
| TORCH_INTERNAL_ASSERT( |
| common_dtype_ != ScalarType::Undefined, |
| "Queried for invalid common dtype!"); |
| return common_dtype_; |
| } |
| ScalarType input_dtype(int arg = 0) const { |
| return operands_[num_outputs_ + arg].current_dtype; |
| } |
| Device device(int arg = 0) const { |
| return operands_[arg].device.value(); |
| } |
| DeviceType device_type(int arg = 0) const { |
| return device(arg).type(); |
| } |
| int64_t element_size(int arg) const { |
| return elementSize(dtype(arg)); |
| } |
| bool is_scalar(int arg) const; |
| bool is_cpu_scalar(int arg) const; |
| |
| const TensorBase& tensor_base(int arg) const { |
| return operands_[arg].tensor_base(); |
| } |
| const Tensor& tensor(int arg) const { |
| return operands_[arg].tensor(); |
| } |
| |
| const TensorBase& output_base(int arg = 0) const { |
| AT_ASSERT(arg < num_outputs_); |
| return tensor_base(arg); |
| } |
| |
| const Tensor& output(int arg = 0) const { |
| AT_ASSERT(arg < num_outputs_); |
| return tensor(arg); |
| } |
| |
| const TensorBase& input_base(int arg = 0) const { |
| AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); |
| return tensor_base(num_outputs_ + arg); |
| } |
| const Tensor& input(int arg = 0) const { |
| AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); |
| return tensor(num_outputs_ + arg); |
| } |
| |
| // Copies from temporary outputs back to the original outputs |
| // NOTE: only used on CPU |
| void cast_outputs(); |
| |
| /// Removes an operand from this iterator |
| void remove_operand(int arg); |
| /// Shrinks an iterated dimension |
| void narrow(int dim, int64_t start, int64_t size); |
| /// Narrows every dim after and including `start_dim` to size one. |
| void select_all_keeping_dim(int start_dim, IntArrayRef starts); |
| /// Replaces the data pointer for the operand at index `arg`. |
| /// The new pointer should have the same sizes, strides and dtype as the |
| /// original |
| void unsafe_replace_operand(int arg, void* data); |
| |
| /// Splits this TensorIterator into two iterators. Together they iterate over |
| /// the entire operation. Used by `with_32bit_indexing()`. |
| std::unique_ptr<TensorIterator> split(int dim); |
| |
| /// Returns the dimension with the largest extent: (size[dim]-1) * stride[dim] |
| int get_dim_to_split() const; |
| |
| template <typename T> |
| T scalar_value(int arg) { |
| auto& op = operands_[arg]; |
| return c10::fetch_and_cast<T>(op.tensor_base().scalar_type(), op.data); |
| } |
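| // For example, a CUDA binary kernel might lift a CPU scalar operand out of |
| // the iterator before launching (a sketch; `iter` is a hypothetical built |
| // TensorIterator and operand 2 is known to be a CPU scalar): |
| // |
| //   if (iter.is_cpu_scalar(2)) { |
| //     float b = iter.scalar_value<float>(2); |
| //     iter.remove_operand(2); |
| //     // launch a kernel specialized on the scalar `b` ... |
| //   } |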
| |
| private: |
| template <typename loop1d_t> |
| auto loop_2d_from_1d(const loop1d_t& loop) { |
| return |
| [loop, ntensor = ntensors()]( |
| char** base, const int64_t* strides, int64_t size0, int64_t size1) { |
| PtrVector data(base, base + ntensor); |
| const int64_t* outer_strides = &strides[ntensor]; |
| for (const auto i : c10::irange(size1)) { |
| if (i > 0) { |
| for (const auto arg : c10::irange(ntensor)) { |
| data[arg] += outer_strides[arg]; |
| } |
| } |
| loop(data.data(), strides, size0); |
| } |
| }; |
| } |
| |
| public: |
| template < |
| typename loop1d_t, |
| std::enable_if_t< |
| std::is_convertible< |
| loop1d_t, |
| c10::function_ref< |
| void(char**, const int64_t* strides, int64_t size)>>::value, |
| int> = 0> |
| void for_each(loop1d_t loop, int64_t grain_size = at::internal::GRAIN_SIZE) { |
| for_each(loop_2d_from_1d(loop), grain_size); |
| } |
| |
| void for_each(loop2d_t loop, int64_t grain_size = at::internal::GRAIN_SIZE); |
| |
| void parallel_reduce(loop2d_t loop); |
| |
| template < |
| typename loop1d_t, |
| std::enable_if_t< |
| std::is_convertible< |
| loop1d_t, |
| c10::function_ref< |
| void(char**, const int64_t* strides, int64_t size)>>::value, |
| int> = 0> |
| void serial_for_each(loop1d_t loop, Range range) { |
| serial_for_each(loop_2d_from_1d(loop), range); |
| } |
| |
| void serial_for_each(loop2d_t loop, Range range) const; |
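| // For example, for_each conceptually amounts to the following parallel |
| // dispatch over serial_for_each (a simplified sketch, ignoring the |
| // grain-size and thread-count checks in the actual implementation): |
| // |
| //   at::parallel_for(0, numel(), grain_size, [&](int64_t begin, int64_t end) { |
| //     serial_for_each(loop, {begin, end}); |
| //   }); |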
| |
| /// Create a strides array for a Tensor with the shape of this iterator. The |
| /// parameter `element_size` specifies the size of the Tensor's data type in |
| /// bytes (e.g. `4` for `float`). |
| StrideVector compatible_stride(int element_size) const; |
| |
| /// Inverts the re-ordering done by reorder_dimensions. This can only be |
| /// called *before* coalesce_dimensions() is called. |
| DimVector invert_perm(IntArrayRef input) const; |
| |
| /// Reapply the same re-ordering as done by reorder_dimensions. This can |
| /// only be called *before* coalesce_dimensions() is called. |
| DimVector apply_perm_and_mul(IntArrayRef input, int mul) const; |
| |
| /// Helper functions for CPU iteration |
| StrideVector get_dim_strides(int dim) const; |
| StrideVector get_strides() const; |
| StrideVector get_inner_strides() const { |
| return get_dim_strides(0); |
| } |
| PtrVector get_base_ptrs() const; |
| |
| // Helper functions for advanced stride manipulations (e.g. torch.flip) |
| void _unsafe_set_arg_strides(const int arg, IntArrayRef strides) { |
| operands_[arg].stride_bytes = std::move(strides); |
| } |
| void _unsafe_set_arg_data(const int arg, void* data) { |
| operands_[arg].data = data; |
| } |
| |
| /// true if the stride computation can use 32-bit arithmetic. Used by GPU |
| /// kernels |
| bool can_use_32bit_indexing() const; |
| |
| /// An iterable object that recursively splits this iterator into |
| /// sub-iterators that can use 32-bit indexing. |
| SplitUntil32Bit with_32bit_indexing() const; |
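| /// |
| /// A typical CUDA kernel launcher uses it like this (a sketch; `gpu_kernel` |
| /// and `f` stand in for the caller's launch function and functor): |
| /// |
| ///   if (!iter.can_use_32bit_indexing()) { |
| ///     for (auto& sub_iter : iter.with_32bit_indexing()) { |
| ///       gpu_kernel(sub_iter, f);  // each split is 32-bit indexable |
| ///     } |
| ///     return; |
| ///   } |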
| |
| /// Whether the kernel should accumulate into the output. Only relevant for |
| /// CUDA reductions. |
| bool should_accumulate() const { |
| return accumulate_; |
| } |
| |
| /// Whether this iterator produces the actual output, |
| /// as opposed to something that will be accumulated further. Only relevant |
| /// for CUDA reductions. |
| bool is_final_output() const { |
| return final_output_; |
| } |
| |
| bool has_contiguous_first_dim() const { |
| if (ndim() == 0) { |
| return true; |
| } |
| |
| int num_tensors = ntensors(); |
| for (const auto i : c10::irange(num_tensors)) { |
| if (strides(i)[0] != element_size(i)) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| void set_output_raw_strided( |
| int64_t output_idx, |
| IntArrayRef sizes, |
| IntArrayRef strides, |
| TensorOptions options, |
| DimnameList names) override; |
| |
| #define TORCH_DISALLOW_TEMPORARIES_IMPL(methodname, maybestatic) \ |
| maybestatic void methodname( \ |
| TensorBase&& out, const TensorBase& a, const TensorBase& b) = delete; \ |
| maybestatic void methodname( \ |
| const TensorBase& out, TensorBase&& a, const TensorBase& b) = delete; \ |
| maybestatic void methodname( \ |
| const TensorBase& out, const TensorBase& a, TensorBase&& b) = delete; \ |
| maybestatic void methodname( \ |
| TensorBase&& out, TensorBase&& a, const TensorBase& b) = delete; \ |
| maybestatic void methodname( \ |
| TensorBase&& out, const TensorBase& a, TensorBase&& b) = delete; \ |
| maybestatic void methodname( \ |
| const TensorBase& out, TensorBase&& a, TensorBase&& b) = delete; \ |
| maybestatic void methodname( \ |
| TensorBase&& out, TensorBase&& a, TensorBase&& b) = delete; |
| |
| #define TORCH_DISALLOW_TEMPORARIES(methodname) \ |
| TORCH_DISALLOW_TEMPORARIES_IMPL(methodname, ) |
| |
| void build_binary_float_op( |
| const TensorBase& out, |
| const TensorBase& a, |
| const TensorBase& b); |
| void build_borrowing_binary_float_op( |
| const TensorBase& out, |
| const TensorBase& a, |
| const TensorBase& b); |
| TORCH_DISALLOW_TEMPORARIES(build_borrowing_binary_float_op) |
| void build_binary_op( |
| const TensorBase& out, |
| const TensorBase& a, |
| const TensorBase& b); |
| void build_borrowing_binary_op( |
| const TensorBase& out, |
| const TensorBase& a, |
| const TensorBase& b); |
| TORCH_DISALLOW_TEMPORARIES(build_borrowing_binary_op) |
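| // For example, a structured kernel's meta function typically invokes one of |
| // these builders (a sketch modeled on binary structured ops; the exact meta |
| // function body varies per operator): |
| // |
| //   TORCH_META_FUNC2(add, Tensor)( |
| //       const Tensor& self, const Tensor& other, const Scalar& alpha) { |
| //     build_borrowing_binary_op(maybe_get_output(), self, other); |
| //   } |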
| void build_unary_float_op(const TensorBase& out, const TensorBase& a); |
| void build_borrowing_unary_float_op( |
| const TensorBase& out, |
| const TensorBase& a); |
| TORCH_DISALLOW_TEMPORARIES(build_borrowing_unary_float_op) |
| void build_unary_op(const TensorBase& out, const TensorBase& a); |
| // Odd special case needed for pow. Has to borrow the output because |
| // it's a structured kernel, but the argument is potentially a copy. |
| void build_output_borrowing_argument_owning_unary_op( |
| const TensorBase& out, |
| const TensorBase& a); |
| void build_borrowing_unary_op(const TensorBase& out, const TensorBase& a); |
| TORCH_DISALLOW_TEMPORARIES(build_borrowing_unary_op) |
| void build_borrowing_unary_force_boolean_op( |
| const TensorBase& out, |
| const TensorBase& a); |
| TORCH_DISALLOW_TEMPORARIES(build_borrowing_unary_force_boolean_op) |
| void build_comparison_op( |
| const TensorBase& out, |
| const TensorBase& a, |
| const TensorBase& b); |
| void build_borrowing_comparison_op( |
| const TensorBase& out, |
| const TensorBase& a, |
| const TensorBase& b); |
| TORCH_DISALLOW_TEMPORARIES(build_borrowing_comparison_op) |
| // Another special case: we need to own the second argument for comparison |
| // ops. |
| void build_borrowing_except_last_argument_comparison_op( |
| const TensorBase& out, |
| const TensorBase& a, |
| const TensorBase& b); |
| void build_ternary_op( |
| const TensorBase& out, |
| const TensorBase& a, |
| const TensorBase& b, |
| const TensorBase& c); |
| |
| #undef TORCH_DISALLOW_TEMPORARIES |
| protected: |
| // Mutable reference as it moves tensors out of TensorIteratorConfig |
| void populate_operands(TensorIteratorConfig&); |
| void mark_outputs(); |
| void mark_resize_outputs(const TensorIteratorConfig&); |
| void compute_mem_overlaps(const TensorIteratorConfig&); |
| void compute_shape(const TensorIteratorConfig&); |
| void compute_strides(const TensorIteratorConfig&); |
| void reorder_dimensions(); |
| void permute_dimensions(IntArrayRef perm); |
| void compute_types(const TensorIteratorConfig&); |
| ScalarType compute_common_dtype(); |
| void allocate_or_resize_outputs(); |
| bool fast_set_up(const TensorIteratorConfig&); |
| FastSetupType compute_fast_setup_type(const TensorIteratorConfig&); |
| void compute_names(const TensorIteratorConfig&); |
| void propagate_names_to_outputs(); |
| void coalesce_dimensions(); |
| |
| protected: |
| /// Records the "computation" shape of the output tensor. The computation |
| /// shape is different from the regular shape in a few ways: |
| /// |
| /// - The shape may be permuted (via permute_dimensions) so that we |
| /// process the dimensions in the most computationally efficient order |
| /// (rather than the logical order given to us by the users.) |
| /// - The shape may have adjacent dimensions collapsed (via |
| /// coalesce_dimensions) so that we minimize the number of |
| /// dimensions we have to explicitly iterate over. For example, |
| /// a pointwise operation on a contiguous tensor "computationally" |
| /// consists of only a single dimension. |
| /// |
| /// In other words, the computation shape is the output shape as it |
| /// actually matters for implementing the kernel, but not necessarily the |
| /// output shape that the user will see in the end. |
| /// |
| /// The lifecycle of mutations to shape_ in TensorIterator: |
| /// - declare_static_shape() sets an initial shape explicitly |
| /// provided by user, otherwise |
| /// - compute_shape() computes the true (non-computational) shape |
| /// specified by the user. |
| /// - reorder_dimensions() reorders dimensions to improve coalescing. |
| /// - coalesce_dimensions() then coalesces adjacent dimensions when |
| /// possible. |
| /// |
| /// The shape may also be further modified if we create sub-TensorIterators, |
| /// e.g., via narrow or select_all_keeping_dim. |
| DimVector shape_; |
| |
| /// Temporarily records the permutation computed by reorder_dimensions. |
| /// This permutation maps the computation output dimension (dim) to |
| /// the original true output dimension (perm_[dim]). It is used by |
| /// invert_perm to undo the permutation. After coalesce_dimensions is |
| /// called, the permutation is no longer valid (as, in general, there |
| /// is no permutation that will map computation dimensions to |
| /// output dimensions); methods that manipulate perm_ are obligated |
| /// to test that !has_coalesced_dimensions_. |
| DimVector perm_; |
| |
| /// Has coalesce_dimensions() (or any moral equivalent, e.g., fast_set_up()) |
| /// been called? This is SOLELY used to check validity of perm_. |
| bool has_coalesced_dimensions_ = false; |
| |
| /// Whether iteration must proceed in a fixed linear order. This disables |
| /// dimension permuting and also changes how for_each divides work among |
| /// threads. |
| bool enforce_linear_iteration_ = false; |
| |
| /// The index offsets into the original tensors for each dimension. |
| /// This is only non-zero when you narrow() a TensorIterator (e.g., |
| /// when you make sub-TensorIterators). |
| DimVector view_offsets_; |
| |
| /// The computed names of the output tensor. Computed by compute_names() |
| NameVector names_; |
| |
| /// The operands of the TensorIterator: both the inputs and outputs. The |
| /// outputs MUST come first in the operands_ list. There is always an |
| /// operand for each output of the TensorIterator, even if TensorIterator |
| /// will ultimately be responsible for allocating the output; in those |
| /// cases, tensor is simply undefined (and will be populated later |
| /// during build()). |
| /// |
| /// This list is initially populated prior to build(), but build() mutates |
| /// OperandInfo to populate more information. |
| SmallVector<OperandInfo, 4> operands_; |
| |
| /// Number of outputs in operands_ (the length of the outputs prefix |
| /// in operands_). |
| int num_outputs_ = 0; |
| |
| /// Whether or not all operands have the same shape. Having all the same |
| /// shape affects whether or not the iterator is eligible for fast setup. |
| bool all_ops_same_shape_ = false; |
| |
| /// The "computation" dtype of TensorIterator, specifying the dtype in which |
| /// we will do the internal computation in TensorIterator. Typically, |
| /// this matches the dtype of the output tensors, but not always! |
| ScalarType common_dtype_ = ScalarType::Undefined; |
| |
| /// This is currently defined as kCPU, or the device of the first non-CPU |
| /// tensor argument. See TensorIteratorBase::compute_types for details. |
| Device common_device_ = kCPU; |
| |
| /// Set by split(), see should_accumulate() and is_final_output() |
| bool accumulate_ = false; |
| bool final_output_ = true; |
| |
| // From TensorIteratorConfig |
| bool is_reduction_ = false; |
| |
| /// Set by populate_operands(), says if we're handling meta tensors |
| bool is_meta_ = false; |
| }; |
| |
| struct TORCH_API TensorIterator final : public TensorIteratorBase { |
| TensorIterator() : TensorIteratorBase() {} |
| // Slicing is OK, TensorIterator guaranteed NOT to have any fields |
| TensorIterator(const TensorIteratorBase& iter) : TensorIteratorBase(iter) {} |
| |
| #define TORCH_DISALLOW_TEMPORARIES(methodname) \ |
| TORCH_DISALLOW_TEMPORARIES_IMPL(methodname, static) |
| |
| static TensorIterator binary_float_op( |
| TensorBase& out, |
| const TensorBase& a, |
| const TensorBase& b); |
| static TensorIterator binary_op( |
| TensorBase& out, |
| const TensorBase& a, |
| const TensorBase& b); |
| static TensorIterator borrowing_binary_op( |
| const TensorBase& out, |
| const TensorBase& a, |
| const TensorBase& b); |
| TORCH_DISALLOW_TEMPORARIES(borrowing_binary_op) |
| static TensorIterator comparison_op( |
| TensorBase& out, |
| const TensorBase& a, |
| const TensorBase& b); |
| static TensorIterator unary_op(TensorBase& out, const TensorBase& a); |
| static TensorIterator unary_float_op(TensorBase& out, const TensorBase& a); |
| static TensorIterator nullary_op(TensorBase& out); |
| static TensorIterator borrowing_nullary_op(const TensorBase& out); |
| static TensorIterator borrowing_nullary_op(TensorBase&& out) = delete; |
| static TensorIterator reduce_op(TensorBase& out, const TensorBase& a); |
| static TensorIterator reduce_op( |
| TensorBase& out1, |
| TensorBase& out2, |
| const TensorBase& a); |
| #undef TORCH_DISALLOW_TEMPORARIES |
| #undef TORCH_DISALLOW_TEMPORARIES_IMPL |
| |
| const Tensor& maybe_get_output(int64_t output_idx) override; |
| void set_output_raw_strided( |
| int64_t output_idx, |
| IntArrayRef sizes, |
| IntArrayRef strides, |
| TensorOptions options, |
| DimnameList names) override; |
| }; |
| |
| class TORCH_API TensorIteratorConfig final { |
| public: |
| friend struct TensorIteratorBase; |
| friend struct TensorIterator; |
| |
| TensorIteratorConfig() {} |
| |
| C10_DISABLE_COPY_AND_ASSIGN(TensorIteratorConfig); |
| |
| /// Construction |
| // Stores input/output Tensors without incrementing the reference count. |
| // Important: the outputs have to be added before the inputs. |
| TensorIteratorConfig& add_output(const TensorBase& output) { |
| return add_borrowed_output(output); |
| } |
| TensorIteratorConfig& add_input(const TensorBase& input) { |
| return add_borrowed_input(input); |
| } |
| |
| // Borrowing from temporaries is unlikely to go well. |
| TensorIteratorConfig& add_output(TensorBase&& output) = delete; |
| TensorIteratorConfig& add_input(TensorBase&& input) = delete; |
| |
| // Stores input/output Tensors while incrementing the reference count. |
| // Note that add_{in,out}put are nearly always what you |
| // want, and the exception (adding an unnamed temporary) won't |
| // compile. |
| TensorIteratorConfig& add_owned_output(const TensorBase& output); |
| TensorIteratorConfig& add_owned_input(const TensorBase& input); |
| |
| // Advanced API: stores input/output Tensors without incrementing |
| // the reference count. The caller must ensure that these Tensors |
| // live at least as long as this TensorIteratorConfig and any |
| // TensorIteratorBase built from this TensorIteratorConfig. |
| // Important: the outputs have to be added before the inputs. |
| TensorIteratorConfig& add_borrowed_output(const TensorBase& output); |
| TensorIteratorConfig& add_borrowed_input(const TensorBase& input); |
| |
| // Borrowing from temporaries is unlikely to go well. |
| TensorIteratorConfig& add_borrowed_output(TensorBase&& output) = delete; |
| TensorIteratorConfig& add_borrowed_input(TensorBase&& input) = delete; |
| |
| // Sets the check_mem_overlap_ flag, which is true by default. |
| // If true, inputs are checked for partial overlap with the outputs and |
| // outputs are checked for internal overlap (e.g. broadcasted views). An error |
| // is raised if unacceptable overlap is detected. |
| // If you're migrating an existing operator to using TensorIterator, please |
| // consider if the previous implementation checked memory overlap. If it did |
| // not, and if the operator is idempotent (for example, Tensor.fill_(0)), then |
| // checking memory overlap is BC-breaking. Please don't check memory overlap |
| // in that case. |
| TensorIteratorConfig& set_check_mem_overlap(bool check_mem_overlap) { |
| check_mem_overlap_ = check_mem_overlap; |
| return *this; |
| } |
| |
| // Sets the check_all_same_dtype_ flag, which is true by default |
| // If true, checks that all inputs and defined outputs have the same dtype |
| // Setting either of promote_inputs_to_common_dtype_ |
| // or cast_common_dtype_to_outputs_ to true will set |
| // check_all_same_dtype_ to false. |
| TensorIteratorConfig& check_all_same_dtype(const bool _check_all_same_dtype) { |
| check_all_same_dtype_ = _check_all_same_dtype; |
| return *this; |
| } |
| |
| // Sets the check_all_same_device_ flag, which is true by default |
| // If true, all operands must be on the same device, with the possible |
| // exception of CPU scalars, which can be passed to some CUDA kernels |
| // as kernel arguments. |
| TensorIteratorConfig& check_all_same_device( |
| const bool _check_all_same_device) { |
| check_all_same_device_ = _check_all_same_device; |
| return *this; |
| } |
| |
| // Sets the enforce_safe_casting_to_output_ flag, which is false by default |
| // If true, the iterator's "common dtype" must be computable |
| // (see the [Common Dtype Computation] note) and |
| // canCast(common dtype, output dtype) must be true for all outputs. |
| TensorIteratorConfig& enforce_safe_casting_to_output( |
| const bool _enforce_safe_casting_to_output) { |
| enforce_safe_casting_to_output_ = _enforce_safe_casting_to_output; |
| return *this; |
| } |
| |
| // Sets the enforce_linear_iteration_ flag, which is false by default. |
| // If true, iteration goes in the same order as a C-contiguous tensor |
| // is laid out in memory, i.e. the last dimension iterates fastest. |
| // |
| // This iteration order can be less efficient and may even prevent |
| // vectorization. So only use it if the correctness of your kernel depends on |
| // it. |
| TensorIteratorConfig& enforce_linear_iteration( |
| const bool _enforce_linear_iteration = true) { |
| enforce_linear_iteration_ = _enforce_linear_iteration; |
| return *this; |
| } |
| |
| // Sets the promote_inputs_to_common_dtype_ flag, which is false by default |
| // If true, the iterator's "common dtype" is always computed (see the |
| // [Common Dtype Computation] note) and, on the CPU, temporary copies of |
| // the inputs in the common dtype are passed as the actual inputs to |
| // the operation. |
| // Setting this flag to true sets check_all_same_dtype_ to false. |
| TensorIteratorConfig& promote_inputs_to_common_dtype( |
| const bool _promote_inputs_to_common_dtype) { |
| promote_inputs_to_common_dtype_ = _promote_inputs_to_common_dtype; |
| if (_promote_inputs_to_common_dtype) { |
| check_all_same_dtype_ = false; |
| } |
| return *this; |
| } |
| |
| // Sets the promote_integer_inputs_to_float_ flag, which is false by default. |
| // NOTE: If set to true, promote_inputs_to_common_dtype_ must also be true. |
| // If true, and if the iterator's "common dtype" is an integral type |
| // (including bool), then it is changed to the default float scalar type. |
| TensorIteratorConfig& promote_integer_inputs_to_float( |
| const bool _promote_integer_inputs_to_float) { |
| promote_integer_inputs_to_float_ = _promote_integer_inputs_to_float; |
| TORCH_INTERNAL_ASSERT( |
| !promote_integer_inputs_to_float_ || promote_inputs_to_common_dtype_); |
| return *this; |
| } |
| |
| TensorIteratorConfig& is_reduction(const bool _is_reduction) { |
| is_reduction_ = _is_reduction; |
| return *this; |
| } |
| |
| TensorIteratorConfig& allow_cpu_scalars(const bool _allow_cpu_scalars) { |
| allow_cpu_scalars_ = _allow_cpu_scalars; |
| return *this; |
| } |
| |
| // Sets the cast_common_dtype_to_outputs_ flag, which is false by default |
| // If true, the iterator's "common dtype" must be computable |
| // (see the [Common Dtype Computation] note) and, on the CPU, temporary |
| // copies of the outputs are passed as the actual outputs to the operation. |
| // These temporaries are then copied to the original outputs after |
| // the operation is performed (see cast_outputs()). |
| // Setting this flag to true sets check_all_same_dtype_ to false. |
| TensorIteratorConfig& cast_common_dtype_to_outputs( |
| const bool _cast_common_dtype_to_outputs) { |
| cast_common_dtype_to_outputs_ = _cast_common_dtype_to_outputs; |
| if (_cast_common_dtype_to_outputs) { |
| check_all_same_dtype_ = false; |
| } |
| return *this; |
| } |
| |
| TensorIteratorConfig& resize_outputs(bool resize_outputs) { |
| resize_outputs_ = resize_outputs; |
| return *this; |
| } |
| |
| // Bypass output dtype/device computation and fix the dtype/device as |
| // specified here. |
| TensorIteratorConfig& declare_static_dtype_and_device( |
| ScalarType dtype, |
| Device device); |
| TensorIteratorConfig& declare_static_dtype(ScalarType dtype); |
| TensorIteratorConfig& declare_static_device(Device device); |
| TensorIteratorConfig& declare_static_shape(IntArrayRef shape); |
| TensorIteratorConfig& declare_static_shape( |
| IntArrayRef shape, |
| IntArrayRef squash_dims); |
| |
| // It would be better if this were && qualified, but this would be at the |
| // cost of a lot of boilerplate above. |
| TensorIterator build() { |
| TensorIterator iter; |
| iter.build(*this); |
| return iter; |
| } |
| |
| private: |
| SmallVector<c10::MaybeOwned<TensorBase>, 4> tensors_; |
| int num_outputs_ = 0; |
| int num_inputs_ = 0; |
| |
| c10::optional<DimVector> static_shape_ = c10::nullopt; |
| c10::optional<ScalarType> static_dtype_ = c10::nullopt; |
| c10::optional<Device> static_device_ = c10::nullopt; |
| bool check_mem_overlap_ = true; |
| bool allow_cpu_scalars_ = false; |
| bool is_reduction_ = false; |
| bool resize_outputs_ = true; |
| bool check_all_same_dtype_ = true; |
| bool check_all_same_device_ = true; |
| bool enforce_safe_casting_to_output_ = false; |
| bool enforce_linear_iteration_ = false; |
| bool promote_inputs_to_common_dtype_ = false; |
| bool promote_integer_inputs_to_float_ = false; |
| bool cast_common_dtype_to_outputs_ = false; |
| }; |
| |
| /// A container-like struct that acts as if it contains splits of a |
| /// TensorIterator that can use 32-bit indexing. Taken together the splits cover |
| /// the original TensorIterator. |
| struct TORCH_API SplitUntil32Bit { |
| struct TORCH_API iterator { |
| iterator() = default; |
| iterator(const TensorIteratorBase& iter); |
| iterator(iterator&&) = default; |
| |
| // Guaranteed to be a TensorIterator proper! |
| TensorIterator& operator*() const; |
| iterator& operator++(); |
| bool operator==(const iterator& other) const { |
| // two iterators are equal if they are the same object or they're both |
| // empty |
| return this == &other || (vec.empty() && other.vec.empty()); |
| } |
| // needed for C++11 range-based for loop |
| bool operator!=(const iterator& other) const { |
| return !(*this == other); |
| } |
| |
| /// stack of TensorIterators to be split |
| std::vector<std::unique_ptr<TensorIterator>> vec; |
| }; |
| |
| SplitUntil32Bit(const TensorIteratorBase& iter) : iter(iter) {} |
| |
| iterator begin() const; |
| iterator end() const; |
| |
| private: |
| const TensorIteratorBase& iter; |
| }; |
| |
| } // namespace at |
| |
| C10_CLANG_DIAGNOSTIC_POP() |