#ifndef CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_
#define CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_
#include "caffe2/core/export_caffe2_op_to_c10.h"
#include <c10/util/irange.h>
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/reducer_functors.h"
C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(LengthsSum);
C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(LengthsMean);
C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(LengthsMax);
namespace caffe2 {
template <typename TData>
class BaseInputAccessor {
public:
BaseInputAccessor() {}
bool observeInput(const Tensor& dataInput) {
data_ = dataInput.raw_data();
return dataInput.template IsType<TData>();
}
inline const TData*
getBlockPtr(int64_t in_block_size, int64_t idx, int64_t /* blocks */ = 1) {
return static_cast<const TData*>(data_) + in_block_size * idx;
}
protected:
const void* data_ = nullptr;
};
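// Usage sketch (illustrative only; `dataInput` and `D` are hypothetical): for a
// row-major float tensor of shape [N, D], observeInput() checks the dtype and
// caches the raw pointer, and getBlockPtr(D, i) then returns a pointer to the
// start of row i:
//
//   BaseInputAccessor<float> acc;
//   CAFFE_ENFORCE(acc.observeInput(dataInput)); // dtype of dataInput must be float
//   const float* row3 = acc.getBlockPtr(/*in_block_size=*/D, /*idx=*/3);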
////////////////////////////////////////////////////////////////////////////////
// Range reducer ops: leverage the fact that each input segment is contiguous
// and allow reducer functors to do something special
// Note: there are no real use cases for it yet :)
// Also, additional arguments are not supported for now
////////////////////////////////////////////////////////////////////////////////
/**
* Base implementation for segment reduction op that leverages continuity of the
* data
*
* Assumes that segments are sorted and there are no skip indices
*/
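// Worked example (illustrative; assumes a Sum range reducer, i.e. the op
// registered as SortedSegmentRangeSum):
//   DATA        = [[1, 2], [3, 4], [5, 6]]   (shape [3, 2])
//   SEGMENT_IDS = [0, 0, 1]                  (sorted, no gaps)
//   OUTPUT      = [[1+3, 2+4], [5, 6]] = [[4, 6], [5, 6]]   (shape [2, 2])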
template <
typename T,
typename SIndex,
class Context,
class RangeReducer,
class InputAccessor = BaseInputAccessor<T>>
class AbstractSortedSegmentRangeOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentRangeOp);
bool RunOnDevice() override {
auto& dataInput = Input(DATA);
auto& segment_ids = Input(SEGMENT_IDS);
CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector");
auto N = segment_ids.size(0);
CAFFE_ENFORCE_EQ(
N,
dataInput.size(0),
"SEGMENT_IDS must have the same length as outer dimension of DATA");
OPERATOR_NEEDS_FEATURE(
inputAccessor_.observeInput(dataInput),
"Unsupported input type: ",
dataInput.dtype().name(),
".");
const SIndex* s_ids = segment_ids.template data<SIndex>();
const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
auto shape = dataInput.sizes().vec();
shape[0] = K;
auto* output = Output(0, shape, at::dtype<T>());
T* out = output->template mutable_data<T>();
if (N == 0) {
return true;
}
int64_t block_size = dataInput.numel() / N;
// Assume the segments are sorted and there are no gaps
CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
for (int64_t i = 0; i < N;) {
int64_t start = i;
for (++i; i < N && s_ids[start] == s_ids[i]; ++i)
;
RangeReducer()(
block_size,
i - start,
inputAccessor_.getBlockPtr(block_size, start, i - start),
out + block_size * s_ids[start],
&context_);
// check correctness of the next segment
if (i < N) {
CAFFE_ENFORCE_EQ(
s_ids[start] + 1,
s_ids[i],
"Indices must be sorted and not have gaps");
}
}
return true;
}
static constexpr int kNumInputs = 2;
INPUT_TAGS(DATA, SEGMENT_IDS);
private:
InputAccessor inputAccessor_;
};
template <
typename T,
typename SIndex,
class Context,
class RangeReducerGradient>
class AbstractSortedSegmentRangeGradientOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentRangeGradientOp);
bool RunOnDevice() override {
// TODO(azzolini): avoid using input/output if not used by a particular op
auto& data_in = Input(DATA_IN);
auto& data_out = Input(DATA_OUT);
auto& segment_grads = Input(SEGMENT_GRADS);
auto& segment_ids = Input(SEGMENT_IDS);
CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector");
int64_t N = segment_ids.size(0);
const SIndex* s_ids = segment_ids.template data<SIndex>();
const T* s_grads = segment_grads.template data<T>();
const T* d_in = data_in.template data<T>();
const T* d_out = data_out.template data<T>();
auto shape = segment_grads.sizes().vec();
shape[0] = N;
auto* data_grads = Output(0, shape, at::dtype<T>());
const SIndex K = segment_grads.size(0);
T* out = data_grads->template mutable_data<T>();
if (N == 0) {
return true;
}
int64_t block_size = segment_grads.size_from_dim(1);
// Assume the segments are sorted and there are no gaps
CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
// repeat the check from forward op
CAFFE_ENFORCE_EQ(
K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps");
for (int64_t i = 0; i < N;) {
int64_t start = i;
for (++i; i < N && s_ids[start] == s_ids[i]; ++i)
;
auto expanded_idx = block_size * start;
auto reduced_idx = block_size * s_ids[start];
RangeReducerGradient()(
block_size,
i - start,
s_grads + reduced_idx,
out + expanded_idx,
d_in + expanded_idx,
d_out + reduced_idx,
&context_);
// check correctness of the next segment
if (i < N) {
CAFFE_ENFORCE_EQ(
s_ids[start] + 1,
s_ids[i],
"Indices must be sorted and not have gaps");
}
}
return true;
}
static constexpr int kNumInputs = 4;
INPUT_TAGS(DATA_IN, DATA_OUT, SEGMENT_GRADS, SEGMENT_IDS);
};
template <typename T, typename SIndex, typename Context, typename ReducerDef>
struct AbstractSortedSegmentRangeDef {
using OpDef = ReducerDef;
static constexpr const char* basename = "SortedSegmentRange";
static constexpr const char* doc = R"DOC(
Applies '{op}' to each segment of input tensor. In order to allow for more
efficient implementation of '{op}', the input segments have to be contiguous
and non-empty.
SEGMENT_IDS is a vector that maps each of the first dimension slices of the
DATA to a particular group (segment). Values belonging to the same segment are
aggregated together.
The first dimension of the output is equal to the number of input segments,
i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
{op_doc}
)DOC";
static void PopulateSchema(OpSchema& schema) {
schema.Input(0, "DATA", "Input tensor to be aggregated");
schema.Input(
1,
"SEGMENT_IDS",
"Vector with the same length as the first dimension of DATA "
"and values in the range 0..K-1 and in increasing order that "
"maps each slice of DATA to one of the segments");
schema.Output(
0,
"OUTPUT",
"Aggregated tensor with the first dimension of K and the "
"other dimentsions inherited from DATA");
}
using ForwardOp = AbstractSortedSegmentRangeOp<
T,
SIndex,
Context,
typename ReducerDef::template Reducer<T, Context>>;
using BackwardOp = AbstractSortedSegmentRangeGradientOp<
T,
SIndex,
Context,
typename ReducerDef::template ReducerGradient<T, Context>>;
struct GetGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
string(basename) + ReducerDef::name + "Gradient",
"",
vector<string>{I(0), O(0), GO(0), I(1)},
// no gradient on segment_ids!
vector<string>{GI(0)});
}
};
};
////////////////////////////////////////////////////////////////////////////////
// Incremental reducer ops: assume that the reducer consumes pieces of data one
// by one. Also, they support additional arguments passed to the reducer, e.g.
// scalars for weighted sum.
//
// Note: in current implementation additional inputs are considered auxiliary
// constants and have limitations:
// - there is no gradient computation for auxiliary inputs
// - auxiliary inputs aren't affected by fused embedding lookup in operations
// like sparse_sorted_segment
////////////////////////////////////////////////////////////////////////////////
/**
* @brief Simple non-segmented reduction over the first (or last) few dimensions
* of the tensor
*
* Inputs:
* 0: DATA - input embedding to do lookups in
* 1..P: AUX_ARG_<I> - optional additional arguments to be passed to the
* reducer
*
* Args:
* num_reduce_dim (default 1) - the number of dims at the front of the tensor
* (or at the back, when FirstDim == false) to reduce
*
* Output:
* Tensor without the first (or last) `num_reduce_dim` dimensions of DATA
*/
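// Shape example (illustrative): for DATA of shape [2, 3, 4],
//   ReduceFront{op} with num_reduce_dim=2 reduces over the leading 2*3 = 6
//   slices and produces an output of shape [4];
//   ReduceBack{op} with num_reduce_dim=1 reduces over the trailing dimension
//   and produces an output of shape [2, 3].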
template <
typename T,
class Context,
class Reducer,
bool FirstDim,
class InputAccessor = BaseInputAccessor<T>>
class AbstractReduceFrontOrBackOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
template <class... Args>
explicit AbstractReduceFrontOrBackOp(Args&&... args)
: Operator<Context>(std::forward<Args>(args)...),
OP_SINGLE_ARG(int, "num_reduce_dim", num_reduce_dims_, 1) {}
bool RunOnDevice() override {
auto& data = Input(0);
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class
int64_t in_block_size = FirstDim
? data.size_from_dim(num_reduce_dims_)
: data.size_to_dim(data.dim() - num_reduce_dims_);
return DispatchHelper<typename Reducer::FixedDispatch>::call(
this, in_block_size);
}
template <int FixedSize>
bool DoRunWithValue() {
auto& data = Input(0);
CAFFE_ENFORCE_LE(num_reduce_dims_, data.dim());
typename Reducer::Meta ctx(FirstDim);
ctx.observeInput(0, data, num_reduce_dims_);
for (int i = 1; i < Reducer::kInputCount; ++i) {
auto& aux_in = Input(i);
ctx.observeInput(i, aux_in, num_reduce_dims_);
}
OPERATOR_NEEDS_FEATURE(
inputAccessor_.observeInput(data),
"Unsupported input type: ",
data.dtype().name(),
".");
vector<int64_t> shape;
ctx.appendOutputShape(&shape);
auto* output = Output(0, shape, at::dtype<T>());
T* out = output->template mutable_data<T>();
const int block_size = FirstDim
? data.size_from_dim(num_reduce_dims_)
: data.size_from_dim(data.dim() - num_reduce_dims_);
const int num_blocks = block_size > 0 ? data.numel() / block_size : 0;
Reducer r(ctx, out, &context_);
for (const auto i : c10::irange(num_blocks)) {
r.template process<FixedSize>(
ctx, inputAccessor_.getBlockPtr(block_size, i), i, &context_);
}
r.template finish<FixedSize>(ctx, &context_);
return true;
}
static constexpr int kNumInputs = Reducer::kInputCount;
private:
int num_reduce_dims_;
InputAccessor inputAccessor_;
};
template <
typename T,
class Context,
class ReducerGradient,
bool FirstDim = true>
class AbstractReduceFrontOrBackGradientOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
template <class... Args>
explicit AbstractReduceFrontOrBackGradientOp(Args&&... args)
: Operator<Context>(std::forward<Args>(args)...),
OP_SINGLE_ARG(int, "num_reduce_dim", num_reduce_dims_, 1) {}
bool RunOnDevice() override {
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class
int64_t grad_block_size = Input(REDUCTION_GRAD).numel();
return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
this, grad_block_size);
}
template <int FixedSize>
bool DoRunWithValue() {
auto& reduction_grad = Input(REDUCTION_GRAD);
auto& source_shape = this->template Input<Tensor>(SOURCE_SHAPE, CPU);
typename ReducerGradient::Meta ctx(reduction_grad, 0, FirstDim);
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
auto& aux_in = Input(i);
ctx.observeOriginalInput(
ReducerGradient::originalInputs()[i],
aux_in,
nullptr, /*no grad*/
num_reduce_dims_);
}
const T* r_grad = reduction_grad.template data<T>();
CAFFE_ENFORCE_LE(num_reduce_dims_, source_shape.numel());
vector<int64_t> shape(
source_shape.template data<int64_t>(),
source_shape.template data<int64_t>() + source_shape.numel());
auto* data_grads = Output(0, shape, at::dtype<T>());
int64_t block_size = FirstDim
? data_grads->size_from_dim(num_reduce_dims_)
: data_grads->size_from_dim(data_grads->dim() - num_reduce_dims_);
int64_t block_num = block_size > 0 ? data_grads->numel() / block_size : 0;
T* out = data_grads->template mutable_data<T>();
ReducerGradient r(ctx, r_grad, &context_);
for (const auto i : c10::irange(block_num)) {
r.template fillGrad<FixedSize>(
ctx,
out + block_size * i,
i,
&context_,
FirstDim ? block_num : block_size);
}
return true;
}
static constexpr int kNumInputs =
ReducerGradient::originalInputs().size() + 2;
enum _InputTags {
REDUCTION_GRAD = ReducerGradient::originalInputs().size(),
SOURCE_SHAPE
};
private:
int num_reduce_dims_;
};
template <typename T, typename Context, typename ReducerDef>
struct AbstractReduceFrontDef {
using OpDef = ReducerDef;
static constexpr const char* basename = "ReduceFront";
static constexpr const char* doc = R"DOC(
Reduces the input tensor along the first dimension of the input tensor by
applying '{op}'. This op acts in a similar way to SortedSegment{op} and
UnsortedSegment{op} but as if all input slices belong to a single segment.
{op_doc}
)DOC";
static void PopulateSchema(OpSchema& schema) {
schema.Input(
0, "DATA", "Input tensor to be reduced on the first dimension");
schema.TensorInferenceFunction([](const OperatorDef& def,
const vector<TensorShape>& in) {
CAFFE_ENFORCE_EQ(1, in.size());
ArgumentHelper helper(def);
int num_reduce_dims = helper.GetSingleArgument<int>("num_reduce_dim", 1);
typename ReducerDef::template Reducer<T, Context>::Meta ctx(true);
vector<int64_t> out_dims = ctx.getOutputShape(in[0], num_reduce_dims);
return vector<TensorShape>{
CreateTensorShape(out_dims, in[0].data_type())};
});
ReducerDef::PopulateSchema(schema);
}
using ReducerGradient =
typename ReducerDef::template ReducerGradient<T, Context>;
using ForwardOp = AbstractReduceFrontOrBackOp<
T,
Context,
typename ReducerDef::template Reducer<T, Context>,
true>;
using BackwardOp =
AbstractReduceFrontOrBackGradientOp<T, Context, ReducerGradient, true>;
struct GetGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
// Have utility function generating these names?
string tmp_dims = "_" + O(0) + "_dims";
vector<string> grad_ins;
for (const int i : ReducerGradient::originalInputs()) {
grad_ins.push_back(I(i));
}
grad_ins.push_back(GO(0));
grad_ins.push_back(tmp_dims);
vector<Argument> args;
if (ArgumentHelper::HasArgument(def_, "num_reduce_dim")) {
args.push_back(GetArgument(def_, "num_reduce_dim"));
}
// FIXME: pass in num_reduce_dims?!
return vector<OperatorDef>{
CreateOperatorDef(
"Shape", "", vector<string>{I(0)}, vector<string>{tmp_dims}),
CreateOperatorDef(
string(basename) + ReducerDef::name + "Gradient",
"",
grad_ins,
// no gradient on auxiliary inputs for now
vector<string>{GI(0)}),
};
}
};
};
template <typename T, typename Context, typename ReducerDef>
struct AbstractReduceBackDef {
using OpDef = ReducerDef;
static constexpr const char* basename = "ReduceBack";
static constexpr const char* doc = R"DOC(
Reduces the input tensor along the last dimension of the input tensor by
applying '{op}'. This op acts in a similar way to SortedSegment{op} and
UnsortedSegment{op} but as if all input slices belong to a single segment.
{op_doc}
)DOC";
static void PopulateSchema(OpSchema& schema) {
schema.Input(
0, "DATA", "Input tensor to be reduced on the first dimension");
schema.TensorInferenceFunction([](const OperatorDef& def,
const vector<TensorShape>& in) {
CAFFE_ENFORCE_EQ(1, in.size());
ArgumentHelper helper(def);
int num_reduce_dims = helper.GetSingleArgument<int>("num_reduce_dim", 1);
typename ReducerDef::template Reducer<T, Context>::Meta ctx(false);
vector<int64_t> out_dims = ctx.getOutputShape(in[0], num_reduce_dims);
return vector<TensorShape>{
CreateTensorShape(out_dims, in[0].data_type())};
});
ReducerDef::PopulateSchema(schema);
}
using ReducerGradient =
typename ReducerDef::template ReducerGradient<T, Context>;
using ForwardOp = AbstractReduceFrontOrBackOp<
T,
Context,
typename ReducerDef::template Reducer<T, Context>,
false>;
using BackwardOp =
AbstractReduceFrontOrBackGradientOp<T, Context, ReducerGradient, false>;
struct GetGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
// Have utility function generating these names?
string tmp_dims = "_" + O(0) + "_dims";
vector<string> grad_ins;
for (const int i : ReducerGradient::originalInputs()) {
grad_ins.push_back(I(i));
}
grad_ins.push_back(GO(0));
grad_ins.push_back(tmp_dims);
vector<Argument> args;
if (ArgumentHelper::HasArgument(def_, "num_reduce_dim")) {
args.push_back(GetArgument(def_, "num_reduce_dim"));
}
// FIXME: pass in num_reduce_dims?!
return vector<OperatorDef>{
CreateOperatorDef(
"Shape", "", vector<string>{I(0)}, vector<string>{tmp_dims}),
CreateOperatorDef(
string(basename) + ReducerDef::name + "Gradient",
"",
grad_ins,
// no gradient on auxiliary inputs for now
vector<string>{GI(0)}),
};
}
};
};
/**
* @brief Segment reduction op with optional fused embedding lookup
*
* Base implementation for SortedSegmentXXX and SparseSortedSegmentXXX depending
* on SparseFused static argument.
*
* Inputs:
* 0: DATA - input embedding to do lookups in
* 1..P: AUX_ARG_<I> - optional additional arguments to be passed to the
* reducer, should have the same first dimension as
* SEGMENT_IDS (e.g. scalars in WeightedSum)
* # if SparseFused == true:
* P+1: INDICES - 1-D vector with indices to look up in DATA. Should have the
* same dimension as SEGMENT_IDS
* # P+1 if SparseFused == false:
* P+1 or P+2: SEGMENT_IDS - sorted segment ids 1-D vector
*
* Output:
* Tensor with first dimension of K, where K is the max segment id + 1. Rest
* of dimensions are decided by reducer but usually are the same size as extra
* dimensions of DATA
*/
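// Worked example (illustrative; assumes a Sum reducer, i.e. the op registered
// as SparseSortedSegmentSum):
//   DATA        = [[1, 1], [2, 2], [3, 3]]   (shape [3, 2])
//   INDICES     = [2, 0, 0]                  (rows of DATA to gather)
//   SEGMENT_IDS = [0, 0, 1]                  (sorted, no gaps)
//   OUTPUT      = [[3+1, 3+1], [1, 1]] = [[4, 4], [1, 1]]   (shape [2, 2])
// With SparseFused == false (SortedSegmentSum) the INDICES input is dropped
// and slice i of DATA simply belongs to segment SEGMENT_IDS[i].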
template <
typename T,
typename SIndex,
class Context,
class Reducer,
bool SparseFused = true,
class InputAccessor = BaseInputAccessor<T>>
class AbstractSortedSegmentOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentOp);
bool RunOnDevice() override {
if (SparseFused) {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
this, Input(INDICES));
} else {
// type doesn't matter
return DoRunWithType<int64_t>();
}
}
template <typename IndexType>
bool DoRunWithType() {
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class
int64_t in_block_size = Input(0).size_from_dim(1);
return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
this, in_block_size);
}
template <typename IndexType, int FixedSize>
bool DoRunWithValue() {
auto& dataInput = Input(0);
auto& segment_ids = Input(SEGMENT_IDS);
CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector");
int64_t N = segment_ids.size(0);
const int64_t M = dataInput.size(0);
const IndexType* idxs;
if (SparseFused) { // static if
auto& indices = Input(INDICES);
CAFFE_ENFORCE_EQ(1, indices.dim(), "INDICES must be a vector");
CAFFE_ENFORCE_EQ(
N,
indices.size(0),
"SEGMENT_IDS must have the same length as INDICES");
idxs = indices.template data<IndexType>();
} else {
CAFFE_ENFORCE_EQ(
N, M, "DATA must have the same first dimension as SEGMENT_IDS");
}
// It would probably look nicer with varargs templates but it's too much
// metaprogramming
typename Reducer::Meta ctx;
ctx.observeInput(0, dataInput, 1);
for (int i = 1; i < Reducer::kInputCount; ++i) {
auto& aux_in = Input(i);
CAFFE_ENFORCE_EQ(
N,
aux_in.size(0),
"Input ",
i,
" must have the same first dim as SEGMENT_IDS");
ctx.observeInput(i, aux_in, 1);
}
OPERATOR_NEEDS_FEATURE(
inputAccessor_.observeInput(dataInput),
"Unsupported input type: ",
dataInput.dtype().name(),
".");
const SIndex* s_ids = segment_ids.template data<SIndex>();
const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0;
vector<int64_t> shape;
shape.push_back(K);
ctx.appendOutputShape(&shape);
auto* output = Output(0, shape, at::dtype<T>());
T* out = output->template mutable_data<T>();
if (N == 0) {
return true;
}
int64_t in_block_size = dataInput.size_from_dim(1);
int64_t out_block_size = output->size_from_dim(1);
// Assume the segments are sorted and there are no gaps
CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
for (int64_t i = 0; i < N;) {
int64_t start = i;
Reducer r(ctx, out + out_block_size * s_ids[start], &context_);
for (; i < N && s_ids[start] == s_ids[i]; ++i) {
IndexType idx;
if (SparseFused) { // static if
CAFFE_ENFORCE(
0 <= idxs[i] && idxs[i] < M,
"Index out of bounds: ",
idxs[i],
", range 0 to ",
M);
idx = idxs[i];
} else {
idx = i;
}
r.template process<FixedSize>(
ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_);
}
r.template finish<FixedSize>(ctx, &context_);
// check correctness of the next segment
if (i < N) {
CAFFE_ENFORCE_EQ(
s_ids[start] + 1,
s_ids[i],
"Indices must be sorted and not have gaps");
}
}
return true;
}
enum {
INDICES = Reducer::kInputCount,
SEGMENT_IDS = Reducer::kInputCount + (SparseFused ? 1 : 0)
};
static constexpr int kSelfInputs = SparseFused ? 2 : 1;
static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
private:
InputAccessor inputAccessor_;
};
// Gradient actually doesn't depend on whether sparse lookup is fused or not
template <typename T, typename SIndex, class Context, class ReducerGradient>
class AbstractSortedSegmentGradientOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentGradientOp);
bool RunOnDevice() override {
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class
int64_t grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
this, grad_block_size);
}
template <int FixedSize>
bool DoRunWithValue() {
auto& segment_grads = Input(SEGMENT_GRADS);
auto& segment_ids = Input(SEGMENT_IDS);
CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector");
int64_t N = segment_ids.size(0);
typename ReducerGradient::Meta ctx(segment_grads, 1);
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
auto& aux_in = Input(i);
CAFFE_ENFORCE_EQ(
N,
aux_in.size(0),
"Input ",
i,
" must have the same first dim as SEGMENT_IDS");
ctx.observeOriginalInput(
ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
}
const SIndex* s_ids = segment_ids.template data<SIndex>();
const T* s_grads = segment_grads.template data<T>();
vector<int64_t> shape;
shape.push_back(N);
ctx.appendGradShape(&shape);
auto* data_grads = Output(0, shape, at::dtype<T>());
int64_t d_block_size = data_grads->size_from_dim(1);
const SIndex K = segment_grads.size(0);
int64_t s_block_size = segment_grads.size_from_dim(1);
T* out = data_grads->template mutable_data<T>();
if (N == 0) {
return true;
}
// Assume the segments are sorted and there are no gaps
CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps");
// repeat the check from forward op
CAFFE_ENFORCE_EQ(
K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps");
for (int64_t i = 0; i < N;) {
int64_t start = i;
int64_t end = start;
if (ReducerGradient::computeLength()) {
for (; end < N && s_ids[start] == s_ids[end]; ++end) {
}
}
ReducerGradient r(ctx, s_grads + s_block_size * s_ids[start], &context_);
for (; i < N && s_ids[start] == s_ids[i]; ++i) {
r.template fillGrad<FixedSize>(
ctx, out + d_block_size * i, i, &context_, end - start);
}
// check correctness of the next segment
if (i < N) {
CAFFE_ENFORCE_EQ(
s_ids[start] + 1,
s_ids[i],
"Indices must be sorted and not have gaps");
}
}
return true;
}
// Input layout:
// orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS
// orig_argXs represent original op's inputs and will be passed to the reducer
// directly
static constexpr int kNumInputs =
ReducerGradient::originalInputs().size() + 2;
enum _InputTags {
SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
SEGMENT_IDS
};
};
// base implementation of sorted/unsorted sparse/non-sparse gradient computation
template <
typename ForwardOp,
typename ReducerDef,
typename ReducerGradient,
bool Sorted,
bool SparseFused>
struct SegmentOpGetGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
CAFFE_ENFORCE(
!ReducerGradient::requiresDataInput(Def()),
"grads on aux inputs are not yet implemented for Segment operators.");
vector<string> grad_ins;
for (const int i : ReducerGradient::originalInputs()) {
grad_ins.push_back(I(i));
}
grad_ins.push_back(GO(0));
grad_ins.push_back(I(ForwardOp::SEGMENT_IDS));
vector<OperatorDef> r{CreateOperatorDef(
string(Sorted ? "SortedSegment" : "UnsortedSegment") +
ReducerDef::name + "Gradient",
"",
grad_ins,
// no gradient on segment_ids or auxiliary inputs for now
vector<string>{SparseFused ? GI_V(0) : GI(0)})};
if (SparseFused) {
SetSparse(0, I(ForwardOp::INDICES), GI_V(0));
}
return r;
}
};
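// Example of the generated gradient def (illustrative; assumes the Sum reducer,
// which has no auxiliary inputs): for a SortedSegmentSum forward op with inputs
// (DATA, SEGMENT_IDS), GetGradientDefs() emits a single
// "SortedSegmentSumGradient" op with inputs (GO(0), SEGMENT_IDS) and output
// GI(0). For the SparseFused variants the data gradient is emitted as a sparse
// gradient keyed by INDICES (GI_V(0) plus SetSparse) instead of a dense GI(0).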
template <typename T, typename SIndex, typename Context, typename ReducerDef>
struct AbstractSortedSegmentDef {
using OpDef = ReducerDef;
static constexpr const char* basename = "SortedSegment";
static constexpr const char* doc = R"DOC(
Applies '{op}' to each segment of input tensor. Segments need to be sorted and
contiguous. See also UnsortedSegment{op} that doesn't have this requirement.
SEGMENT_IDS is a vector that maps each of the first dimension slices of the
DATA to a particular group (segment). Values belonging to the same segment are
aggregated together.
The first dimension of the output is equal to the number of input segments,
i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
{op_doc}
)DOC";
static void PopulateSchema(OpSchema& schema) {
schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
schema.Input(
Reducer::kInputCount,
"SEGMENT_IDS",
"Vector with the same length as the first dimension of DATA "
"and values in the range 0..K-1 and in increasing order that "
"maps each slice of DATA to one of the segments");
schema.Output(
0,
"OUTPUT",
"Aggregated output tensor. Has the first dimension of K "
"(the number of segments).");
ReducerDef::PopulateSchema(schema);
}
using Reducer = typename ReducerDef::template Reducer<T, Context>;
using ReducerGradient =
typename ReducerDef::template ReducerGradient<T, Context>;
using ForwardOp = AbstractSortedSegmentOp<T, SIndex, Context, Reducer, false>;
using BackwardOp =
AbstractSortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
using GetGradient = SegmentOpGetGradient<
ForwardOp,
ReducerDef,
ReducerGradient,
true /*Sorted*/,
false /*SparseFused*/>;
};
template <typename T, typename SIndex, typename Context, typename ReducerDef>
struct AbstractSparseSortedSegmentDef {
using OpDef = ReducerDef;
static constexpr const char* basename = "SparseSortedSegment";
static constexpr const char* doc = R"DOC(
Pulls in slices of the input tensor, groups them into segments and applies
'{op}' to each segment. Segments need to be sorted and contiguous. See also
SparseUnsortedSegment{op} that doesn't have this requirement.
This op is basically Gather and SortedSegment{op} fused together.
INDICES should contain integers in range 0..N-1 where N is the first dimension
of DATA. INDICES represent which slices of DATA need to be pulled in.
SEGMENT_IDS is a vector that maps each referenced slice of the DATA to a
particular group (segment). Values belonging to the same segment are aggregated
together. SEGMENT_IDS should have the same dimension as INDICES.
The first dimension of the output is equal to the number of input segments,
i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor.
{op_doc}
)DOC";
static void PopulateSchema(OpSchema& schema) {
schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
schema.Input(
Reducer::kInputCount,
"INDICES",
"Integer vector containing indices of the first dimension of DATA for "
"the slices that are being aggregated");
schema.Input(
Reducer::kInputCount + 1,
"SEGMENT_IDS",
"Vector with the same length as INDICES and values in the range "
"0..K-1 and in increasing order that maps each slice of DATA referenced"
" by INDICES to one of the segments");
schema.Output(
0,
"OUTPUT",
"Aggregated output tensor. Has the first dimension of K "
"(the number of segments).");
ReducerDef::PopulateSchema(schema);
}
using Reducer = typename ReducerDef::template Reducer<T, Context>;
using ReducerGradient =
typename ReducerDef::template ReducerGradient<T, Context>;
using ForwardOp = AbstractSortedSegmentOp<T, SIndex, Context, Reducer>;
// TODO(dzhulgakov): we're registering the same class twice here,
// consider avoiding op duplication here
using BackwardOp =
AbstractSortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
using GetGradient = SegmentOpGetGradient<
ForwardOp,
ReducerDef,
ReducerGradient,
true /*Sorted*/,
true /*SparseFused*/>;
};
/**
* @brief Unsorted segment reduction op with optional fused embedding lookup
*
* Base implementation for UnsortedSegmentXXX and SparseUnsortedSegmentXXX
* depending on SparseFused static argument.
*
* Unlike the sorted version, it allows "gaps" in segment ids.
*
* Inputs:
* 0: DATA - input embedding to do lookups in
* 1..P: AUX_ARG_<I> - optional additional arguments to be passed to the
* reducer, should have the same first dimension as
* SEGMENT_IDS (e.g. scalars in WeightedSum)
* # if SparseFused == true:
* P+1: INDICES - 1-D vector with indices to look up in DATA. Should have the
* same dimension as SEGMENT_IDS
* # P+1 if SparseFused == false:
* P+1 or P+2: SEGMENT_IDS - unsorted segment ids 1-D vector
*
* Args:
* num_segments - allows overriding the first dimension of the output. If not
* set, it is inferred from the segment_ids tensor.
*
* Output:
* Tensor with first dimension of K, where K is the max segment id + 1. Rest
* of dimensions are decided by reducer but usually are the same size as extra
* dimensions of DATA
*/
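// Worked example (illustrative; assumes a Sum reducer with SparseFused == false,
// i.e. the op registered as UnsortedSegmentSum):
//   DATA        = [[1, 1], [2, 2], [3, 3]]   (shape [3, 2])
//   SEGMENT_IDS = [1, 0, 1]                  (arbitrary order)
//   OUTPUT      = [[2, 2], [1+3, 1+3]] = [[2, 2], [4, 4]]   (shape [2, 2])
// With num_segments=4 the output would instead have shape [4, 2], where rows
// 2 and 3 correspond to empty segments.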
template <
typename T,
typename SIndex,
class Context,
class Reducer,
bool SparseFused = true,
class InputAccessor = BaseInputAccessor<T>>
class AbstractUnsortedSegmentOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
template <class... Args>
explicit AbstractUnsortedSegmentOp(Args&&... args)
: Operator<Context>(std::forward<Args>(args)...),
OP_SINGLE_ARG(int, "num_segments", num_segments_, -1) {}
bool RunOnDevice() override {
if (SparseFused) {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
this, Input(INDICES));
} else {
// type doesn't matter
return DoRunWithType<int64_t>();
}
}
template <typename IndexType>
bool DoRunWithType() {
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class
int64_t in_block_size = Input(0).size_from_dim(1);
return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
this, in_block_size);
}
template <typename IndexType, int FixedSize>
bool DoRunWithValue() {
auto& data = Input(0);
auto& segment_ids = Input(SEGMENT_IDS);
CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector");
int64_t N = segment_ids.size(0);
const int64_t M = data.size(0);
const IndexType* idxs;
if (SparseFused) { // static if
auto& indices = Input(INDICES);
CAFFE_ENFORCE_EQ(1, indices.dim(), "INDICES must be a vector");
CAFFE_ENFORCE_EQ(
N,
indices.size(0),
"SEGMENT_IDS must have the same length as INDICES");
idxs = indices.template data<IndexType>();
} else {
CAFFE_ENFORCE_EQ(
N, M, "DATA must have the same first dimension as SEGMENT_IDS");
}
// It would probably look nicer with varargs templates but it's too much
// metaprogramming
typename Reducer::Meta ctx;
ctx.observeInput(0, data, 1);
for (int i = 1; i < Reducer::kInputCount; ++i) {
auto& aux_in = Input(i);
CAFFE_ENFORCE_EQ(
N,
aux_in.size(0),
"Input ",
i,
" must have the same first dim as SEGMENT_IDS");
ctx.observeInput(i, aux_in, 1);
}
const SIndex* s_ids = segment_ids.template data<SIndex>();
OPERATOR_NEEDS_FEATURE(
inputAccessor_.observeInput(data),
"Unsupported input type: ",
data.dtype().name(),
".");
// determine the number of segments
SIndex K;
if (num_segments_ != -1) {
K = num_segments_;
} else {
K = 0;
for (const auto i : c10::irange(N)) {
K = std::max(K, s_ids[i] + 1);
}
}
vector<int64_t> shape;
shape.push_back(K);
ctx.appendOutputShape(&shape);
auto* output = Output(0, shape, at::dtype<T>());
int64_t in_block_size = data.size_from_dim(1);
int64_t out_block_size = output->size_from_dim(1);
T* out = output->template mutable_data<T>();
reducers_.clear();
reducers_.reserve(K);
for (const auto i : c10::irange(K)) {
reducers_.emplace_back(ctx, out + out_block_size * i, &context_);
}
for (const auto i : c10::irange(N)) {
auto s_id = s_ids[i];
CAFFE_ENFORCE(
0 <= s_id && s_id < K,
"Segment id out of range: ",
s_id,
", range 0 to ",
K);
IndexType idx;
if (SparseFused) { // static if
CAFFE_ENFORCE(
0 <= idxs[i] && idxs[i] < M,
"Index out of bounds: ",
idxs[i],
", range 0 to ",
M);
idx = idxs[i];
} else {
idx = i;
}
reducers_[s_id].template process<FixedSize>(
ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_);
}
for (const auto i : c10::irange(K)) {
reducers_[i].template finish<FixedSize>(ctx, &context_);
}
// call reducers destructors (if there is any)
reducers_.clear();
return true;
}
enum {
INDICES = Reducer::kInputCount,
SEGMENT_IDS = Reducer::kInputCount + (SparseFused ? 1 : 0)
};
static constexpr int kSelfInputs = SparseFused ? 2 : 1;
static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
private:
int64_t num_segments_;
// member field to reuse memory
vector<Reducer> reducers_;
InputAccessor inputAccessor_;
};
// Gradient actually doesn't depend on whether sparse lookup is fused or not
template <typename T, typename SIndex, class Context, class ReducerGradient>
class AbstractUnsortedSegmentGradientOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(AbstractUnsortedSegmentGradientOp);
bool RunOnDevice() override {
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class
int64_t grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
this, grad_block_size);
}
template <int FixedSize>
bool DoRunWithValue() {
auto& segment_grads = Input(SEGMENT_GRADS);
auto& segment_ids = Input(SEGMENT_IDS);
CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector");
int64_t N = segment_ids.size(0);
typename ReducerGradient::Meta ctx(segment_grads, 1);
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
auto& aux_in = Input(i);
CAFFE_ENFORCE_EQ(
N,
aux_in.size(0),
"Input ",
i,
" must have the same first dim as SEGMENT_IDS");
ctx.observeOriginalInput(
ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
}
const SIndex* s_ids = segment_ids.template data<SIndex>();
const T* s_grads = segment_grads.template data<T>();
vector<int64_t> shape;
shape.push_back(N);
ctx.appendGradShape(&shape);
auto* data_grads = Output(0, shape, at::dtype<T>());
int64_t d_block_size = data_grads->size_from_dim(1);
const SIndex K = segment_grads.size(0);
int64_t s_block_size = segment_grads.size_from_dim(1);
T* out = data_grads->template mutable_data<T>();
if (ReducerGradient::computeLength()) {
segment_length_.resize(K, 0);
for (const auto i : c10::irange(N)) {
auto s_id = s_ids[i];
CAFFE_ENFORCE(
0 <= s_id && s_id < K,
"Segment id out of range: ",
s_id,
", range 0 to ",
K);
segment_length_[s_ids[i]]++;
}
}
reducers_.clear();
reducers_.reserve(K);
for (SIndex i = 0; i < K; ++i) {
reducers_.emplace_back(ctx, s_grads + s_block_size * i, &context_);
}
for (const auto i : c10::irange(N)) {
auto s_id = s_ids[i];
if (ReducerGradient::computeLength()) {
reducers_[s_id].template fillGrad<FixedSize>(
ctx, out + d_block_size * i, i, &context_, segment_length_[s_id]);
} else {
reducers_[s_id].template fillGrad<FixedSize>(
ctx, out + d_block_size * i, i, &context_, 0);
}
}
// call reducers destructors (if there is any)
reducers_.clear();
return true;
}
// Input layout:
// orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS
// orig_argXs represent original op's inputs and will be passed to the reducer
// directly
static constexpr int kNumInputs =
ReducerGradient::originalInputs().size() + 2;
enum _InputTags {
SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
SEGMENT_IDS
};
private:
// member field to reuse memory
vector<ReducerGradient> reducers_;
vector<int> segment_length_;
};
template <typename T, typename SIndex, typename Context, typename ReducerDef>
struct AbstractUnsortedSegmentDef {
using OpDef = ReducerDef;
static constexpr const char* basename = "UnsortedSegment";
static constexpr const char* doc = R"DOC(
Applies '{op}' to each segment of input tensor. Segments ids can appear in
arbitrary order (unlike in SortedSegment{op}).
SEGMENT_IDS is a vector that maps each of the first dimension slices of the
DATA to a particular group (segment). Values belonging to the same segment are
aggregated together.
If the `num_segments` argument is passed, it is used as the first dimension of
the output. Otherwise, it is calculated dynamically as the max value of
SEGMENT_IDS plus one. Other output dimensions are inherited from the input
tensor.
{op_doc}
)DOC";
static void PopulateSchema(OpSchema& schema) {
schema.Arg(
"num_segments",
"Optional int argument specifying the number of output segments and "
"thus the first dimension of the output");
schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
schema.Input(
Reducer::kInputCount,
"SEGMENT_IDS",
"Integer vector with the same length as the first dimension of DATA "
"that maps each slice of DATA to one of the segments");
schema.Output(
0,
"OUTPUT",
"Aggregated output tensor. Has the first dimension of equal to the "
"number of segments.");
ReducerDef::PopulateSchema(schema);
}
using Reducer = typename ReducerDef::template Reducer<T, Context>;
using ReducerGradient =
typename ReducerDef::template ReducerGradient<T, Context>;
using ForwardOp = AbstractUnsortedSegmentOp<
T,
SIndex,
Context,
typename ReducerDef::template Reducer<T, Context>,
false>;
using BackwardOp =
AbstractUnsortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
using GetGradient = SegmentOpGetGradient<
ForwardOp,
ReducerDef,
ReducerGradient,
false /*Sorted*/,
false /*SparseFused*/>;
};
template <typename T, typename SIndex, typename Context, typename ReducerDef>
struct AbstractSparseUnsortedSegmentDef {
using OpDef = ReducerDef;
static constexpr const char* basename = "SparseUnsortedSegment";
static constexpr const char* doc = R"DOC(
Pulls in slices of the input tensor, groups them into segments and applies
'{op}' to each segment. Segments ids can appear in arbitrary order (unlike in
SparseSortedSegment{op}).
This op is basically Gather and UnsortedSegment{op} fused together.
INDICES should contain integers in range 0..N-1 where N is the first dimension
of DATA. INDICES represent which slices of DATA need to be pulled in.
SEGMENT_IDS is a vector that maps each referenced slice of the DATA to a
particular group (segment). Values belonging to the same segment are aggregated
together. SEGMENT_IDS should have the same dimension as INDICES.
If the `num_segments` argument is passed, it is used as the first dimension of
the output. Otherwise, it is calculated dynamically as the max value of
SEGMENT_IDS plus one. Other output dimensions are inherited from the input
tensor.
{op_doc}
)DOC";
static void PopulateSchema(OpSchema& schema) {
schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
schema.Input(
Reducer::kInputCount,
"INDICES",
"Integer vector containing indices of the first dimension of DATA for "
"the slices that are being aggregated");
schema.Input(
Reducer::kInputCount + 1,
"SEGMENT_IDS",
"Integer vector with the same length as INDICES that maps each slice "
"of DATA referenced by INDICES to one of the segments");
schema.Output(
0,
"OUTPUT",
"Aggregated output tensor. Has the first dimension of equal to the "
"number of segments.");
ReducerDef::PopulateSchema(schema);
}
using Reducer = typename ReducerDef::template Reducer<T, Context>;
using ReducerGradient =
typename ReducerDef::template ReducerGradient<T, Context>;
using ForwardOp = AbstractUnsortedSegmentOp<T, SIndex, Context, Reducer>;
// TODO(dzhulgakov): we're registering the same class twice here,
// consider avoiding op duplication here
using BackwardOp =
AbstractUnsortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>;
using GetGradient = SegmentOpGetGradient<
ForwardOp,
ReducerDef,
ReducerGradient,
false /*Sorted*/,
true /*SparseFused*/>;
};
/**
* @brief Segment reduction op with optional fused embedding lookup
*
* Base implementation for LengthsXXX and SparseLengthsXXX depending
* on SparseFused static argument.
*
* Inputs:
* 0: DATA - input embedding to do lookups in
* 1..P: AUX_ARG_<I> - optional additional arguments to be passed to the
* reducer, should have the same first dimension as
* LENGTHS (e.g. scalars in WeightedSum)
* # if SparseFused == true:
* P+1: INDICES - 1-D vector with indices to look up in DATA. Should have the
* same dimension as LENGTHS
* # P+1 if SparseFused == false:
* P+1 or P+2: LENGTHS - 1-D vector of segment lengths over the indices
*
* Output:
* Tensor with first dimension of K, where K = len(LENGTHS). Rest
* of dimensions are decided by reducer but usually are the same size as extra
* dimensions of DATA
*/
// TODO(dzhulgakov): for now it's implemented with incremental reducers because
// of fused sparse support. But using "lengths" representation actually implies
// continuous segments and thus range reducers can be used for non-sparse
// version.
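// Reference semantics (a minimal standalone sketch, assuming a plain Sum
// reduction over float data and <vector>; it mirrors what the operator below
// computes but is independent of the reducer machinery):
//
//   // rows:    the data viewed as [num_rows][block_size]
//   // indices: rows to gather; empty means the non-sparse (Lengths) version
//   // lengths: one entry per output segment; sum(lengths) must equal the
//   //          number of gathered rows
//   std::vector<float> LengthsSumRef(
//       const std::vector<std::vector<float>>& rows,
//       const std::vector<int64_t>& indices,
//       const std::vector<int>& lengths) {
//     const size_t block = rows.empty() ? 0 : rows[0].size();
//     std::vector<float> out(lengths.size() * block, 0.0f);
//     size_t pos = 0;
//     for (size_t seg = 0; seg < lengths.size(); ++seg) {
//       for (int j = 0; j < lengths[seg]; ++j, ++pos) {
//         const auto& row = indices.empty() ? rows[pos] : rows[indices[pos]];
//         for (size_t d = 0; d < block; ++d) {
//           out[seg * block + d] += row[d];
//         }
//       }
//     }
//     return out;
//   }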
template <
typename TData,
typename TLengths,
class Context,
class Reducer,
bool SparseFused = true,
class InputAccessor = BaseInputAccessor<TData>>
class AbstractLengthsOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(AbstractLengthsOp);
bool RunOnDevice() override {
if (SparseFused) {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
this, Input(INDICES));
} else {
// type doesn't matter
return DoRunWithType<int64_t>();
}
}
template <typename IndexType>
bool DoRunWithType() {
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class
int64_t in_block_size = Input(0).size_from_dim(1);
return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call(
this, in_block_size);
}
template <typename IndexType, int FixedSize>
bool DoRunWithValue() {
auto& dataInput = Input(0);
auto& lengthsInput = Input(LENGTHS);
CAFFE_ENFORCE_EQ(1, lengthsInput.dim(), "LENGTHS must be a vector");
const int64_t dataSize = dataInput.size(0);
// Either the first dim of the data or how many rows we pull in via indices
int64_t dataToReduceSize;
const int64_t outputSize = lengthsInput.size(0);
const IndexType* indices;
if (SparseFused) { // static if
auto& indicesInput = Input(INDICES);
CAFFE_ENFORCE_EQ(1, indicesInput.dim(), "INDICES must be a vector");
indices = indicesInput.template data<IndexType>();
dataToReduceSize = indicesInput.size(0);
} else {
dataToReduceSize = dataSize;
}
typename Reducer::Meta ctx;
ctx.observeInput(0, dataInput, 1);
for (int i = 1; i < Reducer::kInputCount; ++i) {
auto& aux_in = Input(i);
CAFFE_ENFORCE(
dataToReduceSize == aux_in.size(0),
"Input ",
i,
" must have the same first dim as SEGMENT_IDS");
ctx.observeInput(i, aux_in, 1);
}
const TLengths* lengths = lengthsInput.template data<TLengths>();
OPERATOR_NEEDS_FEATURE(
inputAccessor_.observeInput(dataInput),
"Unsupported input type: ",
dataInput.dtype().name(),
".");
vector<int64_t> shape{outputSize};
ctx.appendOutputShape(&shape);
auto* output = Output(0, shape, at::dtype<TData>());
int64_t in_block_size = dataInput.size_from_dim(1);
int64_t out_block_size = output->size_from_dim(1);
TData* out = output->template mutable_data<TData>();
int64_t dataIndex = 0;
for (const auto rangeIndex : c10::irange(outputSize)) {
Reducer reducer(ctx, out + out_block_size * rangeIndex, &context_);
for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex];
++dataIndex) {
IndexType idx;
if (SparseFused) { // static if
idx = indices[dataIndex];
CAFFE_ENFORCE(
0 <= idx && idx < dataSize,
"The ",
dataIndex,
"th index from the input indices is out of bounds: ",
idx,
" vs. valid range 0 to ",
dataSize);
} else {
idx = dataIndex;
CAFFE_ENFORCE(
0 <= idx && idx < dataSize,
"When calculating the ",
rangeIndex,
"th output with length=",
lengths[rangeIndex],
", the index is out of bounds: ",
idx,
" vs. valid range 0 to ",
dataSize);
}
const TData* input = inputAccessor_.getBlockPtr(in_block_size, idx);
reducer.template process<FixedSize>(ctx, input, dataIndex, &context_);
}
reducer.template finish<FixedSize>(ctx, &context_);
}
CAFFE_ENFORCE(
dataIndex == dataToReduceSize, dataIndex, " != ", dataToReduceSize);
return true;
}
enum {
INDICES = Reducer::kInputCount,
LENGTHS = Reducer::kInputCount + (SparseFused ? 1 : 0)
};
static constexpr int kSelfInputs = SparseFused ? 2 : 1;
static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs;
private:
InputAccessor inputAccessor_;
};
/*
* Some notes:
* 1. The gradient actually doesn't depend on whether sparse lookup is fused or not
* 2. INDICES are not used in the CPU version, but they are needed in the async CUDA
* version. So we register the 3-input version for CPU as the gradient op for
* GPU/CPU conversion. We then register the 2-input version for CPU for backward
* compatibility with older nets.
*/
template <
typename T,
typename TLengths,
class Context,
class ReducerGradient,
bool GradientNeedIndices = false>
class AbstractLengthsGradientOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(AbstractLengthsGradientOp);
bool RunOnDevice() override {
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class
int64_t gradBlockSize = Input(SEGMENT_GRADS).size_from_dim(1);
return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
this, gradBlockSize);
}
template <int FixedSize>
bool DoRunWithValue() {
auto& segmentGradsInput = Input(SEGMENT_GRADS);
auto& lengthsInput = Input(LENGTHS);
CAFFE_ENFORCE(lengthsInput.dim() == 1, "LENGTHS must be a vector");
int64_t reducedDataSize = 0;
int64_t numSegments = lengthsInput.size(0);
CAFFE_ENFORCE(segmentGradsInput.dim() > 0);
CAFFE_ENFORCE(numSegments == segmentGradsInput.size(0));
const TLengths* lengths = lengthsInput.template data<TLengths>();
for (const auto i : c10::irange(numSegments)) {
reducedDataSize += lengths[i];
}
typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
for (auto i = 0U; i < ReducerGradient::originalInputs().size(); ++i) {
auto& aux_in = Input(i);
CAFFE_ENFORCE_EQ(
reducedDataSize,
aux_in.size(0),
"Input ",
i,
" must have the same first dim as SEGMENT_IDS");
ctx.observeOriginalInput(
ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1);
}
const T* segmentGrads = segmentGradsInput.template data<T>();
vector<int64_t> shape;
shape.push_back(reducedDataSize);
ctx.appendGradShape(&shape);
auto* dataGradsOutput = Output(0, shape, at::dtype<T>());
int64_t dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1);
T* dataGrads = dataGradsOutput->template mutable_data<T>();
int64_t dataIndex = 0;
for (const auto rangeIndex : c10::irange(numSegments)) {
ReducerGradient reducer(
ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex];
++dataIndex) {
reducer.template fillGrad<FixedSize>(
ctx,
dataGrads + dataGradsBlockSize * dataIndex,
dataIndex,
&context_,
lengths[rangeIndex]);
}
}
CAFFE_ENFORCE(
dataIndex == reducedDataSize, dataIndex, " != ", reducedDataSize);
return true;
}
// Input layout:
// orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, LENGTHS, INDICES
// orig_argXs represent original op's inputs and will be passed to the reducer
// directly
static constexpr int kNumInputs = ReducerGradient::originalInputs().size() +
2 + (GradientNeedIndices ? 1 : 0);
enum _InputTags {
SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
LENGTHS,
INDICES
};
};
// Version of the gradient that requires the main input and thus needs to
// receive lengths, indices and other inputs
template <
typename Tembedding,
typename T,
typename TLengths,
class Context,
class ReducerGradient,
bool SparseFused = true,
bool GradientNeedIndices = false>
class AbstractLengthsWithMainInputGradientOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(AbstractLengthsWithMainInputGradientOp);
bool RunOnDevice() override {
if (SparseFused) {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
this, Input(INDICES));
} else {
// type doesn't matter
return DoRunWithType<int64_t>();
}
}
template <typename IndexType>
bool DoRunWithType() {
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class
int64_t in_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
return DispatchHelper<typename ReducerGradient::FixedDispatch, IndexType>::
call(this, in_block_size);
}
template <typename IndexType, int FixedSize>
bool DoRunWithValue() {
auto& dataInput = Input(DATA_INPUT);
auto& segmentGradsInput = Input(SEGMENT_GRADS);
auto& lengthsInput = Input(LENGTHS);
CAFFE_ENFORCE(lengthsInput.dim() == 1, "LENGTHS must be a vector");
int64_t numSegments = lengthsInput.size(0);
CAFFE_ENFORCE(segmentGradsInput.dim() > 0);
CAFFE_ENFORCE(numSegments == segmentGradsInput.size(0));
const TLengths* lengths = lengthsInput.template data<TLengths>();
typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
int aux_num = ReducerGradient::originalInputs()[i];
auto& aux_in = Input(i);
auto* aux_grad = aux_num < OutputSize() ? Output(aux_num) : nullptr;
ctx.observeOriginalInput(aux_num, aux_in, aux_grad, 1);
}
// Either the first dim of the data or how many rows we pull in via indices
int64_t dataToReduceSize;
const IndexType* indices = nullptr;
if (SparseFused) { // static if
auto& indicesInput = Input(INDICES);
indices = indicesInput.template data<IndexType>();
dataToReduceSize = indicesInput.size(0);
} else {
dataToReduceSize = dataInput.size(0);
}
const T* segmentGrads = segmentGradsInput.template data<T>();
vector<int64_t> shape;
shape.push_back(dataToReduceSize);
ctx.appendGradShape(&shape);
auto* dataGradsOutput = Output(0, shape, at::dtype<T>());
int64_t dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1);
T* dataGrads = dataGradsOutput->template mutable_data<T>();
const Tembedding* data = dataInput.template data<Tembedding>();
int64_t dataIndex = 0;
for (const auto rangeIndex : c10::irange(numSegments)) {
ReducerGradient reducer(
ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex];
++dataIndex) {
IndexType data_pos;
// No range checking, should've been verified in forward pass
if (SparseFused) { // static if
data_pos = indices[dataIndex];
} else {
data_pos = dataIndex;
}
reducer.template fillGradWithMainInput<FixedSize>(
ctx,
data + dataGradsBlockSize * data_pos,
dataGrads + dataGradsBlockSize * dataIndex,
dataIndex,
&context_,
lengths[rangeIndex]);
}
}
return true;
}
// Input layout:
// orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, LENGTHS,
// DATA_INPUT, [INDICES]
// orig_argXs represent original op's inputs and will be passed to the reducer
// directly
static constexpr int kNumInputs = ReducerGradient::originalInputs().size() +
3 + (SparseFused ? 1 : 0) + (GradientNeedIndices ? 1 : 0);
enum _InputTags {
SEGMENT_GRADS = ReducerGradient::originalInputs().size(),
LENGTHS,
DATA_INPUT,
INDICES,
};
};
// Version of gradient that requires the main input as well as the output of the
// forward op.
template <typename T, typename TLengths, class Context, class ReducerGradient>
class AbstractLengthsWithMainInputAndForwardOutputGradientOp
: public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(AbstractLengthsWithMainInputAndForwardOutputGradientOp);
bool RunOnDevice() override {
// If more complicated fixed size logic becomes necessary, it can be moved
// to the reducer class.
int64_t in_block_size = Input(SEGMENT_GRADS).size_from_dim(1);
return DispatchHelper<typename ReducerGradient::FixedDispatch>::call(
this, in_block_size);
}
template <int FixedSize>
bool DoRunWithValue() {
auto& dataInput = Input(DATA_INPUT);
auto& segmentGradsInput = Input(SEGMENT_GRADS);
auto& lengthsInput = Input(LENGTHS);
auto& forwardOutputInput = Input(FORWARD_OUTPUT);
CAFFE_ENFORCE(lengthsInput.dim() == 1, "LENGTHS must be a vector");
int64_t numSegments = lengthsInput.size(0);
CAFFE_ENFORCE(segmentGradsInput.dim() > 0);
CAFFE_ENFORCE(numSegments == segmentGradsInput.size(0));
const TLengths* lengths = lengthsInput.template data<TLengths>();
typename ReducerGradient::Meta ctx(segmentGradsInput, 1);
// NOLINTNEXTLINE(clang-diagnostic-sign-compare)
for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) {
int aux_num = ReducerGradient::originalInputs()[i];
auto& aux_in = Input(i);
auto* aux_grad = aux_num < OutputSize() ? Output(aux_num) : nullptr;
ctx.observeOriginalInput(aux_num, aux_in, aux_grad, 1);
}
CAFFE_ENFORCE(forwardOutputInput.dim() > 0);
CAFFE_ENFORCE(numSegments == forwardOutputInput.size(0));
const T* forwardOutput = forwardOutputInput.template data<T>();
int64_t dataToReduceSize = dataInput.size(0);
const T* segmentGrads = segmentGradsInput.template data<T>();
vector<int64_t> shape;
shape.push_back(dataToReduceSize);
ctx.appendGradShape(&shape);
auto* dataGradsOutput = Output(0, shape, at::dtype<T>());
int64_t dataGradsBlockSize = dataGradsOutput->size_from_dim(1);
int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1);
T* dataGrads = dataGradsOutput->template mutable_data<T>();
const T* data = dataInput.template data<T>();
int64_t dataIndex = 0;
for (const auto rangeIndex : c10::irange(numSegments)) {
ReducerGradient reducer(
ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_);
for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex];
++dataIndex) {
// No range checking, should've been verified in forward pass
reducer.template fillGradWithMainInputAndForwardOutput<FixedSize>(
ctx,
data + dataGradsBlockSize * dataIndex,
dataGrads + dataGradsBlockSize * dataIndex,
forwardOutput + segmentBlockSize * rangeIndex,
dataIndex,
&context_,
lengths[rangeIndex]);
}
}
return true;
}
// Input layout:
// orig_arg1, orig_arg2, ..., orig_argN, FORWARD_OUTPUT, SEGMENT_GRADS,
// LENGTHS, DATA_INPUT
// orig_argXs represent original op's inputs and will be passed to the reducer
// directly
static constexpr int kNumInputs =
ReducerGradient::originalInputs().size() + 4;
enum _InputTags {
FORWARD_OUTPUT = ReducerGradient::originalInputs().size(),
SEGMENT_GRADS,
LENGTHS,
DATA_INPUT,
};
};
// base implementation of sparse/non-sparse gradient computation
template <
typename ForwardOp,
typename ReducerDef,
typename ReducerGradient,
bool SparseFused,
bool GradientNeedIndices = false>
struct LengthsOpGetGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
vector<string> grad_ins;
string suffix = "Gradient";
for (const int i : ReducerGradient::originalInputs()) {
grad_ins.push_back(I(i));
}
if (ReducerGradient::requiresForwardOutput()) {
grad_ins.push_back(O(0));
CAFFE_ENFORCE(
!SparseFused,
"Forward pass output not yet supported as input for backward pass "
"for SparseLengthsXXX operators");
suffix = "AndForwardOutput" + suffix;
}
grad_ins.push_back(GO(0));
grad_ins.push_back(I(ForwardOp::LENGTHS));
bool indices_pushed = false;
if (ReducerGradient::requiresDataInput(Def())) {
grad_ins.push_back(I(0));
if (SparseFused) {
grad_ins.push_back(I(ForwardOp::INDICES));
indices_pushed = true;
}
suffix = "WithMainInput" + suffix;
}
if (GradientNeedIndices && !indices_pushed) {
if (SparseFused) {
grad_ins.push_back(I(ForwardOp::INDICES));
} else {
// Hacky: using Input as Indices, remove this after we have specialized
// cuda LengthsIndicesInGradientSumGradient
grad_ins.push_back(I(0));
}
}
vector<string> grad_outs;
grad_outs.push_back({SparseFused ? GI_V(0) : GI(0)});
int aux_grads = ReducerGradient::numAuxInputsWithGrads(Def());
for (int i = 1; i <= aux_grads; ++i) {
grad_outs.push_back(GI(i));
}
vector<OperatorDef> r{CreateOperatorDef(
string(SparseFused ? "SparseLengths" : "Lengths") +
string(GradientNeedIndices ? "IndicesInGradient" : "") +
ReducerDef::name + suffix,
"",
grad_ins,
grad_outs)};
if (SparseFused) {
SetSparse(0, I(ForwardOp::INDICES), GI_V(0));
}
return r;
}
};
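// Naming example (illustrative): for LengthsSum this maker emits a
// "LengthsSumGradient" op; for SparseLengthsSum it emits
// "SparseLengthsSumGradient" and additionally marks the data gradient as
// sparse, keyed by INDICES. Reducers whose gradient needs the main input
// and/or the forward output get the "WithMainInput" /
// "WithMainInputAndForwardOutput" suffixes, matching the gradient op classes
// defined above.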
template <
typename T,
typename SIndex,
typename Context,
typename ReducerDef,
bool GradientNeedIndices = false>
struct AbstractLengthsDef {
using OpDef = ReducerDef;
static constexpr const char* basename = "Lengths";
static constexpr const char* doc = R"DOC(
Applies '{op}' to each segment of the input tensor. Segments are defined
by their *LENGTHS*. *LENGTHS* is a vector that maps each of the slices of
*DATA* to a particular segment. Values belonging to the same segment are
aggregated together and considered for the '{op}' operation.
For example, *LENGTHS = [2, 1]* stands for segments *DATA[0..1]* and *DATA[2]*.
The sum of elements in *LENGTHS* must equal the number of elements in the first
dimension of *DATA*. The length of *OUTPUT* is equal to the number of input
segments, i.e. len(*LENGTHS*).
{op_doc}
{extra}
)DOC";
static void PopulateSchema(OpSchema& schema) {
schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
schema.Input(
Reducer::kInputCount,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA");
schema.Output(
0,
"OUTPUT",
"Aggregated output tensor. Has the first dimension of len(LENGTHS) ");
schema.TensorInferenceFunction(
[](const OperatorDef& def, const vector<TensorShape>& in) {
vector<TensorShape> out(0);
TensorShape output;
for (int d : in[Reducer::kInputCount].dims()) {
output.add_dims(d);
}
for (int j = 1; j < in[0].dims_size(); j++) {
output.add_dims(in[0].dims(j));
}
output.set_data_type(in[0].data_type());
out.push_back(output);
return out;
});
ReducerDef::PopulateSchema(schema);
}
using Reducer = typename ReducerDef::template Reducer<T, Context>;
using ReducerGradient =
typename ReducerDef::template ReducerGradient<T, Context>;
using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer, false>;
using BackwardOp =
AbstractLengthsGradientOp<T, SIndex, Context, ReducerGradient>;
using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp<
T,
T,
SIndex,
Context,
ReducerGradient,
false>;
using WithMainInputAndForwardOutputBackwardOp =
AbstractLengthsWithMainInputAndForwardOutputGradientOp<
T,
SIndex,
Context,
ReducerGradient>;
using GetGradient = LengthsOpGetGradient<
ForwardOp,
ReducerDef,
ReducerGradient,
false /*SparseFused*/,
GradientNeedIndices>;
};
OpSchema::Cost CostInferenceForSparseLengths(
const OperatorDef& def,
const vector<TensorShape>& inputs,
bool use_weight);
template <
typename T,
typename SIndex,
typename Context,
typename ReducerDef,
bool GradientNeedIndices = false>
struct AbstractSparseLengthsDef {
using OpDef = ReducerDef;
static constexpr const char* basename = "SparseLengths";
static constexpr const char* doc = R"DOC(
Pulls in slices of the input tensor, groups them into segments and applies
'{op}' to each segment. Segments are defined by their LENGTHS.
This op is basically Gather and Lengths{op} fused together.
INDICES should contain integers in range 0..N-1 where N is the first dimension
of DATA. INDICES represent which slices of DATA need to be pulled in.
LENGTHS is a vector that defines how many of the gathered slices belong to each
segment. Values belonging to the same segment are aggregated together.
sum(LENGTHS) has to match the size of INDICES.
The first dimension of the output is equal to the number of input segments,
i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor.
{op_doc}
)DOC";
static void PopulateSchema(OpSchema& schema) {
schema.Input(0, "DATA", "Input tensor, slices of which are aggregated.");
schema.Input(
Reducer::kInputCount,
"INDICES",
"Integer vector containing indices of the first dimension of DATA for "
"the slices that are being aggregated");
schema.Input(
Reducer::kInputCount + 1,
"LENGTHS",
"Non negative vector with sum of elements equal to INDICES length");
schema.Output(
0,
"OUTPUT",
"Aggregated output tensor. Has the first dimension of K "
"(the number of segments).");
schema.TensorInferenceFunction(OpSchema::NeedsAllInputShapes(
[](const OperatorDef&, const std::vector<TensorShape>& input_types) {
std::vector<TensorShape> out(1);
out[0] = input_types[0];
out[0].set_dims(0, input_types[Reducer::kInputCount + 1].dims(0));
return out;
}));
ReducerDef::PopulateSchema(schema);
schema.CostInferenceFunction(
[](const OperatorDef& def,
const vector<TensorShape>& inputs) -> OpSchema::Cost {
return CostInferenceForSparseLengths(
def, inputs, strcmp(OpDef::name, "WeightedSum") == 0);
});
}
using Reducer = typename ReducerDef::template Reducer<T, Context>;
using ReducerGradient =
typename ReducerDef::template ReducerGradient<T, Context>;
using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer>;
// TODO(dzhulgakov): we're registering the same class twice here,
// consider avoiding op duplication here
// Note: registering the 2-input version for now because of naming in the
// macro; the 3-input version will be registered separately.
/* INDICES are not used in CPU version, but they are needed in async CUDA
* version. So we register 3 input version for CPU as gradient op for
* GPU/CPU convert. We then register 2 input version for CPU for backward
* compatibility with older nets.
*/
using BackwardOp = AbstractLengthsGradientOp<
T,
SIndex,
Context,
ReducerGradient,
false /*GradientNeedIndices*/>;
using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp<
T,
T,
SIndex,
Context,
ReducerGradient>;
// Will return 3 input version. This is aligning new CPU/GPU nets.
using GetGradient = LengthsOpGetGradient<
ForwardOp,
ReducerDef,
ReducerGradient,
true /*SparseFused*/,
GradientNeedIndices>;
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_