| #ifndef CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_ |
| #define CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_ |
| |
| #include "caffe2/core/export_caffe2_op_to_c10.h" |
| #include <c10/util/irange.h> |
| #include "caffe2/core/context.h" |
| #include "caffe2/core/logging.h" |
| #include "caffe2/core/operator.h" |
| #include "caffe2/operators/reducer_functors.h" |
| |
| C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(LengthsSum); |
| C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(LengthsMean); |
| C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(LengthsMax); |
| |
| namespace caffe2 { |
| |
| template <typename TData> |
| class BaseInputAccessor { |
| public: |
| BaseInputAccessor() {} |
| |
| bool observeInput(const Tensor& dataInput) { |
| data_ = dataInput.raw_data(); |
| return dataInput.template IsType<TData>(); |
| } |
| |
| inline const TData* |
| getBlockPtr(int64_t in_block_size, int64_t idx, int64_t /* blocks */ = 1) { |
| return static_cast<const TData*>(data_) + in_block_size * idx; |
| } |
| |
| protected: |
| const void* data_ = nullptr; |
| }; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // Range reducer ops: leverage the fact that each input segment is contiguous
| // and allow reducer functors to do something special.
| // Note: there are no real use cases for this yet :)
| // Also, additional arguments are not supported for now.
| //////////////////////////////////////////////////////////////////////////////// |
| |
| /**
| * Base implementation for a segment reduction op that leverages the
| * continuity of the data.
| *
| * Assumes that segment ids are sorted and have no gaps.
| */
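| // Illustrative example (an assumption for exposition: a Sum range reducer over
| // float data):
| //   DATA        = [[1, 4],
| //                  [2, 5],
| //                  [3, 6],
| //                  [7, 8]]
| //   SEGMENT_IDS = [0, 0, 0, 1]
| // yields K = SEGMENT_IDS[-1] + 1 = 2 output rows:
| //   OUTPUT      = [[1+2+3, 4+5+6],
| //                  [7,     8    ]] = [[6, 15], [7, 8]]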
| template < |
| typename T, |
| typename SIndex, |
| class Context, |
| class RangeReducer, |
| class InputAccessor = BaseInputAccessor<T>> |
| class AbstractSortedSegmentRangeOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentRangeOp); |
| |
| bool RunOnDevice() override { |
| auto& dataInput = Input(DATA); |
| auto& segment_ids = Input(SEGMENT_IDS); |
| |
| CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector"); |
| auto N = segment_ids.size(0); |
| CAFFE_ENFORCE_EQ( |
| N, |
| dataInput.size(0), |
| "SEGMENT_IDS must have the same length as outer dimension of DATA"); |
| |
| OPERATOR_NEEDS_FEATURE( |
| inputAccessor_.observeInput(dataInput), |
| "Unsupported input type: ", |
| dataInput.dtype().name(), |
| "."); |
| |
| const SIndex* s_ids = segment_ids.template data<SIndex>(); |
| |
| const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0; |
| auto shape = dataInput.sizes().vec(); |
| shape[0] = K; |
| auto* output = Output(0, shape, at::dtype<T>()); |
| |
| T* out = output->template mutable_data<T>(); |
| |
| if (N == 0) { |
| return true; |
| } |
| |
| int64_t block_size = dataInput.numel() / N; |
| |
| // Assume the segments are sorted and there are no gaps |
| CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps"); |
| for (int64_t i = 0; i < N;) { |
| int64_t start = i; |
| for (++i; i < N && s_ids[start] == s_ids[i]; ++i) |
| ; |
| |
| RangeReducer()( |
| block_size, |
| i - start, |
| inputAccessor_.getBlockPtr(block_size, start, i - start), |
| out + block_size * s_ids[start], |
| &context_); |
| |
| // check correctness of the next segment |
| if (i < N) { |
| CAFFE_ENFORCE_EQ( |
| s_ids[start] + 1, |
| s_ids[i], |
| "Indices must be sorted and not have gaps"); |
| } |
| } |
| return true; |
| } |
| |
| static constexpr int kNumInputs = 2; |
| INPUT_TAGS(DATA, SEGMENT_IDS); |
| |
| private: |
| InputAccessor inputAccessor_; |
| }; |
| |
| template < |
| typename T, |
| typename SIndex, |
| class Context, |
| class RangeReducerGradient> |
| class AbstractSortedSegmentRangeGradientOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentRangeGradientOp); |
| |
| bool RunOnDevice() override { |
| // TODO(azzolini): avoid using input/output if not used by a particular op |
| auto& data_in = Input(DATA_IN); |
| auto& data_out = Input(DATA_OUT); |
| auto& segment_grads = Input(SEGMENT_GRADS); |
| auto& segment_ids = Input(SEGMENT_IDS); |
| |
| CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector"); |
| int64_t N = segment_ids.size(0); |
| |
| const SIndex* s_ids = segment_ids.template data<SIndex>(); |
| const T* s_grads = segment_grads.template data<T>(); |
| const T* d_in = data_in.template data<T>(); |
| const T* d_out = data_out.template data<T>(); |
| |
| auto shape = segment_grads.sizes().vec(); |
| shape[0] = N; |
| auto* data_grads = Output(0, shape, at::dtype<T>()); |
| |
| const SIndex K = segment_grads.size(0); |
| T* out = data_grads->template mutable_data<T>(); |
| |
| if (N == 0) { |
| return true; |
| } |
| |
| int64_t block_size = segment_grads.size_from_dim(1); |
| |
| // Assume the segments are sorted and there are no gaps |
| CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps"); |
| // repeat the check from forward op |
| CAFFE_ENFORCE_EQ( |
| K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps"); |
| for (int64_t i = 0; i < N;) { |
| int64_t start = i; |
| for (++i; i < N && s_ids[start] == s_ids[i]; ++i) |
| ; |
| |
| auto expanded_idx = block_size * start; |
| auto reduced_idx = block_size * s_ids[start]; |
| RangeReducerGradient()( |
| block_size, |
| i - start, |
| s_grads + reduced_idx, |
| out + expanded_idx, |
| d_in + expanded_idx, |
| d_out + reduced_idx, |
| &context_); |
| |
| // check correctness of the next segment |
| if (i < N) { |
| CAFFE_ENFORCE_EQ( |
| s_ids[start] + 1, |
| s_ids[i], |
| "Indices must be sorted and not have gaps"); |
| } |
| } |
| return true; |
| } |
| |
| static constexpr int kNumInputs = 4; |
| INPUT_TAGS(DATA_IN, DATA_OUT, SEGMENT_GRADS, SEGMENT_IDS); |
| }; |
| |
| template <typename T, typename SIndex, typename Context, typename ReducerDef> |
| struct AbstractSortedSegmentRangeDef { |
| using OpDef = ReducerDef; |
| static constexpr const char* basename = "SortedSegmentRange"; |
| static constexpr const char* doc = R"DOC( |
| Applies '{op}' to each segment of input tensor. In order to allow for more |
| efficient implementation of '{op}', the input segments have to be contiguous |
| and non-empty. |
| |
| SEGMENT_IDS is a vector that maps each of the first dimension slices of the |
| DATA to a particular group (segment). Values belonging to the same segment are |
| aggregated together. |
| |
| The first dimension of the output is equal to the number of input segments, |
| i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor. |
| |
| {op_doc} |
| )DOC"; |
| static void PopulateSchema(OpSchema& schema) { |
| schema.Input(0, "DATA", "Input tensor to be aggregated"); |
| schema.Input( |
| 1, |
| "SEGMENT_IDS", |
| "Vector with the same length as the first dimension of DATA " |
| "and values in the range 0..K-1 and in increasing order that " |
| "maps each slice of DATA to one of the segments"); |
| schema.Output( |
| 0, |
| "OUTPUT", |
| "Aggregated tensor with the first dimension of K and the " |
| "other dimentsions inherited from DATA"); |
| } |
| using ForwardOp = AbstractSortedSegmentRangeOp< |
| T, |
| SIndex, |
| Context, |
| typename ReducerDef::template Reducer<T, Context>>; |
| using BackwardOp = AbstractSortedSegmentRangeGradientOp< |
| T, |
| SIndex, |
| Context, |
| typename ReducerDef::template ReducerGradient<T, Context>>; |
| struct GetGradient : public GradientMakerBase { |
| using GradientMakerBase::GradientMakerBase; |
| vector<OperatorDef> GetGradientDefs() override { |
| return SingleGradientDef( |
| string(basename) + ReducerDef::name + "Gradient", |
| "", |
| vector<string>{I(0), O(0), GO(0), I(1)}, |
| // no gradient on segment_ids! |
| vector<string>{GI(0)}); |
| } |
| }; |
| }; |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // Incremental reducer ops: assume that the reducer consumes pieces of data one
| // by one. Also support additional arguments passed to the reducer, e.g.
| // scalars for weighted sum.
| // |
| // Note: in the current implementation additional inputs are considered auxiliary
| // constants and have limitations: |
| // - there is no gradient computation for auxiliary inputs |
| // - auxiliary inputs aren't affected by fused embedding lookup in operations |
| // like sparse_sorted_segment |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| /**
| * @brief Simple non-segmented reduction over the first or last few dimensions
| * of the tensor
| *
| * Inputs:
| * 0: DATA - input tensor to be reduced
| * 1..P: AUX_ARG_<I> - optional additional arguments to be passed to the
| * reducer
| *
| * Args:
| * num_reduce_dim (default 1) - the number of dims at the front (or back, when
| * FirstDim == false) of the tensor to reduce
| *
| * Output:
| * Tensor without the first (or last) `num_reduce_dim` dimensions of DATA
| */
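| // Illustrative example (an assumption for exposition: the Sum reducer with the
| // default num_reduce_dim = 1):
| //   DATA = [[1, 2, 3],
| //           [4, 5, 6]]                     // shape (2, 3)
| //   front reduction (FirstDim == true)  -> [5, 7, 9]  // shape (3,)
| //   back reduction  (FirstDim == false) -> [6, 15]    // shape (2,)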
| template < |
| typename T, |
| class Context, |
| class Reducer, |
| bool FirstDim, |
| class InputAccessor = BaseInputAccessor<T>> |
| class AbstractReduceFrontOrBackOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| |
| template <class... Args> |
| explicit AbstractReduceFrontOrBackOp(Args&&... args) |
| : Operator<Context>(std::forward<Args>(args)...), |
| OP_SINGLE_ARG(int, "num_reduce_dim", num_reduce_dims_, 1) {} |
| |
| bool RunOnDevice() override { |
| auto& data = Input(0); |
| // If more complicated fixed size logic becomes necessary, it can be moved |
| // to the reducer class |
| int64_t in_block_size = FirstDim |
| ? data.size_from_dim(num_reduce_dims_) |
| : data.size_to_dim(data.dim() - num_reduce_dims_); |
| return DispatchHelper<typename Reducer::FixedDispatch>::call( |
| this, in_block_size); |
| } |
| |
| template <int FixedSize> |
| bool DoRunWithValue() { |
| auto& data = Input(0); |
| |
| CAFFE_ENFORCE_LE(num_reduce_dims_, data.dim()); |
| |
| typename Reducer::Meta ctx(FirstDim); |
| ctx.observeInput(0, data, num_reduce_dims_); |
| for (int i = 1; i < Reducer::kInputCount; ++i) { |
| auto& aux_in = Input(i); |
| ctx.observeInput(i, aux_in, num_reduce_dims_); |
| } |
| |
| OPERATOR_NEEDS_FEATURE( |
| inputAccessor_.observeInput(data), |
| "Unsupported input type: ", |
| data.dtype().name(), |
| "."); |
| |
| vector<int64_t> shape; |
| ctx.appendOutputShape(&shape); |
| auto* output = Output(0, shape, at::dtype<T>()); |
| |
| T* out = output->template mutable_data<T>(); |
| |
| const int block_size = FirstDim |
| ? data.size_from_dim(num_reduce_dims_) |
| : data.size_from_dim(data.dim() - num_reduce_dims_); |
| |
| const int num_blocks = block_size > 0 ? data.numel() / block_size : 0; |
| |
| Reducer r(ctx, out, &context_); |
| for (const auto i : c10::irange(num_blocks)) { |
| r.template process<FixedSize>( |
| ctx, inputAccessor_.getBlockPtr(block_size, i), i, &context_); |
| } |
| r.template finish<FixedSize>(ctx, &context_); |
| return true; |
| } |
| |
| static constexpr int kNumInputs = Reducer::kInputCount; |
| |
| private: |
| int num_reduce_dims_; |
| InputAccessor inputAccessor_; |
| }; |
| |
| template < |
| typename T, |
| class Context, |
| class ReducerGradient, |
| bool FirstDim = true> |
| class AbstractReduceFrontOrBackGradientOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| |
| template <class... Args> |
| explicit AbstractReduceFrontOrBackGradientOp(Args&&... args) |
| : Operator<Context>(std::forward<Args>(args)...), |
| OP_SINGLE_ARG(int, "num_reduce_dim", num_reduce_dims_, 1) {} |
| |
| bool RunOnDevice() override { |
| // If more complicated fixed size logic becomes necessary, it can be moved |
| // to the reducer class |
| int64_t grad_block_size = Input(REDUCTION_GRAD).numel(); |
| return DispatchHelper<typename ReducerGradient::FixedDispatch>::call( |
| this, grad_block_size); |
| } |
| |
| template <int FixedSize> |
| bool DoRunWithValue() { |
| auto& reduction_grad = Input(REDUCTION_GRAD); |
| auto& source_shape = this->template Input<Tensor>(SOURCE_SHAPE, CPU); |
| |
| typename ReducerGradient::Meta ctx(reduction_grad, 0, FirstDim); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) { |
| auto& aux_in = Input(i); |
| ctx.observeOriginalInput( |
| ReducerGradient::originalInputs()[i], |
| aux_in, |
| nullptr, /*no grad*/ |
| num_reduce_dims_); |
| } |
| |
| const T* r_grad = reduction_grad.template data<T>(); |
| |
| CAFFE_ENFORCE_LE(num_reduce_dims_, source_shape.numel()); |
| |
| vector<int64_t> shape( |
| source_shape.template data<int64_t>(), |
| source_shape.template data<int64_t>() + source_shape.numel()); |
| |
| auto* data_grads = Output(0, shape, at::dtype<T>()); |
| |
| int64_t block_size = FirstDim |
| ? data_grads->size_from_dim(num_reduce_dims_) |
| : data_grads->size_from_dim(data_grads->dim() - num_reduce_dims_); |
| int64_t block_num = block_size > 0 ? data_grads->numel() / block_size : 0; |
| |
| T* out = data_grads->template mutable_data<T>(); |
| |
| ReducerGradient r(ctx, r_grad, &context_); |
| for (const auto i : c10::irange(block_num)) { |
| r.template fillGrad<FixedSize>( |
| ctx, |
| out + block_size * i, |
| i, |
| &context_, |
| FirstDim ? block_num : block_size); |
| } |
| return true; |
| } |
| |
| static constexpr int kNumInputs = |
| ReducerGradient::originalInputs().size() + 2; |
| enum _InputTags { |
| REDUCTION_GRAD = ReducerGradient::originalInputs().size(), |
| SOURCE_SHAPE |
| }; |
| |
| private: |
| int num_reduce_dims_; |
| }; |
| |
| template <typename T, typename Context, typename ReducerDef> |
| struct AbstractReduceFrontDef { |
| using OpDef = ReducerDef; |
| static constexpr const char* basename = "ReduceFront"; |
| static constexpr const char* doc = R"DOC( |
| Reduces the input tensor along its first dimension by applying '{op}'. This
| op acts in a similar way to SortedSegment{op} and
| UnsortedSegment{op} but as if all input slices belong to a single segment. |
| |
| {op_doc} |
| )DOC"; |
| static void PopulateSchema(OpSchema& schema) { |
| schema.Input( |
| 0, "DATA", "Input tensor to be reduced on the first dimension"); |
| schema.TensorInferenceFunction([](const OperatorDef& def, |
| const vector<TensorShape>& in) { |
| CAFFE_ENFORCE_EQ(1, in.size()); |
| ArgumentHelper helper(def); |
| int num_reduce_dims = helper.GetSingleArgument<int>("num_reduce_dim", 1); |
| typename ReducerDef::template Reducer<T, Context>::Meta ctx(true); |
| vector<int64_t> out_dims = ctx.getOutputShape(in[0], num_reduce_dims); |
| return vector<TensorShape>{ |
| CreateTensorShape(out_dims, in[0].data_type())}; |
| }); |
| ReducerDef::PopulateSchema(schema); |
| } |
| using ReducerGradient = |
| typename ReducerDef::template ReducerGradient<T, Context>; |
| using ForwardOp = AbstractReduceFrontOrBackOp< |
| T, |
| Context, |
| typename ReducerDef::template Reducer<T, Context>, |
| true>; |
| using BackwardOp = |
| AbstractReduceFrontOrBackGradientOp<T, Context, ReducerGradient, true>; |
| struct GetGradient : public GradientMakerBase { |
| using GradientMakerBase::GradientMakerBase; |
| vector<OperatorDef> GetGradientDefs() override { |
| // Have utility function generating these names? |
| string tmp_dims = "_" + O(0) + "_dims"; |
| |
| vector<string> grad_ins; |
| for (const int i : ReducerGradient::originalInputs()) { |
| grad_ins.push_back(I(i)); |
| } |
| grad_ins.push_back(GO(0)); |
| grad_ins.push_back(tmp_dims); |
| |
| vector<Argument> args; |
| if (ArgumentHelper::HasArgument(def_, "num_reduce_dim")) { |
| args.push_back(GetArgument(def_, "num_reduce_dim")); |
| } |
| // FIXME: pass in num_reduce_dims?! |
| return vector<OperatorDef>{ |
| CreateOperatorDef( |
| "Shape", "", vector<string>{I(0)}, vector<string>{tmp_dims}), |
| CreateOperatorDef( |
| string(basename) + ReducerDef::name + "Gradient", |
| "", |
| grad_ins, |
| // no gradient on auxiliary inputs for now |
| vector<string>{GI(0)}), |
| }; |
| } |
| }; |
| }; |
| |
| template <typename T, typename Context, typename ReducerDef> |
| struct AbstractReduceBackDef { |
| using OpDef = ReducerDef; |
| static constexpr const char* basename = "ReduceBack"; |
| static constexpr const char* doc = R"DOC( |
| Reduces the input tensor along the last dimension of the input tensor by |
| applying '{op}'. This op acts in a similar way to SortedSegment{op} and |
| UnsortedSegment{op} but as if all input slices belong to a single segment. |
| |
| {op_doc} |
| )DOC"; |
| static void PopulateSchema(OpSchema& schema) { |
| schema.Input( |
| 0, "DATA", "Input tensor to be reduced on the first dimension"); |
| schema.TensorInferenceFunction([](const OperatorDef& def, |
| const vector<TensorShape>& in) { |
| CAFFE_ENFORCE_EQ(1, in.size()); |
| ArgumentHelper helper(def); |
| int num_reduce_dims = helper.GetSingleArgument<int>("num_reduce_dim", 1); |
| typename ReducerDef::template Reducer<T, Context>::Meta ctx(false); |
| vector<int64_t> out_dims = ctx.getOutputShape(in[0], num_reduce_dims); |
| return vector<TensorShape>{ |
| CreateTensorShape(out_dims, in[0].data_type())}; |
| }); |
| ReducerDef::PopulateSchema(schema); |
| } |
| using ReducerGradient = |
| typename ReducerDef::template ReducerGradient<T, Context>; |
| using ForwardOp = AbstractReduceFrontOrBackOp< |
| T, |
| Context, |
| typename ReducerDef::template Reducer<T, Context>, |
| false>; |
| using BackwardOp = |
| AbstractReduceFrontOrBackGradientOp<T, Context, ReducerGradient, false>; |
| struct GetGradient : public GradientMakerBase { |
| using GradientMakerBase::GradientMakerBase; |
| vector<OperatorDef> GetGradientDefs() override { |
| // Have utility function generating these names? |
| string tmp_dims = "_" + O(0) + "_dims"; |
| |
| vector<string> grad_ins; |
| for (const int i : ReducerGradient::originalInputs()) { |
| grad_ins.push_back(I(i)); |
| } |
| grad_ins.push_back(GO(0)); |
| grad_ins.push_back(tmp_dims); |
| |
| vector<Argument> args; |
| if (ArgumentHelper::HasArgument(def_, "num_reduce_dim")) { |
| args.push_back(GetArgument(def_, "num_reduce_dim")); |
| } |
| // FIXME: pass in num_reduce_dims?! |
| return vector<OperatorDef>{ |
| CreateOperatorDef( |
| "Shape", "", vector<string>{I(0)}, vector<string>{tmp_dims}), |
| CreateOperatorDef( |
| string(basename) + ReducerDef::name + "Gradient", |
| "", |
| grad_ins, |
| // no gradient on auxiliary inputs for now |
| vector<string>{GI(0)}), |
| }; |
| } |
| }; |
| }; |
| |
| /** |
| * @brief Segment reduction op with optional fused embedding lookup |
| * |
| * Base implementation for SortedSegmentXXX and SparseSortedSegmentXXX depending |
| * on SparseFused static argument. |
| * |
| * Inputs: |
| * 0: DATA - input embedding to do lookups in |
| * 1..P: AUX_ARG_<I> - optional additional arguments to be passed to the |
| * reducer, should have the same first dimension as |
| * SEGMENT_IDS (e.g. scalars in WeightedSum) |
| * # if SparseFused == true: |
| * P+1: INDICES - 1-D vector with indices to look up in DATA. Should have the |
| * same dimension as SEGMENT_IDS |
| * # last input (P+1 if SparseFused == false, P+2 otherwise):
| * SEGMENT_IDS - sorted segment ids 1-D vector
| * |
| * Output: |
| * Tensor with first dimension of K, where K is the max segment id + 1. Rest |
| * of dimensions are decided by reducer but usually are the same size as extra |
| * dimensions of DATA |
| */ |
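| // Illustrative example (an assumption for exposition: the Sum reducer with
| // SparseFused == true, i.e. fused gather + sorted segment sum):
| //   DATA        = [[1, 1], [2, 2], [3, 3], [4, 4]]
| //   INDICES     = [3, 0, 2]   // which rows of DATA to pull in
| //   SEGMENT_IDS = [0, 0, 1]   // sorted, same length as INDICES
| //   OUTPUT      = [[4+1, 4+1],
| //                  [3,   3  ]] = [[5, 5], [3, 3]]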
| template < |
| typename T, |
| typename SIndex, |
| class Context, |
| class Reducer, |
| bool SparseFused = true, |
| class InputAccessor = BaseInputAccessor<T>> |
| class AbstractSortedSegmentOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentOp); |
| |
| bool RunOnDevice() override { |
| if (SparseFused) { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call( |
| this, Input(INDICES)); |
| } else { |
| // type doesn't matter |
| return DoRunWithType<int64_t>(); |
| } |
| } |
| |
| template <typename IndexType> |
| bool DoRunWithType() { |
| // If more complicated fixed size logic becomes necessary, it can be moved |
| // to the reducer class |
| int64_t in_block_size = Input(0).size_from_dim(1); |
| return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call( |
| this, in_block_size); |
| } |
| |
| template <typename IndexType, int FixedSize> |
| bool DoRunWithValue() { |
| auto& dataInput = Input(0); |
| auto& segment_ids = Input(SEGMENT_IDS); |
| |
| CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector"); |
| int64_t N = segment_ids.size(0); |
| const int64_t M = dataInput.size(0); |
| |
| const IndexType* idxs; |
| if (SparseFused) { // static if |
| auto& indices = Input(INDICES); |
| CAFFE_ENFORCE_EQ(1, indices.dim(), "INDICES must be a vector"); |
| CAFFE_ENFORCE_EQ( |
| N, |
| indices.size(0), |
| "SEGMENT_IDS must have the same length as INDICES"); |
| idxs = indices.template data<IndexType>(); |
| } else { |
| CAFFE_ENFORCE_EQ( |
| N, M, "DATA must have the same first dimension as SEGMENT_IDS"); |
| } |
| |
| // It would probably look nicer with varargs templates but it's too much |
| // metaprogramming |
| typename Reducer::Meta ctx; |
| ctx.observeInput(0, dataInput, 1); |
| for (int i = 1; i < Reducer::kInputCount; ++i) { |
| auto& aux_in = Input(i); |
| CAFFE_ENFORCE_EQ( |
| N, |
| aux_in.size(0), |
| "Input ", |
| i, |
| " must have the same first dim as SEGMENT_IDS"); |
| ctx.observeInput(i, aux_in, 1); |
| } |
| |
| OPERATOR_NEEDS_FEATURE( |
| inputAccessor_.observeInput(dataInput), |
| "Unsupported input type: ", |
| dataInput.dtype().name(), |
| "."); |
| |
| const SIndex* s_ids = segment_ids.template data<SIndex>(); |
| |
| const SIndex K = N > 0 ? s_ids[N - 1] + 1 : 0; |
| vector<int64_t> shape; |
| shape.push_back(K); |
| ctx.appendOutputShape(&shape); |
| auto* output = Output(0, shape, at::dtype<T>()); |
| |
| T* out = output->template mutable_data<T>(); |
| if (N == 0) { |
| return true; |
| } |
| int64_t in_block_size = dataInput.size_from_dim(1); |
| int64_t out_block_size = output->size_from_dim(1); |
| |
| // Assume the segments are sorted and there are no gaps |
| CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps"); |
| for (int64_t i = 0; i < N;) { |
| int64_t start = i; |
| |
| Reducer r(ctx, out + out_block_size * s_ids[start], &context_); |
| for (; i < N && s_ids[start] == s_ids[i]; ++i) { |
| IndexType idx; |
| if (SparseFused) { // static if |
| CAFFE_ENFORCE( |
| 0 <= idxs[i] && idxs[i] < M, |
| "Index out of bounds: ", |
| idxs[i], |
| ", range 0 to ", |
| M); |
| idx = idxs[i]; |
| } else { |
| idx = i; |
| } |
| r.template process<FixedSize>( |
| ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_); |
| } |
| |
| r.template finish<FixedSize>(ctx, &context_); |
| // check correctness of the next segment |
| if (i < N) { |
| CAFFE_ENFORCE_EQ( |
| s_ids[start] + 1, |
| s_ids[i], |
| "Indices must be sorted and not have gaps"); |
| } |
| } |
| return true; |
| } |
| |
| enum { |
| INDICES = Reducer::kInputCount, |
| SEGMENT_IDS = Reducer::kInputCount + (SparseFused ? 1 : 0) |
| }; |
| static constexpr int kSelfInputs = SparseFused ? 2 : 1; |
| static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs; |
| |
| private: |
| InputAccessor inputAccessor_; |
| }; |
| |
| // Gradient actually doesn't depend on whether sparse lookup is fused or not |
| template <typename T, typename SIndex, class Context, class ReducerGradient> |
| class AbstractSortedSegmentGradientOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(AbstractSortedSegmentGradientOp); |
| |
| bool RunOnDevice() override { |
| // If more complicated fixed size logic becomes necessary, it can be moved |
| // to the reducer class |
| int64_t grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1); |
| return DispatchHelper<typename ReducerGradient::FixedDispatch>::call( |
| this, grad_block_size); |
| } |
| |
| template <int FixedSize> |
| bool DoRunWithValue() { |
| auto& segment_grads = Input(SEGMENT_GRADS); |
| auto& segment_ids = Input(SEGMENT_IDS); |
| |
| CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector"); |
| int64_t N = segment_ids.size(0); |
| |
| typename ReducerGradient::Meta ctx(segment_grads, 1); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) { |
| auto& aux_in = Input(i); |
| CAFFE_ENFORCE_EQ( |
| N, |
| aux_in.size(0), |
| "Input ", |
| i, |
| " must have the same first dim as SEGMENT_IDS"); |
| ctx.observeOriginalInput( |
| ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1); |
| } |
| |
| const SIndex* s_ids = segment_ids.template data<SIndex>(); |
| const T* s_grads = segment_grads.template data<T>(); |
| |
| vector<int64_t> shape; |
| shape.push_back(N); |
| ctx.appendGradShape(&shape); |
| auto* data_grads = Output(0, shape, at::dtype<T>()); |
| |
| int64_t d_block_size = data_grads->size_from_dim(1); |
| const SIndex K = segment_grads.size(0); |
| int64_t s_block_size = segment_grads.size_from_dim(1); |
| T* out = data_grads->template mutable_data<T>(); |
| |
| if (N == 0) { |
| return true; |
| } |
| |
| // Assume the segments are sorted and there are no gaps |
| CAFFE_ENFORCE_EQ(0, s_ids[0], "Indices must be sorted and not have gaps"); |
| // repeat the check from forward op |
| CAFFE_ENFORCE_EQ( |
| K - 1, s_ids[N - 1], "Indices must be sorted and not have gaps"); |
| for (int64_t i = 0; i < N;) { |
| int64_t start = i; |
| int64_t end = start; |
| |
| if (ReducerGradient::computeLength()) { |
| for (; end < N && s_ids[start] == s_ids[end]; ++end) { |
| } |
| } |
| |
| ReducerGradient r(ctx, s_grads + s_block_size * s_ids[start], &context_); |
| for (; i < N && s_ids[start] == s_ids[i]; ++i) { |
| r.template fillGrad<FixedSize>( |
| ctx, out + d_block_size * i, i, &context_, end - start); |
| } |
| |
| // check correctness of the next segment |
| if (i < N) { |
| CAFFE_ENFORCE_EQ( |
| s_ids[start] + 1, |
| s_ids[i], |
| "Indices must be sorted and not have gaps"); |
| } |
| } |
| return true; |
| } |
| |
| // Input layout: |
| // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS |
| // orig_argXs represent original op's inputs and will be passed to the reducer |
| // directly |
| static constexpr int kNumInputs = |
| ReducerGradient::originalInputs().size() + 2; |
| enum _InputTags { |
| SEGMENT_GRADS = ReducerGradient::originalInputs().size(), |
| SEGMENT_IDS |
| }; |
| }; |
| |
| // base implementation of sorted/unsorted sparse/non-sparse gradient computation |
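| // For instance (illustrative): for a plain SortedSegmentSum with no auxiliary
| // reducer inputs and SparseFused == false, the maker below emits a single
| // "SortedSegmentSumGradient" op taking {GO(0), SEGMENT_IDS} and producing
| // GI(0); in the SparseFused case the data gradient is emitted as a sparse
| // gradient (GI_V) keyed by INDICES instead.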
| template < |
| typename ForwardOp, |
| typename ReducerDef, |
| typename ReducerGradient, |
| bool Sorted, |
| bool SparseFused> |
| struct SegmentOpGetGradient : public GradientMakerBase { |
| using GradientMakerBase::GradientMakerBase; |
| vector<OperatorDef> GetGradientDefs() override { |
| CAFFE_ENFORCE( |
| !ReducerGradient::requiresDataInput(Def()), |
| "grads on aux inputs are not yet implemented for Segment operators."); |
| vector<string> grad_ins; |
| for (const int i : ReducerGradient::originalInputs()) { |
| grad_ins.push_back(I(i)); |
| } |
| grad_ins.push_back(GO(0)); |
| grad_ins.push_back(I(ForwardOp::SEGMENT_IDS)); |
| vector<OperatorDef> r{CreateOperatorDef( |
| string(Sorted ? "SortedSegment" : "UnsortedSegment") + |
| ReducerDef::name + "Gradient", |
| "", |
| grad_ins, |
| // no gradient on segment_ids or auxiliary inputs for now |
| vector<string>{SparseFused ? GI_V(0) : GI(0)})}; |
| if (SparseFused) { |
| SetSparse(0, I(ForwardOp::INDICES), GI_V(0)); |
| } |
| return r; |
| } |
| }; |
| |
| template <typename T, typename SIndex, typename Context, typename ReducerDef> |
| struct AbstractSortedSegmentDef { |
| using OpDef = ReducerDef; |
| static constexpr const char* basename = "SortedSegment"; |
| static constexpr const char* doc = R"DOC( |
| Applies '{op}' to each segment of input tensor. Segments need to be sorted and |
| contiguous. See also UnsortedSegment{op} that doesn't have this requirement. |
| |
| SEGMENT_IDS is a vector that maps each of the first dimension slices of the |
| DATA to a particular group (segment). Values belonging to the same segment are |
| aggregated together. |
| |
| The first dimension of the output is equal to the number of input segments, |
| i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor. |
| |
| {op_doc} |
| )DOC"; |
| static void PopulateSchema(OpSchema& schema) { |
| schema.Input(0, "DATA", "Input tensor, slices of which are aggregated."); |
| schema.Input( |
| Reducer::kInputCount, |
| "SEGMENT_IDS", |
| "Vector with the same length as the first dimension of DATA " |
| "and values in the range 0..K-1 and in increasing order that " |
| "maps each slice of DATA to one of the segments"); |
| schema.Output( |
| 0, |
| "OUTPUT", |
| "Aggregated output tensor. Has the first dimension of K " |
| "(the number of segments)."); |
| ReducerDef::PopulateSchema(schema); |
| } |
| using Reducer = typename ReducerDef::template Reducer<T, Context>; |
| using ReducerGradient = |
| typename ReducerDef::template ReducerGradient<T, Context>; |
| using ForwardOp = AbstractSortedSegmentOp<T, SIndex, Context, Reducer, false>; |
| using BackwardOp = |
| AbstractSortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>; |
| using GetGradient = SegmentOpGetGradient< |
| ForwardOp, |
| ReducerDef, |
| ReducerGradient, |
| true /*Sorted*/, |
| false /*SparseFused*/>; |
| }; |
| |
| template <typename T, typename SIndex, typename Context, typename ReducerDef> |
| struct AbstractSparseSortedSegmentDef { |
| using OpDef = ReducerDef; |
| static constexpr const char* basename = "SparseSortedSegment"; |
| static constexpr const char* doc = R"DOC( |
| Pulls in slices of the input tensor, groups them into segments and applies |
| '{op}' to each segment. Segments need to be sorted and contiguous. See also |
| SparseUnsortedSegment{op} that doesn't have this requirement. |
| |
| This op is basically Gather and SortedSegment{op} fused together. |
| |
| INDICES should contain integers in range 0..N-1 where N is the first dimension |
| of DATA. INDICES represent which slices of DATA need to be pulled in. |
| |
| SEGMENT_IDS is a vector that maps each referenced slice of the DATA to a |
| particular group (segment). Values belonging to the same segment are aggregated |
| together. SEGMENT_IDS should have the same dimension as INDICES. |
| |
| The first dimension of the output is equal to the number of input segments, |
| i.e. `SEGMENT_IDS[-1]+1`. Other dimensions are inherited from the input tensor. |
| |
| {op_doc} |
| )DOC"; |
| static void PopulateSchema(OpSchema& schema) { |
| schema.Input(0, "DATA", "Input tensor, slices of which are aggregated."); |
| schema.Input( |
| Reducer::kInputCount, |
| "INDICES", |
| "Integer vector containing indices of the first dimension of DATA for " |
| "the slices that are being aggregated"); |
| schema.Input( |
| Reducer::kInputCount + 1, |
| "SEGMENT_IDS", |
| "Vector with the same length as INDICES and values in the range " |
| "0..K-1 and in increasing order that maps each slice of DATA referenced" |
| " by INDICES to one of the segments"); |
| schema.Output( |
| 0, |
| "OUTPUT", |
| "Aggregated output tensor. Has the first dimension of K " |
| "(the number of segments)."); |
| ReducerDef::PopulateSchema(schema); |
| } |
| using Reducer = typename ReducerDef::template Reducer<T, Context>; |
| using ReducerGradient = |
| typename ReducerDef::template ReducerGradient<T, Context>; |
| using ForwardOp = AbstractSortedSegmentOp<T, SIndex, Context, Reducer>; |
| // TODO(dzhulgakov): we're registering the same class twice here, |
| // consider avoiding op duplication here |
| using BackwardOp = |
| AbstractSortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>; |
| using GetGradient = SegmentOpGetGradient< |
| ForwardOp, |
| ReducerDef, |
| ReducerGradient, |
| true /*Sorted*/, |
| true /*SparseFused*/>; |
| }; |
| |
| /** |
| * @brief Unsorted segment reduction op with optional fused embedding lookup |
| * |
| * Base implementation for UnsortedSegmentXXX and SparseUnsortedSegmentXXX
| * depending on SparseFused static argument. |
| * |
| * Unlike the sorted version it allows "gaps" in segment ids.
| * |
| * Inputs: |
| * 0: DATA - input embedding to do lookups in |
| * 1..P: AUX_ARG_<I> - optional additional arguments to be passed to the |
| * reducer, should have the same first dimension as |
| * SEGMENT_IDS (e.g. scalars in WeightedSum) |
| * # if SparseFused == true: |
| * P+1: INDICES - 1-D vector with indices to look up in DATA. Should have the |
| * same dimension as SEGMENT_IDS |
| * # last input (P+1 if SparseFused == false, P+2 otherwise):
| * SEGMENT_IDS - unsorted segment ids 1-D vector
| * |
| * Args: |
| * num_segments - allows overriding the first dimension of the output. If not
| * set, it is inferred from the segment_ids tensor as the max id + 1.
| *
| * Output: |
| * Tensor with first dimension of K, where K is num_segments if provided,
| * otherwise the max segment id + 1. Rest of dimensions are decided by the
| * reducer but usually are the same size as extra dimensions of DATA
| */ |
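| // Illustrative example (an assumption for exposition: the Sum reducer with
| // SparseFused == false and num_segments = 4; empty segments are left at the
| // reducer's initial value, zeros for Sum):
| //   DATA        = [[1, 2], [3, 4], [5, 6]]
| //   SEGMENT_IDS = [2, 0, 2]   // arbitrary order, ids 1 and 3 unused
| //   OUTPUT      = [[3, 4],
| //                  [0, 0],
| //                  [1+5, 2+6],
| //                  [0, 0]] = [[3, 4], [0, 0], [6, 8], [0, 0]]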
| template < |
| typename T, |
| typename SIndex, |
| class Context, |
| class Reducer, |
| bool SparseFused = true, |
| class InputAccessor = BaseInputAccessor<T>> |
| class AbstractUnsortedSegmentOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| |
| template <class... Args> |
| explicit AbstractUnsortedSegmentOp(Args&&... args) |
| : Operator<Context>(std::forward<Args>(args)...), |
| OP_SINGLE_ARG(int, "num_segments", num_segments_, -1) {} |
| |
| bool RunOnDevice() override { |
| if (SparseFused) { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call( |
| this, Input(INDICES)); |
| } else { |
| // type doesn't matter |
| return DoRunWithType<int64_t>(); |
| } |
| } |
| |
| template <typename IndexType> |
| bool DoRunWithType() { |
| // If more complicated fixed size logic becomes necessary, it can be moved |
| // to the reducer class |
| int64_t in_block_size = Input(0).size_from_dim(1); |
| return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call( |
| this, in_block_size); |
| } |
| |
| template <typename IndexType, int FixedSize> |
| bool DoRunWithValue() { |
| auto& data = Input(0); |
| auto& segment_ids = Input(SEGMENT_IDS); |
| |
| CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector"); |
| int64_t N = segment_ids.size(0); |
| const int64_t M = data.size(0); |
| |
| const IndexType* idxs; |
| if (SparseFused) { // static if |
| auto& indices = Input(INDICES); |
| CAFFE_ENFORCE_EQ(1, indices.dim(), "INDICES must be a vector"); |
| CAFFE_ENFORCE_EQ( |
| N, |
| indices.size(0), |
| "SEGMENT_IDS must have the same length as INDICES"); |
| idxs = indices.template data<IndexType>(); |
| } else { |
| CAFFE_ENFORCE_EQ( |
| N, M, "DATA must have the same first dimension as SEGMENT_IDS"); |
| } |
| |
| // It would probably look nicer with varargs templates but it's too much |
| // metaprogramming |
| typename Reducer::Meta ctx; |
| ctx.observeInput(0, data, 1); |
| for (int i = 1; i < Reducer::kInputCount; ++i) { |
| auto& aux_in = Input(i); |
| CAFFE_ENFORCE_EQ( |
| N, |
| aux_in.size(0), |
| "Input ", |
| i, |
| " must have the same first dim as SEGMENT_IDS"); |
| ctx.observeInput(i, aux_in, 1); |
| } |
| |
| const SIndex* s_ids = segment_ids.template data<SIndex>(); |
| OPERATOR_NEEDS_FEATURE( |
| inputAccessor_.observeInput(data), |
| "Unsupported input type: ", |
| data.dtype().name(), |
| "."); |
| |
| // determine the number of segments |
| SIndex K; |
| if (num_segments_ != -1) { |
| K = num_segments_; |
| } else { |
| K = 0; |
| for (const auto i : c10::irange(N)) { |
| K = std::max(K, s_ids[i] + 1); |
| } |
| } |
| |
| vector<int64_t> shape; |
| shape.push_back(K); |
| ctx.appendOutputShape(&shape); |
| auto* output = Output(0, shape, at::dtype<T>()); |
| |
| int64_t in_block_size = data.size_from_dim(1); |
| int64_t out_block_size = output->size_from_dim(1); |
| T* out = output->template mutable_data<T>(); |
| |
| reducers_.clear(); |
| reducers_.reserve(K); |
| for (const auto i : c10::irange(K)) { |
| reducers_.emplace_back(ctx, out + out_block_size * i, &context_); |
| } |
| |
| for (const auto i : c10::irange(N)) { |
| auto s_id = s_ids[i]; |
| CAFFE_ENFORCE( |
| 0 <= s_id && s_id < K, |
| "Segment id out of range: ", |
| s_id, |
| ", range 0 to ", |
| K); |
| IndexType idx; |
| if (SparseFused) { // static if |
| CAFFE_ENFORCE( |
| 0 <= idxs[i] && idxs[i] < M, |
| "Index out of bounds: ", |
| idxs[i], |
| ", range 0 to ", |
| M); |
| idx = idxs[i]; |
| } else { |
| idx = i; |
| } |
| reducers_[s_id].template process<FixedSize>( |
| ctx, inputAccessor_.getBlockPtr(in_block_size, idx), i, &context_); |
| } |
| |
| for (const auto i : c10::irange(K)) { |
| reducers_[i].template finish<FixedSize>(ctx, &context_); |
| } |
| // call reducers' destructors (if there are any)
| reducers_.clear(); |
| return true; |
| } |
| |
| enum { |
| INDICES = Reducer::kInputCount, |
| SEGMENT_IDS = Reducer::kInputCount + (SparseFused ? 1 : 0) |
| }; |
| static constexpr int kSelfInputs = SparseFused ? 2 : 1; |
| static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs; |
| |
| private: |
| int64_t num_segments_; |
| // member field to reuse memory |
| vector<Reducer> reducers_; |
| InputAccessor inputAccessor_; |
| }; |
| |
| // Gradient actually doesn't depend on whether sparse lookup is fused or not |
| template <typename T, typename SIndex, class Context, class ReducerGradient> |
| class AbstractUnsortedSegmentGradientOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(AbstractUnsortedSegmentGradientOp); |
| |
| bool RunOnDevice() override { |
| // If more complicated fixed size logic becomes necessary, it can be moved |
| // to the reducer class |
| int64_t grad_block_size = Input(SEGMENT_GRADS).size_from_dim(1); |
| return DispatchHelper<typename ReducerGradient::FixedDispatch>::call( |
| this, grad_block_size); |
| } |
| |
| template <int FixedSize> |
| bool DoRunWithValue() { |
| auto& segment_grads = Input(SEGMENT_GRADS); |
| auto& segment_ids = Input(SEGMENT_IDS); |
| |
| CAFFE_ENFORCE_EQ(1, segment_ids.dim(), "SEGMENT_IDS must be a vector"); |
| int64_t N = segment_ids.size(0); |
| |
| typename ReducerGradient::Meta ctx(segment_grads, 1); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) { |
| auto& aux_in = Input(i); |
| CAFFE_ENFORCE_EQ( |
| N, |
| aux_in.size(0), |
| "Input ", |
| i, |
| " must have the same first dim as SEGMENT_IDS"); |
| ctx.observeOriginalInput( |
| ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1); |
| } |
| |
| const SIndex* s_ids = segment_ids.template data<SIndex>(); |
| const T* s_grads = segment_grads.template data<T>(); |
| |
| vector<int64_t> shape; |
| shape.push_back(N); |
| ctx.appendGradShape(&shape); |
| auto* data_grads = Output(0, shape, at::dtype<T>()); |
| |
| int64_t d_block_size = data_grads->size_from_dim(1); |
| const SIndex K = segment_grads.size(0); |
| int64_t s_block_size = segment_grads.size_from_dim(1); |
| T* out = data_grads->template mutable_data<T>(); |
| |
| if (ReducerGradient::computeLength()) { |
| segment_length_.resize(K, 0); |
| for (const auto i : c10::irange(N)) { |
| auto s_id = s_ids[i]; |
| CAFFE_ENFORCE( |
| 0 <= s_id && s_id < K, |
| "Segment id out of range: ", |
| s_id, |
| ", range 0 to ", |
| K); |
| segment_length_[s_ids[i]]++; |
| } |
| } |
| |
| reducers_.clear(); |
| reducers_.reserve(K); |
| for (SIndex i = 0; i < K; ++i) { |
| reducers_.emplace_back(ctx, s_grads + s_block_size * i, &context_); |
| } |
| |
| for (const auto i : c10::irange(N)) { |
| auto s_id = s_ids[i]; |
| if (ReducerGradient::computeLength()) { |
| reducers_[s_id].template fillGrad<FixedSize>( |
| ctx, out + d_block_size * i, i, &context_, segment_length_[s_id]); |
| } else { |
| reducers_[s_id].template fillGrad<FixedSize>( |
| ctx, out + d_block_size * i, i, &context_, 0); |
| } |
| } |
| // call reducers' destructors (if there are any)
| reducers_.clear(); |
| return true; |
| } |
| |
| // Input layout: |
| // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, SEGMENT_IDS |
| // orig_argXs represent original op's inputs and will be passed to the reducer |
| // directly |
| static constexpr int kNumInputs = |
| ReducerGradient::originalInputs().size() + 2; |
| enum _InputTags { |
| SEGMENT_GRADS = ReducerGradient::originalInputs().size(), |
| SEGMENT_IDS |
| }; |
| |
| private: |
| // member field to reuse memory |
| vector<ReducerGradient> reducers_; |
| vector<int> segment_length_; |
| }; |
| |
| template <typename T, typename SIndex, typename Context, typename ReducerDef> |
| struct AbstractUnsortedSegmentDef { |
| using OpDef = ReducerDef; |
| static constexpr const char* basename = "UnsortedSegment"; |
| static constexpr const char* doc = R"DOC( |
| Applies '{op}' to each segment of input tensor. Segments ids can appear in |
| arbitrary order (unlike in SortedSegment{op}). |
| |
| SEGMENT_IDS is a vector that maps each of the first dimension slices of the |
| DATA to a particular group (segment). Values belonging to the same segment are |
| aggregated together. |
| |
| If the `num_segments` argument is passed, it is used as the first dimension of
| the output. Otherwise, it is dynamically calculated as the max value of
| SEGMENT_IDS plus one. Other output dimensions are inherited from the input
| tensor.
| |
| {op_doc} |
| )DOC"; |
| static void PopulateSchema(OpSchema& schema) { |
| schema.Arg( |
| "num_segments", |
| "Optional int argument specifying the number of output segments and " |
| "thus the first dimension of the output"); |
| schema.Input(0, "DATA", "Input tensor, slices of which are aggregated."); |
| schema.Input( |
| Reducer::kInputCount, |
| "SEGMENT_IDS", |
| "Integer vector with the same length as the first dimension of DATA " |
| "that maps each slice of DATA to one of the segments"); |
| schema.Output( |
| 0, |
| "OUTPUT", |
| "Aggregated output tensor. Has the first dimension of equal to the " |
| "number of segments."); |
| ReducerDef::PopulateSchema(schema); |
| } |
| using Reducer = typename ReducerDef::template Reducer<T, Context>; |
| using ReducerGradient = |
| typename ReducerDef::template ReducerGradient<T, Context>; |
| using ForwardOp = AbstractUnsortedSegmentOp< |
| T, |
| SIndex, |
| Context, |
| typename ReducerDef::template Reducer<T, Context>, |
| false>; |
| using BackwardOp = |
| AbstractUnsortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>; |
| using GetGradient = SegmentOpGetGradient< |
| ForwardOp, |
| ReducerDef, |
| ReducerGradient, |
| false /*Sorted*/, |
| false /*SparseFused*/>; |
| }; |
| |
| template <typename T, typename SIndex, typename Context, typename ReducerDef> |
| struct AbstractSparseUnsortedSegmentDef { |
| using OpDef = ReducerDef; |
| static constexpr const char* basename = "SparseUnsortedSegment"; |
| static constexpr const char* doc = R"DOC( |
| Pulls in slices of the input tensor, groups them into segments and applies |
| '{op}' to each segment. Segments ids can appear in arbitrary order (unlike in |
| SparseSortedSegment{op}). |
| |
| This op is basically Gather and UnsortedSegment{op} fused together. |
| |
| INDICES should contain integers in range 0..N-1 where N is the first dimension |
| of DATA. INDICES represent which slices of DATA need to be pulled in. |
| |
| SEGMENT_IDS is a vector that maps each referenced slice of the DATA to a |
| particular group (segment). Values belonging to the same segment are aggregated |
| together. SEGMENT_IDS should have the same dimension as INDICES. |
| |
| If the `num_segments` argument is passed, it is used as the first dimension of
| the output. Otherwise, it is dynamically calculated as the max value of
| SEGMENT_IDS plus one. Other output dimensions are inherited from the input
| tensor.
| |
| {op_doc} |
| )DOC"; |
| static void PopulateSchema(OpSchema& schema) { |
| schema.Input(0, "DATA", "Input tensor, slices of which are aggregated."); |
| schema.Input( |
| Reducer::kInputCount, |
| "INDICES", |
| "Integer vector containing indices of the first dimension of DATA for " |
| "the slices that are being aggregated"); |
| schema.Input( |
| Reducer::kInputCount + 1, |
| "SEGMENT_IDS", |
| "Integer vector with the same length as INDICES that maps each slice " |
| "of DATA referenced by INDICES to one of the segments"); |
| schema.Output( |
| 0, |
| "OUTPUT", |
| "Aggregated output tensor. Has the first dimension of equal to the " |
| "number of segments."); |
| ReducerDef::PopulateSchema(schema); |
| } |
| using Reducer = typename ReducerDef::template Reducer<T, Context>; |
| using ReducerGradient = |
| typename ReducerDef::template ReducerGradient<T, Context>; |
| using ForwardOp = AbstractUnsortedSegmentOp<T, SIndex, Context, Reducer>; |
| // TODO(dzhulgakov): we're registering the same class twice here, |
| // consider avoiding op duplication here |
| using BackwardOp = |
| AbstractUnsortedSegmentGradientOp<T, SIndex, Context, ReducerGradient>; |
| using GetGradient = SegmentOpGetGradient< |
| ForwardOp, |
| ReducerDef, |
| ReducerGradient, |
| false /*Sorted*/, |
| true /*SparseFused*/>; |
| }; |
| |
| /** |
| * @brief Segment reduction op with optional fused embedding lookup |
| * |
| * Base implementation for LengthsXXX and SparseLengthsXXX depending |
| * on SparseFused static argument. |
| * |
| * Inputs: |
| * 0: DATA - input embedding to do lookups in |
| * 1..P: AUX_ARG_<I> - optional additional arguments to be passed to the |
| * reducer, should have the same first dimension as the data being
| * reduced, i.e. sum(LENGTHS) (e.g. scalars in WeightedSum)
| * # if SparseFused == true: |
| * P+1: INDICES - 1-D vector with indices to look up in DATA. Its length must
| * equal the sum of the values in LENGTHS
| * # last input (P+1 if SparseFused == false, P+2 otherwise):
| * LENGTHS - 1-D vector of segment lengths
| * |
| * Output: |
| * Tensor with first dimension of K, where K = len(LENGTHS). Rest |
| * of dimensions are decided by reducer but usually are the same size as extra |
| * dimensions of DATA |
| */ |
| // TODO(dzhulgakov): for now it's implemented with incremental reducers because |
| // of fused sparse support. But using "lengths" representation actually implies |
| // continuous segments and thus range reducers can be used for non-sparse |
| // version. |
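| // Illustrative example (an assumption for exposition: the Sum reducer with
| // SparseFused == false, i.e. the lengths-based sum). LENGTHS = [2, 3] below is
| // equivalent to SEGMENT_IDS = [0, 0, 1, 1, 1]:
| //   DATA    = [[1], [2], [3], [4], [5]]
| //   LENGTHS = [2, 3]
| //   OUTPUT  = [[1+2], [3+4+5]] = [[3], [12]]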
| |
| template < |
| typename TData, |
| typename TLengths, |
| class Context, |
| class Reducer, |
| bool SparseFused = true, |
| class InputAccessor = BaseInputAccessor<TData>> |
| class AbstractLengthsOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(AbstractLengthsOp); |
| |
| bool RunOnDevice() override { |
| if (SparseFused) { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call( |
| this, Input(INDICES)); |
| } else { |
| // type doesn't matter |
| return DoRunWithType<int64_t>(); |
| } |
| } |
| |
| template <typename IndexType> |
| bool DoRunWithType() { |
| // If more complicated fixed size logic becomes necessary, it can be moved |
| // to the reducer class |
| int64_t in_block_size = Input(0).size_from_dim(1); |
| return DispatchHelper<typename Reducer::FixedDispatch, IndexType>::call( |
| this, in_block_size); |
| } |
| |
| template <typename IndexType, int FixedSize> |
| bool DoRunWithValue() { |
| auto& dataInput = Input(0); |
| auto& lengthsInput = Input(LENGTHS); |
| |
| CAFFE_ENFORCE_EQ(1, lengthsInput.dim(), "LENGTHS must be a vector"); |
| const int64_t dataSize = dataInput.size(0); |
| // Either the first dim of the data or how many indices we pull in from it
| int64_t dataToReduceSize; |
| const int64_t outputSize = lengthsInput.size(0); |
| |
| const IndexType* indices; |
| if (SparseFused) { // static if |
| auto& indicesInput = Input(INDICES); |
| CAFFE_ENFORCE_EQ(1, indicesInput.dim(), "INDICES must be a vector"); |
| indices = indicesInput.template data<IndexType>(); |
| dataToReduceSize = indicesInput.size(0); |
| } else { |
| dataToReduceSize = dataSize; |
| } |
| |
| typename Reducer::Meta ctx; |
| ctx.observeInput(0, dataInput, 1); |
| for (int i = 1; i < Reducer::kInputCount; ++i) { |
| auto& aux_in = Input(i); |
| CAFFE_ENFORCE( |
| dataToReduceSize == aux_in.size(0), |
| "Input ", |
| i, |
| " must have the same first dim as SEGMENT_IDS"); |
| ctx.observeInput(i, aux_in, 1); |
| } |
| |
| const TLengths* lengths = lengthsInput.template data<TLengths>(); |
| |
| OPERATOR_NEEDS_FEATURE( |
| inputAccessor_.observeInput(dataInput), |
| "Unsupported input type: ", |
| dataInput.dtype().name(), |
| "."); |
| |
| vector<int64_t> shape{outputSize}; |
| ctx.appendOutputShape(&shape); |
| auto* output = Output(0, shape, at::dtype<TData>()); |
| |
| int64_t in_block_size = dataInput.size_from_dim(1); |
| int64_t out_block_size = output->size_from_dim(1); |
| TData* out = output->template mutable_data<TData>(); |
| |
| int64_t dataIndex = 0; |
| for (const auto rangeIndex : c10::irange(outputSize)) { |
| Reducer reducer(ctx, out + out_block_size * rangeIndex, &context_); |
| for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex]; |
| ++dataIndex) { |
| IndexType idx; |
| if (SparseFused) { // static if |
| idx = indices[dataIndex]; |
| CAFFE_ENFORCE( |
| 0 <= idx && idx < dataSize, |
| "The ", |
| dataIndex, |
| "th index from the input indices is out of bounds: ", |
| idx, |
| " vs. valid range 0 to ", |
| dataSize); |
| } else { |
| idx = dataIndex; |
| CAFFE_ENFORCE( |
| 0 <= idx && idx < dataSize, |
| "When calculating the ", |
| rangeIndex, |
| "th output with length=", |
| lengths[rangeIndex], |
| ", the index is out of bounds: ", |
| idx, |
| " vs. valid range 0 to ", |
| dataSize); |
| } |
| |
| const TData* input = inputAccessor_.getBlockPtr(in_block_size, idx); |
| reducer.template process<FixedSize>(ctx, input, dataIndex, &context_); |
| } |
| reducer.template finish<FixedSize>(ctx, &context_); |
| } |
| CAFFE_ENFORCE( |
| dataIndex == dataToReduceSize, dataIndex, " != ", dataToReduceSize); |
| |
| return true; |
| } |
| |
| enum { |
| INDICES = Reducer::kInputCount, |
| LENGTHS = Reducer::kInputCount + (SparseFused ? 1 : 0) |
| }; |
| static constexpr int kSelfInputs = SparseFused ? 2 : 1; |
| static constexpr int kNumInputs = Reducer::kInputCount + kSelfInputs; |
| |
| private: |
| InputAccessor inputAccessor_; |
| }; |
| |
| /*
| * Some notes:
| * 1. The gradient actually doesn't depend on whether sparse lookup is fused or
| * not.
| * 2. INDICES are not used in the CPU version, but they are needed in the async
| * CUDA version. So we register a 3-input version for CPU as the gradient op
| * for GPU/CPU conversion. We then register a 2-input version for CPU for
| * backward compatibility with older nets.
| */
| template < |
| typename T, |
| typename TLengths, |
| class Context, |
| class ReducerGradient, |
| bool GradientNeedIndices = false> |
| class AbstractLengthsGradientOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(AbstractLengthsGradientOp); |
| |
| bool RunOnDevice() override { |
| // If more complicated fixed size logic becomes necessary, it can be moved |
| // to the reducer class |
| int64_t gradBlockSize = Input(SEGMENT_GRADS).size_from_dim(1); |
| return DispatchHelper<typename ReducerGradient::FixedDispatch>::call( |
| this, gradBlockSize); |
| } |
| |
| template <int FixedSize> |
| bool DoRunWithValue() { |
| auto& segmentGradsInput = Input(SEGMENT_GRADS); |
| auto& lengthsInput = Input(LENGTHS); |
| |
| CAFFE_ENFORCE(lengthsInput.dim() == 1, "LENGTHS must be a vector"); |
| int64_t reducedDataSize = 0; |
| int64_t numSegments = lengthsInput.size(0); |
| CAFFE_ENFORCE(segmentGradsInput.dim() > 0); |
| CAFFE_ENFORCE(numSegments == segmentGradsInput.size(0)); |
| const TLengths* lengths = lengthsInput.template data<TLengths>(); |
| for (const auto i : c10::irange(numSegments)) { |
| reducedDataSize += lengths[i]; |
| } |
| |
| typename ReducerGradient::Meta ctx(segmentGradsInput, 1); |
| for (auto i = 0U; i < ReducerGradient::originalInputs().size(); ++i) { |
| auto& aux_in = Input(i); |
| CAFFE_ENFORCE_EQ( |
| reducedDataSize, |
| aux_in.size(0), |
| "Input ", |
| i, |
| " must have the same first dim as SEGMENT_IDS"); |
| ctx.observeOriginalInput( |
| ReducerGradient::originalInputs()[i], aux_in, nullptr /*no grad*/, 1); |
| } |
| |
| const T* segmentGrads = segmentGradsInput.template data<T>(); |
| |
| vector<int64_t> shape; |
| shape.push_back(reducedDataSize); |
| ctx.appendGradShape(&shape); |
| auto* dataGradsOutput = Output(0, shape, at::dtype<T>()); |
| |
| int64_t dataGradsBlockSize = dataGradsOutput->size_from_dim(1); |
| int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1); |
| T* dataGrads = dataGradsOutput->template mutable_data<T>(); |
| |
| int64_t dataIndex = 0; |
| for (const auto rangeIndex : c10::irange(numSegments)) { |
| ReducerGradient reducer( |
| ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_); |
| for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex]; |
| ++dataIndex) { |
| reducer.template fillGrad<FixedSize>( |
| ctx, |
| dataGrads + dataGradsBlockSize * dataIndex, |
| dataIndex, |
| &context_, |
| lengths[rangeIndex]); |
| } |
| } |
| CAFFE_ENFORCE( |
| dataIndex == reducedDataSize, dataIndex, " != ", reducedDataSize); |
| return true; |
| } |
| |
| // Input layout: |
| // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, LENGTHS, INDICES |
| // orig_argXs represent original op's inputs and will be passed to the reducer |
| // directly |
| static constexpr int kNumInputs = ReducerGradient::originalInputs().size() + |
| 2 + (GradientNeedIndices ? 1 : 0); |
| enum _InputTags { |
| SEGMENT_GRADS = ReducerGradient::originalInputs().size(), |
| LENGTHS, |
| INDICES |
| }; |
| }; |
| |
| // Version of the gradient that requires the main input and thus needs to
| // receive lengths, indices and the other forward inputs
| template < |
| typename Tembedding, |
| typename T, |
| typename TLengths, |
| class Context, |
| class ReducerGradient, |
| bool SparseFused = true, |
| bool GradientNeedIndices = false> |
| class AbstractLengthsWithMainInputGradientOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(AbstractLengthsWithMainInputGradientOp); |
| |
| bool RunOnDevice() override { |
| if (SparseFused) { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call( |
| this, Input(INDICES)); |
| } else { |
| // type doesn't matter |
| return DoRunWithType<int64_t>(); |
| } |
| } |
| |
| template <typename IndexType> |
| bool DoRunWithType() { |
| // If more complicated fixed size logic becomes necessary, it can be moved |
| // to the reducer class |
| int64_t in_block_size = Input(SEGMENT_GRADS).size_from_dim(1); |
| return DispatchHelper<typename ReducerGradient::FixedDispatch, IndexType>:: |
| call(this, in_block_size); |
| } |
| |
| template <typename IndexType, int FixedSize> |
| bool DoRunWithValue() { |
| auto& dataInput = Input(DATA_INPUT); |
| auto& segmentGradsInput = Input(SEGMENT_GRADS); |
| auto& lengthsInput = Input(LENGTHS); |
| |
| CAFFE_ENFORCE(lengthsInput.dim() == 1, "LENGTHS must be a vector"); |
| int64_t numSegments = lengthsInput.size(0); |
| CAFFE_ENFORCE(segmentGradsInput.dim() > 0); |
| CAFFE_ENFORCE(numSegments == segmentGradsInput.size(0)); |
| const TLengths* lengths = lengthsInput.template data<TLengths>(); |
| |
| typename ReducerGradient::Meta ctx(segmentGradsInput, 1); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) { |
| int aux_num = ReducerGradient::originalInputs()[i]; |
| auto& aux_in = Input(i); |
| auto* aux_grad = aux_num < OutputSize() ? Output(aux_num) : nullptr; |
| ctx.observeOriginalInput(aux_num, aux_in, aux_grad, 1); |
| } |
| |
| // Either the first dim of the data or how many indices we pull in from it
| int64_t dataToReduceSize; |
| const IndexType* indices = nullptr; |
| if (SparseFused) { // static if |
| auto& indicesInput = Input(INDICES); |
| indices = indicesInput.template data<IndexType>(); |
| dataToReduceSize = indicesInput.size(0); |
| } else { |
| dataToReduceSize = dataInput.size(0); |
| } |
| |
| const T* segmentGrads = segmentGradsInput.template data<T>(); |
| |
| vector<int64_t> shape; |
| shape.push_back(dataToReduceSize); |
| ctx.appendGradShape(&shape); |
| auto* dataGradsOutput = Output(0, shape, at::dtype<T>()); |
| |
| int64_t dataGradsBlockSize = dataGradsOutput->size_from_dim(1); |
| int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1); |
| T* dataGrads = dataGradsOutput->template mutable_data<T>(); |
| |
| const Tembedding* data = dataInput.template data<Tembedding>(); |
| int64_t dataIndex = 0; |
| for (const auto rangeIndex : c10::irange(numSegments)) { |
| ReducerGradient reducer( |
| ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_); |
| for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex]; |
| ++dataIndex) { |
| IndexType data_pos; |
| // No range checking, should've been verified in forward pass |
| if (SparseFused) { // static if |
| data_pos = indices[dataIndex]; |
| } else { |
| data_pos = dataIndex; |
| } |
| reducer.template fillGradWithMainInput<FixedSize>( |
| ctx, |
| data + dataGradsBlockSize * data_pos, |
| dataGrads + dataGradsBlockSize * dataIndex, |
| dataIndex, |
| &context_, |
| lengths[rangeIndex]); |
| } |
| } |
| return true; |
| } |
| |
| // Input layout: |
| // orig_arg1, orig_arg2, ..., orig_argN, SEGMENT_GRADS, LENGTHS, |
| // DATA_INPUT, [INDICES] |
| // orig_argXs represent original op's inputs and will be passed to the reducer |
| // directly |
| static constexpr int kNumInputs = ReducerGradient::originalInputs().size() + |
| 3 + (SparseFused ? 1 : 0) + (GradientNeedIndices ? 1 : 0); |
| enum _InputTags { |
| SEGMENT_GRADS = ReducerGradient::originalInputs().size(), |
| LENGTHS, |
| DATA_INPUT, |
| INDICES, |
| }; |
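| // Illustrative example (hypothetical reducer with one original input,
| // sparse fused, GradientNeedIndices == false): Input(0) = that original
| // input, Input(1) = SEGMENT_GRADS, Input(2) = LENGTHS, Input(3) = DATA_INPUT,
| // Input(4) = INDICES, so kNumInputs == 1 + 3 + 1 == 5.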
| }; |
| |
| // Version of gradient that requires the main input as well as the output of the |
| // forward op. |
| template <typename T, typename TLengths, class Context, class ReducerGradient> |
| class AbstractLengthsWithMainInputAndForwardOutputGradientOp |
| : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(AbstractLengthsWithMainInputAndForwardOutputGradientOp); |
| |
| bool RunOnDevice() override { |
| // If more complicated fixed size logic becomes necessary, it can be moved |
| // to the reducer class. |
| int64_t in_block_size = Input(SEGMENT_GRADS).size_from_dim(1); |
| return DispatchHelper<typename ReducerGradient::FixedDispatch>::call( |
| this, in_block_size); |
| } |
| |
| template <int FixedSize> |
| bool DoRunWithValue() { |
| auto& dataInput = Input(DATA_INPUT); |
| auto& segmentGradsInput = Input(SEGMENT_GRADS); |
| auto& lengthsInput = Input(LENGTHS); |
| auto& forwardOutputInput = Input(FORWARD_OUTPUT); |
| |
| CAFFE_ENFORCE(lengthsInput.dim() == 1, "LENGTHS must be a vector"); |
| int64_t numSegments = lengthsInput.size(0); |
| CAFFE_ENFORCE(segmentGradsInput.dim() > 0); |
| CAFFE_ENFORCE(numSegments == segmentGradsInput.size(0)); |
| const TLengths* lengths = lengthsInput.template data<TLengths>(); |
| |
| typename ReducerGradient::Meta ctx(segmentGradsInput, 1); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| for (int i = 0; i < ReducerGradient::originalInputs().size(); ++i) { |
| int aux_num = ReducerGradient::originalInputs()[i]; |
| auto& aux_in = Input(i); |
| auto* aux_grad = aux_num < OutputSize() ? Output(aux_num) : nullptr; |
| ctx.observeOriginalInput(aux_num, aux_in, aux_grad, 1); |
| } |
| |
| CAFFE_ENFORCE(forwardOutputInput.dim() > 0); |
| CAFFE_ENFORCE(numSegments == forwardOutputInput.size(0)); |
| const T* forwardOutput = forwardOutputInput.template data<T>(); |
| |
| int64_t dataToReduceSize = dataInput.size(0); |
| |
| const T* segmentGrads = segmentGradsInput.template data<T>(); |
| |
| vector<int64_t> shape; |
| shape.push_back(dataToReduceSize); |
| ctx.appendGradShape(&shape); |
| auto* dataGradsOutput = Output(0, shape, at::dtype<T>()); |
| |
| int64_t dataGradsBlockSize = dataGradsOutput->size_from_dim(1); |
| int64_t segmentBlockSize = segmentGradsInput.size_from_dim(1); |
| T* dataGrads = dataGradsOutput->template mutable_data<T>(); |
| |
| const T* data = dataInput.template data<T>(); |
| |
| int64_t dataIndex = 0; |
| for (const auto rangeIndex : c10::irange(numSegments)) { |
| ReducerGradient reducer( |
| ctx, segmentGrads + segmentBlockSize * rangeIndex, &context_); |
| for (int64_t start = dataIndex; dataIndex < start + lengths[rangeIndex]; |
| ++dataIndex) { |
| // No range checking, should've been verified in forward pass |
| reducer.template fillGradWithMainInputAndForwardOutput<FixedSize>( |
| ctx, |
| data + dataGradsBlockSize * dataIndex, |
| dataGrads + dataGradsBlockSize * dataIndex, |
| forwardOutput + segmentBlockSize * rangeIndex, |
| dataIndex, |
| &context_, |
| lengths[rangeIndex]); |
| } |
| } |
| return true; |
| } |
| |
| // Input layout: |
| // orig_arg1, orig_arg2, ..., orig_argN, FORWARD_OUTPUT, SEGMENT_GRADS, |
| // LENGTHS, DATA_INPUT |
| // orig_argXs represent original op's inputs and will be passed to the reducer |
| // directly |
| static constexpr int kNumInputs = |
| ReducerGradient::originalInputs().size() + 4; |
| enum _InputTags { |
| FORWARD_OUTPUT = ReducerGradient::originalInputs().size(), |
| SEGMENT_GRADS, |
| LENGTHS, |
| DATA_INPUT, |
| }; |
| }; |
| |
| // Base implementation of sparse/non-sparse gradient computation
| template < |
| typename ForwardOp, |
| typename ReducerDef, |
| typename ReducerGradient, |
| bool SparseFused, |
| bool GradientNeedIndices = false> |
| struct LengthsOpGetGradient : public GradientMakerBase { |
| using GradientMakerBase::GradientMakerBase; |
| vector<OperatorDef> GetGradientDefs() override { |
| vector<string> grad_ins; |
| string suffix = "Gradient"; |
| for (const int i : ReducerGradient::originalInputs()) { |
| grad_ins.push_back(I(i)); |
| } |
| if (ReducerGradient::requiresForwardOutput()) { |
| grad_ins.push_back(O(0)); |
| CAFFE_ENFORCE( |
| !SparseFused, |
| "Forward pass output not yet supported as input for backward pass " |
| "for SparseLengthsXXX operators"); |
| suffix = "AndForwardOutput" + suffix; |
| } |
| grad_ins.push_back(GO(0)); |
| grad_ins.push_back(I(ForwardOp::LENGTHS)); |
| bool indices_pushed = false; |
| if (ReducerGradient::requiresDataInput(Def())) { |
| grad_ins.push_back(I(0)); |
| if (SparseFused) { |
| grad_ins.push_back(I(ForwardOp::INDICES)); |
| indices_pushed = true; |
| } |
| suffix = "WithMainInput" + suffix; |
| } |
| if (GradientNeedIndices && !indices_pushed) { |
| if (SparseFused) { |
| grad_ins.push_back(I(ForwardOp::INDICES)); |
| } else { |
| // Hacky: using the main input as the indices; remove this once we have a
| // specialized CUDA LengthsIndicesInGradientSumGradient
| grad_ins.push_back(I(0)); |
| } |
| } |
| vector<string> grad_outs; |
| grad_outs.push_back({SparseFused ? GI_V(0) : GI(0)}); |
| int aux_grads = ReducerGradient::numAuxInputsWithGrads(Def()); |
| for (int i = 1; i <= aux_grads; ++i) { |
| grad_outs.push_back(GI(i)); |
| } |
| vector<OperatorDef> r{CreateOperatorDef( |
| string(SparseFused ? "SparseLengths" : "Lengths") + |
| string(GradientNeedIndices ? "IndicesInGradient" : "") + |
| ReducerDef::name + suffix, |
| "", |
| grad_ins, |
| grad_outs)}; |
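| // For example, with ReducerDef::name == "Sum" and a reducer that needs
| // neither the main input nor the forward output, this creates
| // "LengthsSumGradient" (dense) or "SparseLengthsSumGradient" (sparse fused);
| // the suffixes composed above extend the name analogously for the
| // WithMainInput and AndForwardOutput variants.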
| if (SparseFused) { |
| SetSparse(0, I(ForwardOp::INDICES), GI_V(0)); |
| } |
| return r; |
| } |
| }; |
| |
| template < |
| typename T, |
| typename SIndex, |
| typename Context, |
| typename ReducerDef, |
| bool GradientNeedIndices = false> |
| struct AbstractLengthsDef { |
| using OpDef = ReducerDef; |
| static constexpr const char* basename = "Lengths"; |
| static constexpr const char* doc = R"DOC( |
| Applies '{op}' to each segment of the input tensor. Segments are defined |
| by their *LENGTHS*. *LENGTHS* is a vector that maps each of the slices of |
| *DATA* to a particular segment. Values belonging to the same segment are |
| aggregated together and considered for the '{op}' operation. |
| |
| For example, *LENGTHS = [2, 1]* stands for segments *DATA[0..1]* and *DATA[2]*.
| |
| The sum of elements in *LENGTHS* must equal the number of elements in the first |
| dimension of *DATA*. The length of *OUTPUT* is equal to the number of input |
| segments, i.e. len(*LENGTHS*). |
| |
| {op_doc} |
| |
| {extra} |
| )DOC"; |
| static void PopulateSchema(OpSchema& schema) { |
| schema.Input(0, "DATA", "Input tensor, slices of which are aggregated."); |
| schema.Input( |
| Reducer::kInputCount, |
| "LENGTHS", |
| "Vector with the same sum of elements as the first dimension of DATA"); |
| schema.Output( |
| 0, |
| "OUTPUT", |
| "Aggregated output tensor. Has the first dimension of len(LENGTHS) "); |
| schema.TensorInferenceFunction( |
| [](const OperatorDef& def, const vector<TensorShape>& in) { |
| vector<TensorShape> out(0); |
| TensorShape output; |
| for (int d : in[Reducer::kInputCount].dims()) { |
| output.add_dims(d); |
| } |
| for (int j = 1; j < in[0].dims_size(); j++) { |
| output.add_dims(in[0].dims(j)); |
| } |
| output.set_data_type(in[0].data_type()); |
| out.push_back(output); |
| return out; |
| }); |
| ReducerDef::PopulateSchema(schema); |
| } |
| using Reducer = typename ReducerDef::template Reducer<T, Context>; |
| using ReducerGradient = |
| typename ReducerDef::template ReducerGradient<T, Context>; |
| using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer, false>; |
| using BackwardOp = |
| AbstractLengthsGradientOp<T, SIndex, Context, ReducerGradient>; |
| using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp< |
| T, |
| T, |
| SIndex, |
| Context, |
| ReducerGradient, |
| false>; |
| using WithMainInputAndForwardOutputBackwardOp = |
| AbstractLengthsWithMainInputAndForwardOutputGradientOp< |
| T, |
| SIndex, |
| Context, |
| ReducerGradient>; |
| using GetGradient = LengthsOpGetGradient< |
| ForwardOp, |
| ReducerDef, |
| ReducerGradient, |
| false /*SparseFused*/, |
| GradientNeedIndices>; |
| }; |
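| // A minimal reference sketch (not used by the operators above) of the dense
| // lengths reduction that AbstractLengthsDef describes, assuming <vector> is
| // available through the existing includes. It sums `block_size`-wide slices
| // of `data` into one output slice per entry of `lengths`; the real ops defer
| // the accumulation to the Reducer functor instead of hard-coding a sum.
| template <typename T, typename TLengths>
| inline std::vector<T> LengthsSumReferenceSketch(
|     const std::vector<T>& data,
|     const std::vector<TLengths>& lengths,
|     int64_t block_size) {
|   std::vector<T> out(lengths.size() * block_size, T(0));
|   int64_t dataIndex = 0;
|   for (size_t segment = 0; segment < lengths.size(); ++segment) {
|     for (TLengths j = 0; j < lengths[segment]; ++j, ++dataIndex) {
|       for (int64_t k = 0; k < block_size; ++k) {
|         out[segment * block_size + k] += data[dataIndex * block_size + k];
|       }
|     }
|   }
|   return out;
| }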
| |
| OpSchema::Cost CostInferenceForSparseLengths( |
| const OperatorDef& def, |
| const vector<TensorShape>& inputs, |
| bool use_weight); |
| |
| template < |
| typename T, |
| typename SIndex, |
| typename Context, |
| typename ReducerDef, |
| bool GradientNeedIndices = false> |
| struct AbstractSparseLengthsDef { |
| using OpDef = ReducerDef; |
| static constexpr const char* basename = "SparseLengths"; |
| static constexpr const char* doc = R"DOC( |
| Pulls in slices of the input tensor, groups them into segments and applies |
| '{op}' to each segment. Segments are defined by their LENGTHS. |
| |
| This op is basically Gather and Lengths{op} fused together. |
| |
| INDICES should contain integers in range 0..N-1 where N is the first dimension |
| of DATA. INDICES represent which slices of DATA need to be pulled in. |
| |
| LENGTHS is a vector that partitions the gathered slices into segments. Values
| belonging to the same segment are aggregated together. sum(LENGTHS) must
| equal the length of INDICES.
| |
| The first dimension of the output is equal to the number of input segments,
| i.e. `len(LENGTHS)`. Other dimensions are inherited from the input tensor. |
| |
| {op_doc} |
| )DOC"; |
| static void PopulateSchema(OpSchema& schema) { |
| schema.Input(0, "DATA", "Input tensor, slices of which are aggregated."); |
| schema.Input( |
| Reducer::kInputCount, |
| "INDICES", |
| "Integer vector containing indices of the first dimension of DATA for " |
| "the slices that are being aggregated"); |
| schema.Input( |
| Reducer::kInputCount + 1, |
| "LENGTHS", |
| "Non negative vector with sum of elements equal to INDICES length"); |
| schema.Output( |
| 0, |
| "OUTPUT", |
| "Aggregated output tensor. Has the first dimension of K " |
| "(the number of segments)."); |
| schema.TensorInferenceFunction(OpSchema::NeedsAllInputShapes( |
| [](const OperatorDef&, const std::vector<TensorShape>& input_types) { |
| std::vector<TensorShape> out(1); |
| out[0] = input_types[0]; |
| out[0].set_dims(0, input_types[Reducer::kInputCount + 1].dims(0)); |
| return out; |
| })); |
| ReducerDef::PopulateSchema(schema); |
| |
| schema.CostInferenceFunction( |
| [](const OperatorDef& def, |
| const vector<TensorShape>& inputs) -> OpSchema::Cost { |
| return CostInferenceForSparseLengths( |
| def, inputs, strcmp(OpDef::name, "WeightedSum") == 0); |
| }); |
| } |
| using Reducer = typename ReducerDef::template Reducer<T, Context>; |
| using ReducerGradient = |
| typename ReducerDef::template ReducerGradient<T, Context>; |
| using ForwardOp = AbstractLengthsOp<T, SIndex, Context, Reducer>; |
| // TODO(dzhulgakov): we're registering the same class twice here,
| // consider avoiding the duplication
| // Note: registering the 2-input version for now because of the naming in the
| // macro; the 3-input version is registered separately
| /* INDICES are not used in the CPU version, but they are needed in the async
|  * CUDA version. So we register the 3-input version for CPU as the gradient
|  * op used when converting between GPU and CPU, and also register the
|  * 2-input version for CPU for backward compatibility with older nets.
|  */
| using BackwardOp = AbstractLengthsGradientOp< |
| T, |
| SIndex, |
| Context, |
| ReducerGradient, |
| false /*GradientNeedIndices*/>; |
| using WithMainInputBackwardOp = AbstractLengthsWithMainInputGradientOp< |
| T, |
| T, |
| SIndex, |
| Context, |
| ReducerGradient>; |
| // Will return the 3-input version, which aligns new CPU and GPU nets.
| using GetGradient = LengthsOpGetGradient< |
| ForwardOp, |
| ReducerDef, |
| ReducerGradient, |
| true /*SparseFused*/, |
| GradientNeedIndices>; |
| }; |
| } // namespace caffe2 |
| |
| #endif // CAFFE2_OPERATORS_SEGMENT_REDUCTION_OP_H_ |