| #ifndef CAFFE2_OPERATORS_UTILITY_OPS_H_ |
| #define CAFFE2_OPERATORS_UTILITY_OPS_H_ |
| |
| #include <cmath> |
| #include <map> |
| #include <utility> |
| |
| #include "caffe2/core/common_omp.h" |
| #include "caffe2/core/context.h" |
| #include "caffe2/core/export_caffe2_op_to_c10.h" |
| #include <c10/util/irange.h> |
| #include "caffe2/core/logging.h" |
| #include "caffe2/core/operator.h" |
| #include "caffe2/core/types.h" |
| #include "caffe2/operators/gather_op.h" |
| #include "caffe2/utils/conversions.h" |
| #include "caffe2/utils/math.h" |
| |
| C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(GatherRangesOp); |
| C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(LengthsGatherOp); |
| |
| namespace caffe2 { |
| |
| template <class Context> |
| class NanCheckOp final : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| template <class... Args> |
| explicit NanCheckOp(Args&&... args) |
| : Operator<Context>(std::forward<Args>(args)...) {} |
| |
| bool RunOnDevice() override; |
| |
| private: |
| TensorPrinter tensorPrinter_; |
| Tensor scratch_; |
| }; |
| |
| struct GetNanCheckGradient : public GradientMakerBase { |
| using GradientMakerBase::GradientMakerBase; |
| std::vector<OperatorDef> GetGradientDefs() override { |
| return {CreateOperatorDef( |
| "NanCheck", |
| "", |
| std::vector<string>{GO(0)}, |
| std::vector<string>{GI(0)})}; |
| } |
| }; |
| |
| template <class Context> |
| class IsNanOp final : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| IsNanOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<Context>(operator_def, ws) {} |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<float, double>>::call(this, Input(0)); |
| } |
| |
| template <typename T> |
| bool DoRunWithType() { |
| auto& X = Input(0); |
| auto* Y = Output(0, X.sizes(), at::dtype<uint8_t>()); |
| const auto* X_data = X.template data<T>(); |
| uint8_t* Y_data = Y->template mutable_data<uint8_t>(); |
| for (const auto i : c10::irange(X.numel())) { |
| Y_data[i] = (uint8_t)(std::isnan(X_data[i])); |
| } |
| return true; |
| } |
| }; |
| |
| template <class Context> |
| class WallClockTimeOp final : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| |
| template <class... Args> |
| explicit WallClockTimeOp(Args&&... args) |
| : Operator<Context>(std::forward<Args>(args)...) {} |
| |
| bool RunOnDevice() override { |
| int64_t nanoseconds = static_cast<long int>( |
| std::chrono::duration_cast<std::chrono::nanoseconds>( |
| std::chrono::high_resolution_clock::now().time_since_epoch()) |
| .count()); |
| |
| TensorCPU* output = Output(0); |
| output->Resize(); |
| *output->template mutable_data<int64_t>() = nanoseconds; |
| |
| return true; |
| } |
| }; |
| |
| const char kPrintFileExtension[] = ".log"; |
| |
| template <class Context> |
| class PrintOp final : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_DISPATCH_HELPER; |
| explicit PrintOp(const OperatorDef& operator_def, Workspace* ws) |
| : Operator<Context>(operator_def, ws), |
| tensor_printer_( |
| operator_def.input(0), |
| this->template GetSingleArgument<int>("to_file", 0) |
| ? ws->RootFolder() + "/" + operator_def.input(0) + |
| kPrintFileExtension |
| : "", |
| this->template GetSingleArgument<int>("limit", 0)), |
| every_n_(this->template GetSingleArgument<int>("every_n", 1)) { |
| CAFFE_ENFORCE_GE(every_n_, 1); |
| } |
| |
| bool RunOnDevice() override { |
| if (++occurrences_mod_n_ > every_n_) { |
| occurrences_mod_n_ -= every_n_; |
| } |
| if (occurrences_mod_n_ != 1) { |
| return true; |
| } |
| |
| if (!this->InputIsTensorType(0, Context::GetDeviceType()) && |
| !this->InputIsTensorType(0, CPU)) { |
| LOG(INFO) << "Blob of type: " |
| << OperatorBase::Inputs().at(0)->meta().name(); |
| return true; |
| } |
| // special-case empty tensors since they may have no meta() |
| if (Input(0).numel() == 0) { |
| tensor_printer_.PrintMeta(Input(0)); |
| return true; |
| } |
| |
| using Types = TensorTypes< |
| float, |
| double, |
| int, |
| long, |
| bool, |
| char, |
| unsigned char, |
| std::string>; |
| |
| if (this->InputIsTensorType(0, CPU)) { |
| return DispatchHelper<Types>::call( |
| this, this->template Input<Tensor>(0, CPU)); |
| } else { |
| return DispatchHelper<Types>::call(this, Input(0)); |
| } |
| } |
| |
| private: |
| template <typename T> |
| bool DoRunWithType() { |
| // A simple strategy: copy the tensor to CPU if needed, and have the tensor
| // pointer point to the right instantiation. Note that tensor_copy_if_needed
| // handles memory deallocation itself, so no smart pointer is needed.
| const TensorCPU* tensor; |
| Tensor tensor_copy_if_needed(CPU); |
| if (this->InputIsTensorType(0, CPU)) { |
| tensor = &this->template Input<Tensor>(0, CPU); |
| } else { |
| // sync copy |
| tensor_copy_if_needed.CopyFrom(Input(0)); |
| tensor = &tensor_copy_if_needed; |
| } |
| tensor_printer_.Print<T>(*tensor); |
| return true; |
| } |
| |
| private: |
| TensorPrinter tensor_printer_; |
| int every_n_; |
| int occurrences_mod_n_{0}; |
| }; |
| |
| /** |
| * @brief Alias op makes the output and the input share the same underlying |
| * storage. |
| * |
| * WARNING: in general, in caffe2's operator interface different tensors should |
| * have different underlying storage, which is the assumption made by |
| * components such as the dependency engine and memory optimization. Thus, in |
| * normal situations you should not use the AliasOp, especially in a normal |
| * forward-backward pass. |
| * |
| * The Alias op is provided so one can achieve true asynchrony, such as
| * Hogwild, in a graph. But make sure you understand all the implications,
| * similar to those of multi-threaded computation, before you use it
| * explicitly.
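| *
| * Illustrative example (hypothetical blob names): if blob "Y" is produced by
| * Alias from blob "X", then Y and X share the same storage, so writing to
| * Y's data also modifies X.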
| */ |
| template <class Context> |
| class AliasOp final : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(AliasOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| CAFFE_ENFORCE_GE(input.numel(), 0, "Tensor is not initialized"); |
| OutputTensorAlias(0, input); |
| return true; |
| } |
| }; |
| |
| /** |
| * @brief Pass inputs to outputs. |
| * Input: |
| * DATA - dense tensor. |
| * Output: |
| * DATA - same tensor as input. |
| */ |
| template <class Context> |
| class EnsureDenseOp final : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(EnsureDenseOp) |
| |
| bool RunOnDevice() override { |
| const auto& input = Input(0); |
| auto* output = Output(0); |
| CAFFE_ENFORCE_GT(input.dim(), 0, "Input has to be at least a vector."); |
| // The output is allowed to overwrite the input in place, but it may also be
| // a separate tensor that the input is copied into.
| if (&input != output) { |
| output->ResizeLike(input); |
| output->CopyFrom(input, true /*async*/); |
| } |
| return true; |
| } |
| }; |
| |
| template <class Context> |
| class FlattenToVecOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(FlattenToVecOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| auto* output = Output(0); |
| CAFFE_ENFORCE_GE(input.dim(), 1, "The rank of the tensor must be >= 1."); |
| output->Resize(input.numel()); |
| |
| context_.CopyItemsSameDevice( |
| input.dtype(), |
| input.numel(), |
| input.raw_data(), |
| output->raw_mutable_data(input.dtype())); |
| return true; |
| } |
| }; |
| |
| // Output gets the data of input(0), but reshapes it like input(1). |
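| // Illustrative example (hypothetical shapes): if input(0) has shape (6,) and
| // input(1) has shape (2, 3), the output has shape (2, 3) and holds the six
| // elements of input(0).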
| template <class Context> |
| class ResizeLikeOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(ResizeLikeOp); |
| |
| bool RunOnDevice() override { |
| auto& input0 = Input(0); |
| auto& input1 = Input(1); |
| auto* output = Output(0); |
| CAFFE_ENFORCE_EQ(input0.numel(), input1.numel()); |
| output->ResizeLike(Input(1)); |
| context_.CopyItemsSameDevice( |
| input0.dtype(), |
| input0.numel(), |
| input0.raw_data(), |
| output->raw_mutable_data(input0.dtype())); |
| return true; |
| } |
| }; |
| |
| template <class Context> |
| class SumOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(SumOp); |
| |
| template <typename T> |
| bool DoRunWithType() { |
| auto& input0 = Input(0); |
| |
| if (InputSize() == 1) { |
| // TODO: better TensorOptions argument passing(e.g. default argument) |
| OutputTensorCopyFrom( |
| 0, |
| // I'll change the order of arguments in another diff, so that we don't
| // need to write this
| at::dtype(input0.dtype()), |
| input0, |
| true /*async*/); |
| return true; |
| } |
| auto* output = Output(0, input0.sizes(), at::dtype<T>()); |
| T* output_data = output->template mutable_data<T>(); |
| // Dimension checking |
| for (const auto i : c10::irange(1, InputSize())) { |
| if (output->sizes() != Input(i).sizes()) { |
| CAFFE_THROW( |
| "Check failed: output->sizes() == Input(i).sizes().", |
| "Description: Input #", |
| i, |
| ", input dimension:", |
| Input(i).sizes(), |
| " should match output dimension: ", |
| output->sizes()); |
| } |
| } |
| |
| // Add the first two - works if in-place or not. |
| math::Add( |
| output->numel(), |
| input0.template data<T>(), |
| Input(1).template data<T>(), |
| output_data, |
| &context_); |
| // Add remaining. |
| for (const auto i : c10::irange(2, InputSize())) { |
| math::Add( |
| output->numel(), |
| output_data, |
| Input(i).template data<T>(), |
| output_data, |
| &context_); |
| } |
| return true; |
| } |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<float, double, int32_t, int64_t>>::call( |
| this, Input(0)); |
| } |
| }; |
| |
| inline OpSchema::Cost CostInferenceForSum( |
| const OperatorDef& def, |
| const std::vector<TensorShape>& in) { |
| struct OpSchema::Cost cost = PointwiseCostInference<1>(def, in); |
| cost.flops *= (in.size() - 1); |
| cost.params_bytes = 0; |
| return cost; |
| } |
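|
| // For instance, with the inference above, summing k input tensors of N
| // elements each is counted as roughly (k - 1) * N flops and zero parameter
| // bytes.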
| |
| // WeightedSumOp computes the weighted sum of several tensors. The input should
| // be in the form X_0, weight_0, X_1, weight_1, ..., where the X_i all have the
| // same shape and each weight_i is a size-1 tensor that specifies the weight of
| // the corresponding X_i. Note that if one wants to do in-place computation, it
| // can only be done with X_0 as the output, not any other X_i.
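| // For example (hypothetical values), given inputs X_0, w_0, X_1, w_1 the
| // output is Y = w_0 * X_0 + w_1 * X_1, computed element-wise; additional
| // (X_i, w_i) pairs are accumulated in the same way.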
| template <class Context> |
| class WeightedSumOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(WeightedSumOp); |
| |
| bool RunOnDevice() override; |
| |
| template <typename T> |
| bool DoRunWithType() { |
| // the code is written this way because of a CUDA 10.1 + gcc 7.3.1 compiler
| // bug, as discussed at
| // https://devtalk.nvidia.com/default/topic/1048037/linux/cuda-10-1-nvidia-you-re-now-quot-fixing-quot-gcc-bugs-that-gcc-doesn-t-even-have/
| const int input_size = (*this).InputSize(); |
| CAFFE_ENFORCE_EQ(input_size % 2, 0); |
| const auto& X0 = Input(0); |
| const auto& weight0 = Input(1); |
| CAFFE_ENFORCE_EQ(weight0.numel(), 1); |
| const int size = X0.numel(); |
| // Note: the aliasing check was removed, since Output already has
| // caching capability
| auto* Y = Output(0, X0.sizes(), at::dtype<T>()); |
| T* Y_data = Y->template mutable_data<T>(); |
| if (X0.numel() == 0) { |
| return true; |
| } |
| CAFFE_ENFORCE_GT(X0.numel(), 0); |
| if (input_size == 2) { |
| math::Scale<float, T>( |
| size, |
| weight0.template data<float>(), |
| X0.template data<T>(), |
| Y_data, |
| &context_); |
| return true; |
| } |
| const auto& X1 = Input(2); |
| CAFFE_ENFORCE( |
| !IsInputOutputAlias(2, 0), |
| "Input #2 is the same as output. If you want to do in-place updates, " |
| "put the output as input #0."); |
| const auto& weight1 = Input(3); |
| CAFFE_ENFORCE_EQ(X1.numel(), size); |
| CAFFE_ENFORCE_EQ(weight1.numel(), 1); |
| if (!IsInputOutputAlias(0, 0)) { |
| context_.template CopySameDevice<T>(size, X0.template data<T>(), Y_data); |
| } |
| math::Axpby<float, T, Context>( |
| size, |
| weight1.template data<float>(), |
| X1.template data<T>(), |
| weight0.template data<float>(), |
| Y_data, |
| &context_); |
| for (int i = 4; i < input_size; i += 2) { |
| const auto& Xi = Input(i); |
| // Do a check: if the input is the same as the output, we have a problem:
| // in-place updates should only ever happen via the zeroth input.
| const std::string err_msg = "Input #" + to_string(i) + |
| " is the same as output. If you want to do in-place updates, " |
| "put the output as input #0."; |
| CAFFE_ENFORCE(!IsInputOutputAlias(i, 0), err_msg); |
| const auto& weighti = Input(i + 1); |
| CAFFE_ENFORCE_EQ(Xi.numel(), size); |
| CAFFE_ENFORCE_EQ(weighti.numel(), 1); |
| math::Axpy<float, T, Context>( |
| size, |
| weighti.template data<float>(), |
| Xi.template data<T>(), |
| Y_data, |
| &context_); |
| } |
| return true; |
| } |
| }; |
| |
| template <class Context> |
| class WeightedSumGradientOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| |
| template <class... Args> |
| explicit WeightedSumGradientOp(Args&&... args) |
| : Operator<Context>(std::forward<Args>(args)...), |
| grad_on_w_(this->template GetSingleArgument<bool>("grad_on_w", false)) { |
| } |
| |
| template <typename DstType> |
| bool DoRunWithType() { |
| CAFFE_ENFORCE_EQ(InputSize() % 2, 1); |
| auto output_size = grad_on_w_ ? InputSize() - 1 : InputSize() / 2; |
| CAFFE_ENFORCE_EQ(OutputSize(), output_size); |
| |
| auto& dY = Input(0); |
| const auto* dY_data = dY.template data<DstType>(); |
| int size = dY.numel(); |
| |
| // The input size should be the input size of the forward op plus 1 |
| for (int i = 0; i < InputSize() / 2; i++) { |
| auto& cur_w = Input(2 * i + 2); |
| CAFFE_ENFORCE_EQ(cur_w.numel(), 1); |
| |
| auto* cur_dX = Output(i, dY.sizes(), at::dtype<DstType>()); |
| |
| math::Scale<float, DstType, Context>( |
| size, |
| cur_w.template data<float>(), |
| dY_data, |
| cur_dX->template mutable_data<DstType>(), |
| &context_); |
| |
| if (grad_on_w_) { |
| auto& cur_X = Input(2 * i + 1); |
| CAFFE_ENFORCE_EQ(cur_X.numel(), size); |
| auto* cur_dw = Output(i + output_size / 2); |
| cur_dw->Resize(1); |
| math::Dot<DstType, Context>( |
| size, |
| dY_data, |
| cur_X.template data<DstType>(), |
| cur_dw->template mutable_data<float>(), |
| &context_); |
| } |
| } |
| |
| return true; |
| } |
| |
| bool RunOnDevice() override; |
| |
| private: |
| bool grad_on_w_; |
| }; |
| |
| /** |
| * @brief Update slices of the tensor in-place with weighted sum. |
| * |
| * ScatterWeightedSumOp is similar to WeightedSum and computes the weighted sum
| * of several tensors. The first tensor is updated in place, and only the
| * slices of it along the first dimension indexed by INDICES are updated.
| * |
| * Input: |
| * X_0 - tensor to be updated |
| * weight_0 - scalar weight for X_0, applied only to slices affected, |
| * INDICES - 1-D list of indices on the first dimension of X_0 that need to be |
| * updated |
| * X_1 - update slices, has to have shape of len(INDICES) + shape(X_0)[1:] |
| * weight_1 - scalar weight for X_1 update |
| * X_2, weight_2, ... |
| * |
| * Output: |
| * X_0 - has to be exactly the same tensor as the input 0 |
| * |
| * Note: The op pretty much ignores the exact shapes of the input arguments and
| * cares only about sizes. This is done for performance reasons, to avoid
| * unnecessary reshapes. Only the first dimension of X_0 is important; let's
| * call it N. If M is the total size of X_0 and K is the size of INDICES, then
| * each X_i is assumed to be of shape K x (M / N) regardless of its real shape.
| *
| * Note: Each update in INDICES is applied independently, which means that if
| * duplicated elements are present in INDICES, the corresponding slice of X_0
| * will be scaled multiple times. Collapse INDICES manually beforehand if
| * necessary.
| *
| * Note: Updates are applied sequentially, input by input, which might have
| * undesired consequences if the input tensor is accessed concurrently by
| * different ops (e.g. when doing Hogwild). Other threads might see
| * intermediate results even at the individual-slice level, e.g. X_0 scaled by
| * weight_0 but without any updates applied yet.
| *
| * For now this op really works only on CPU because of the INDICES access.
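| *
| * Illustrative example (hypothetical values): with X_0 of shape (4, 2),
| * weight_0 = 0.5, INDICES = [1, 3], X_1 of shape (2, 2) and weight_1 = 2.0,
| * only rows 1 and 3 of X_0 are updated:
| *   X_0[INDICES[k]] = 0.5 * X_0[INDICES[k]] + 2.0 * X_1[k]   for k = 0, 1.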
| */ |
| template <class Context> |
| class ScatterWeightedSumOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(ScatterWeightedSumOp); |
| USE_DISPATCH_HELPER; |
| |
| bool RunOnDevice() override { |
| const auto& x0 = Input(0); |
| const auto x0Type = TypeMetaToDataType(x0.dtype()); |
| if (x0Type == TensorProto_DataType_FLOAT) { |
| return ScatterWeightedSumOp::template DoRun<float>(); |
| } |
| if (x0Type == TensorProto_DataType_DOUBLE) { |
| return ScatterWeightedSumOp::template DoRun<double>(); |
| } |
| CAFFE_THROW("Unsupported type of tensor X_0: ", x0.dtype().name()); |
| } |
| |
| private: |
| template<typename T> |
| bool DoRun() { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>, T>::call(this, Input(2)); |
| } |
| template <typename T, typename Index> |
| bool DoRunWithType() { |
| int64_t block_size = Input(0).size_from_dim(1); |
| return DispatchHelper<FixedValues<1>, T, Index>::call(this, block_size); |
| } |
| |
| template <typename T, typename Index, int FixedSize> |
| bool DoRunWithValue() { |
| CAFFE_ENFORCE_EQ(InputSize() % 2, 1); |
| auto& X0 = Input(0); |
| auto& weight0 = Input(1); |
| auto& indices = Input(2); |
| auto* output = Output(0); |
| CAFFE_ENFORCE_EQ(&X0, output, "In-place operation is required");
| |
| if (X0.numel() == 0) { |
| return true; |
| } |
| CAFFE_ENFORCE_GT(X0.numel(), 0); |
| CAFFE_ENFORCE_GT(X0.dim(), 0, "X0 has to be at least a vector");
| CAFFE_ENFORCE_EQ(weight0.numel(), 1); |
| int64_t M = X0.numel(); |
| int64_t N = X0.size(0); |
| int64_t K = indices.numel(); |
| int64_t block_size = M / N; |
| T* data = output->template mutable_data<T>(); |
| const Index* idxs = indices.template data<Index>(); |
| float w0 = *weight0.template data<float>(); |
| // It's most likely a constant so exact comparison is fine |
| if (w0 != 1.0) { |
| for (const auto i : c10::irange(K)) { |
| Index idx = idxs[i]; |
| CAFFE_ENFORCE( |
| 0 <= idx && idx < N, |
| "Index out of bounds: ", |
| idx, |
| ", range 0 to ", |
| N); |
| math::ScaleFixedSize<T, Context, FixedSize>( |
| block_size, |
| w0, |
| data + block_size * idx, |
| data + block_size * idx, |
| &context_); |
| } |
| } |
| for (int inp = 3; inp < InputSize(); inp += 2) { |
| auto& X = Input(inp); |
| auto& weight = Input(inp + 1); |
| CAFFE_ENFORCE_EQ(X.numel(), block_size * K); |
| CAFFE_ENFORCE_EQ(weight.numel(), 1); |
| const T* x_data = X.template data<T>(); |
| float w = *weight.template data<float>(); |
| for (const auto i : c10::irange(K)) { |
| Index idx = idxs[i]; |
| // double-checking the indices, but it's fine as it's DCHECK only |
| DCHECK(0 <= idx && idx < N) |
| << "Index out of bounds: " << idx << ", range 0 to " << N; |
| math::AxpyFixedSize<T, Context, FixedSize>( |
| block_size, |
| w, |
| x_data + block_size * i, |
| data + block_size * idx, |
| &context_); |
| } |
| } |
| return true; |
| } |
| Tensor x_data_host_; |
| Tensor weights_host_; |
| Tensor x_data_device_; |
| Tensor weights_device_; |
| }; |
| |
| /** |
| * @brief Update slices of the tensor in-place by overriding. |
| * |
| * Input: |
| * DATA - tensor to be updated |
| * INDICES - 1-D list of indices on the first dimension of DATA that need to
| *     be updated
| * SLICES - update slices, has to have shape len(INDICES) + shape(DATA)[1:]
| * |
| * Output: |
| * DATA - has to be exactly the same tensor as the input 0 |
| * |
| * Note: The op pretty much ignores the exact shapes of the input arguments and
| * cares only about sizes. This is done for performance reasons, to avoid
| * unnecessary reshapes. Only the first dimension of DATA is important; let's
| * call it N. If M is the total size of DATA and K is the size of INDICES, then
| * SLICES is assumed to be of shape K x (M / N) regardless of its real shape.
| *
| * Note: Each update in INDICES is applied independently, which means that if
| * duplicated elements are present in INDICES, an arbitrary one will win.
| *
| * For now this op really works only on CPU because of the INDICES access.
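| *
| * Illustrative example (hypothetical values): with DATA of shape (4, 2),
| * INDICES = [0, 2] and SLICES of shape (2, 2), rows 0 and 2 of DATA are
| * overwritten by SLICES[0] and SLICES[1] respectively.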
| */ |
| template <class Context> |
| class ScatterAssignOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| virtual ~ScatterAssignOp() {} |
| |
| template <class... Args> |
| explicit ScatterAssignOp(Args&&... args) |
| : Operator<Context>(std::forward<Args>(args)...), |
| runners_({{{TensorProto_DataType_INT32, TensorProto_DataType_FLOAT}, |
| &ScatterAssignOp::DoRun<int32_t, float>}, |
| {{TensorProto_DataType_INT32, TensorProto_DataType_FLOAT16}, |
| &ScatterAssignOp::DoRun<int32_t, at::Half>}, |
| {{TensorProto_DataType_INT32, TensorProto_DataType_UINT8}, |
| &ScatterAssignOp::DoRun<int32_t, uint8_t>}, |
| {{TensorProto_DataType_INT32, TensorProto_DataType_INT32}, |
| &ScatterAssignOp::DoRun<int32_t, int32_t>}, |
| {{TensorProto_DataType_INT32, TensorProto_DataType_INT64}, |
| &ScatterAssignOp::DoRun<int32_t, int64_t>}, |
| {{TensorProto_DataType_INT32, TensorProto_DataType_DOUBLE}, |
| &ScatterAssignOp::DoRun<int32_t, double>}, |
| {{TensorProto_DataType_INT64, TensorProto_DataType_FLOAT}, |
| &ScatterAssignOp::DoRun<int64_t, float>}, |
| {{TensorProto_DataType_INT64, TensorProto_DataType_FLOAT16}, |
| &ScatterAssignOp::DoRun<int64_t, at::Half>}, |
| {{TensorProto_DataType_INT64, TensorProto_DataType_UINT8}, |
| &ScatterAssignOp::DoRun<int64_t, uint8_t>}, |
| {{TensorProto_DataType_INT64, TensorProto_DataType_INT32}, |
| &ScatterAssignOp::DoRun<int64_t, int32_t>}, |
| {{TensorProto_DataType_INT64, TensorProto_DataType_INT64}, |
| &ScatterAssignOp::DoRun<int64_t, int64_t>}, |
| {{TensorProto_DataType_INT64, TensorProto_DataType_DOUBLE}, |
| &ScatterAssignOp::DoRun<int64_t, double>}}) {} |
| |
| bool RunOnDevice() override { |
| const auto& data = Input(DATA); |
| const auto& slices = Input(SLICES); |
| auto& indices = Input(INDICES); |
| |
| const auto dataType = TypeMetaToDataType(data.dtype()); |
| const auto slicesType = TypeMetaToDataType(slices.dtype()); |
| const auto indicesType = TypeMetaToDataType(indices.dtype()); |
| C10_UNUSED auto* output = Output(0); |
| |
| auto runner = GetRunner(dataType, slicesType, indicesType); |
| (this->*runner)(); |
| return true; |
| } |
| |
| private: |
| typedef void (ScatterAssignOp::*RunnerType)(); |
| typedef std:: |
| map<std::pair<TensorProto_DataType, TensorProto_DataType>, RunnerType> |
| RunnerMap; |
| |
| RunnerMap runners_; |
| |
| RunnerType GetRunner( |
| const TensorProto_DataType dataType, |
| const TensorProto_DataType slicesType, |
| const TensorProto_DataType indicesType) { |
| CAFFE_ENFORCE_EQ(dataType, slicesType, "Data and slice types must match"); |
| auto it = runners_.find({indicesType, dataType}); |
| CAFFE_ENFORCE( |
| it != runners_.end(), |
| "Could not find the runner corresponding to indicesType, dataType = ", |
| indicesType, |
| " ", |
| dataType); |
| return it->second; |
| } |
| |
| template <typename Index, typename T> |
| void DoRun() { |
| auto& input = Input(DATA); |
| auto& indices = Input(INDICES); |
| auto& slices = Input(SLICES); |
| auto* output = Output(0); |
| CAFFE_ENFORCE_EQ(&input, output, "In-place operation is required");
| |
| CAFFE_ENFORCE_GT(input.dim(), 0, "DATA has to be at least a vector");
| int64_t M = input.numel(); |
| int64_t N = input.size(0); |
| int64_t K = indices.numel(); |
| int64_t block_size = M / N; |
| CAFFE_ENFORCE_EQ(slices.numel(), block_size * K); |
| // TODO(dzhulgakov): it can be made to work with arbitrary data type by |
| // using raw_mutable_data |
| T* data = output->template mutable_data<T>(); |
| const Index* idxs = indices.template data<Index>(); |
| const T* slicesData = slices.template data<T>(); |
| DoScatterAssign(data, idxs, slicesData, N, K, block_size); |
| } |
| |
| template <typename Index, typename T> |
| void DoScatterAssign( |
| T* data, |
| const Index* idxs, |
| const T* slicesData, |
| int64_t N, |
| int64_t K, |
| int64_t block_size) { |
| for (const auto i : c10::irange(K)) { |
| Index idx = idxs[i]; |
| // double-checking the indices, but it's fine as it's DCHECK only |
| DCHECK(0 <= idx && idx < N) |
| << "Index out of bounds: " << idx << ", range 0 to " << N; |
| context_.template CopySameDevice<T>( |
| block_size, slicesData + block_size * i, data + block_size * idx); |
| } |
| } |
| |
| INPUT_TAGS(DATA, INDICES, SLICES); |
| }; |
| |
| template <class Context> |
| class ScatterOp : public Operator<CPUContext> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| |
| template <class... Args> |
| explicit ScatterOp(Args&&... args) |
| : Operator<CPUContext>(std::forward<Args>(args)...), |
| OP_SINGLE_ARG(int, "axis", axis_, 1) {} |
| |
| ~ScatterOp() noexcept override {} |
| |
| bool RunOnDevice() override { |
| TORCH_CHECK( |
| Context::GetDeviceType() == kCPU, |
| "ScatterOp currently only supports CPU.") |
| |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call( |
| this, this->template Input<Tensor>(INDICES, CPU)); |
| } |
| |
| template <typename IndexType> |
| bool DoRunWithType() { |
| const Tensor& data = Input(DATA); |
| const Tensor& indices = Input(INDICES); |
| const Tensor& updates = Input(UPDATES); |
| const TypeMeta dataType = data.dtype(); |
| size_t item_bytesize = dataType.itemsize(); |
| |
| // ONNX allows negative axis to index from the back, valid range: [-r, r]. |
| axis_ = data.canonical_axis_index(axis_); |
| |
| CAFFE_ENFORCE_GE( |
| data.dim(), axis_ + 1, "DATA should be at least [axis+1]-D"); |
| CAFFE_ENFORCE_GE(axis_, 0, "Axis should be non-negative"); |
| CAFFE_ENFORCE_LT(axis_, data.dim(), "Axis out of range"); |
| |
| Tensor* output = Output(0, data.sizes().vec(), at::dtype(dataType)); |
| output->CopyFrom(data); |
| char* out = static_cast<char*>(output->raw_mutable_data(dataType)); |
| |
| // Succeed if the size of the output is zero, which can happen for an empty
| // batch whose data dimension has size 0.
| // This *must* be done AFTER output->raw_mutable_data() above, as that call
| // has an important allocation side effect that we must see.
| if (output->numel() == 0) { |
| return true; |
| } |
| |
| const IndexType* idxs = indices.template data<IndexType>(); |
| const char* src_base = static_cast<const char*>(updates.raw_data()); |
| |
| const int64_t outer_dims_product = indices.size_to_dim(axis_); |
| |
| const int64_t dst_indexing_axis_dim = data.size(axis_); |
| |
| const int64_t idxs_block_size = indices.size_from_dim(axis_ + 1); |
| const int64_t src_block_size = updates.size_from_dim(axis_ + 1); |
| const int64_t dst_block_size = data.size_from_dim(axis_ + 1); |
| |
| const int64_t idxs_batch_size = indices.size_from_dim(axis_); |
| const int64_t src_batch_size = updates.size_from_dim(axis_); |
| const int64_t dst_batch_size = data.size_from_dim(axis_); |
| |
| const int64_t N = indices.size(axis_); |
| |
| check_indexarray_range<IndexType>(idxs, N, dst_indexing_axis_dim); |
| |
| // For a 3-D tensor, dst is updated as:
| //    dst[i][idxs[i][j][k]][k] = src[i][j][k]  # if axis == 1
| // where i, j, k iterate over their corresponding axes I, J, K.
| // For a given (i, j, k) tuple:
| //   the idxs offset is i * J_src * K + j * K + k,
| //   the src offset is i * J_src * K + j * K + k,
| //   the dst offset is i * J_dst * K + idxs[idxs_offset] * K + k.
| // Note that idxs and src should have the same rank and shape.
| // dst should have the same rank as idxs and src, but its size along the
| // scatter axis can differ. That is why the equations above distinguish
| // J_src from J_dst.
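| // Illustrative example (hypothetical shapes): with axis == 1, DATA of shape
| // (2, 4, 3) and INDICES and UPDATES of shape (2, 2, 3), the element
| // UPDATES[i][j][k] is written to OUTPUT[i][INDICES[i][j][k]][k].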
| for (const auto outer_batch : c10::irange(outer_dims_product)) { |
| for (const auto i : c10::irange(N)) { |
| for (const auto inner_batch : c10::irange(idxs_block_size)) { |
| auto idxs_elem_idx = |
| outer_batch * idxs_batch_size + i * idxs_block_size + inner_batch; |
| auto src_elem_idx = |
| outer_batch * src_batch_size + i * src_block_size + inner_batch; |
| auto dst_elem_idx = outer_batch * dst_batch_size + |
| idxs[idxs_elem_idx] * dst_block_size + inner_batch; |
| |
| auto src = src_base + src_elem_idx * item_bytesize; |
| auto dst = out + dst_elem_idx * item_bytesize; |
| context_.CopyItemsSameDevice(dataType, 1, src, dst); |
| } |
| } |
| } |
| return true; |
| } |
| |
| INPUT_TAGS(DATA, INDICES, UPDATES); |
| |
| // Check that indices fall within dimension array size with CAFFE_ENFORCE. |
| template <typename IndexType> |
| static void check_indexarray_range( |
| const IndexType* indices, |
| int64_t n, |
| IndexType indexing_axis_dim) { |
| for (const auto i : c10::irange(n)) { |
| auto idx = indices[i]; |
| CAFFE_ENFORCE( |
| 0 <= idx && idx < indexing_axis_dim, |
| "INDICES element is out of DATA bounds, id=", |
| idx, |
| " axis_dim=", |
| indexing_axis_dim); |
| } |
| } |
| |
| protected: |
| int axis_; |
| }; |
| |
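| // LengthsToSegmentIdsOp expands a vector of lengths into per-element segment
| // ids. Illustrative example (hypothetical values): LENGTHS = [2, 0, 3]
| // produces SEGMENT_IDS = [0, 0, 2, 2, 2].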
| template <class Context> |
| class LengthsToSegmentIdsOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(LengthsToSegmentIdsOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| auto* output = Output(0); |
| auto* input_data = input.template data<int32_t>(); |
| |
| CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector."); |
| auto total_length = |
| std::accumulate(input_data, input_data + input.numel(), 0); |
| |
| output->Resize(total_length); |
| auto* output_data = output->template mutable_data<int32_t>(); |
| |
| for (const auto i : c10::irange(input.numel())) { |
| auto len = input_data[i]; |
| std::fill(output_data, output_data + len, i); |
| output_data += len; |
| } |
| return true; |
| } |
| }; |
| |
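| // LengthsToRangesOp converts a vector of lengths into (offset, length) pairs.
| // Illustrative example (hypothetical values): LENGTHS = [2, 0, 3] produces
| // RANGES = [[0, 2], [2, 0], [2, 3]].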
| template <class Context> |
| class LengthsToRangesOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(LengthsToRangesOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| auto* output = Output(0); |
| auto* input_data = input.template data<int32_t>(); |
| |
| CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector."); |
| auto size = input.numel(); |
| |
| output->Resize(size, 2); |
| auto* output_data = output->template mutable_data<int32_t>(); |
| |
| int32_t offset = 0; |
| for (const auto i : c10::irange(size)) { |
| auto len = input_data[i]; |
| output_data[i * 2] = offset; |
| output_data[i * 2 + 1] = len; |
| offset += len; |
| } |
| return true; |
| } |
| }; |
| |
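| // LengthsToOffsetsOp converts a vector of lengths into a vector of starting
| // offsets. Illustrative example (hypothetical values): LENGTHS = [2, 0, 3]
| // produces OFFSETS = [0, 2, 2], or [0, 2, 2, 5] when include_last_offset is
| // set.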
| template <class Context> |
| class LengthsToOffsetsOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| |
| template <class... Args> |
| explicit LengthsToOffsetsOp(Args&&... args) |
| : Operator<Context>(std::forward<Args>(args)...), |
| include_last_offset_(this->template GetSingleArgument<bool>( |
| "include_last_offset", |
| false)) {} |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| auto* output = Output(0); |
| auto* input_data = input.template data<int32_t>(); |
| |
| CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector."); |
| auto size = input.numel(); |
| |
| output->Resize(size + (include_last_offset_ ? 1 : 0)); |
| auto* output_data = output->template mutable_data<int32_t>(); |
| |
| int32_t offset = 0; |
| for (const auto i : c10::irange(size)) { |
| auto len = input_data[i]; |
| output_data[i] = offset; |
| offset += len; |
| } |
| if (include_last_offset_) { |
| output_data[size] = offset; |
| } |
| return true; |
| } |
| |
| private: |
| bool include_last_offset_; |
| }; |
| |
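| // SegmentIdsToLengthsOp inverts LengthsToSegmentIds: it counts how many
| // elements belong to each segment. Illustrative example (hypothetical
| // values): SEGMENT_IDS = [0, 0, 2, 2, 2] produces LENGTHS = [2, 0, 3].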
| template <class Context> |
| class SegmentIdsToLengthsOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(SegmentIdsToLengthsOp); |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0)); |
| } |
| |
| template <typename Index> |
| bool DoRunWithType() { |
| auto& input = Input(0); |
| if (input.dim() == 2) { |
| CAFFE_ENFORCE( |
| input.dim32(0) == 1 || input.dim32(1) == 1, |
| "Input must be a vector."); |
| } else { |
| CAFFE_ENFORCE_EQ(input.dim(), 1, "Input must be a vector."); |
| } |
| auto* input_data = input.template data<Index>(); |
| auto input_size = input.numel(); |
| auto* output = Output(0); |
| // segment id starts from 0 |
| auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0; |
| if (InputSize() > 1) { |
| CAFFE_ENFORCE_GE(Input(1).dim(), 1); |
| CAFFE_ENFORCE_LE( |
| num_segments, |
| Input(1).size(0), |
| "The number of segments inferred should *NOT* be larger " |
| "than the size of Input(1)'s first dimension"); |
| num_segments = Input(1).size(0); |
| } |
| CAFFE_ENFORCE(0 <= num_segments, "Indices must be in 0..K-1 range"); |
| output->Resize(num_segments); |
| auto* output_data = output->template mutable_data<int32_t>(); |
| if (num_segments == 0) { |
| return true; |
| } |
| std::fill(output_data, output_data + num_segments, 0); |
| Index prev = 0; // Assume that segment_id >= 0. |
| for (const auto i : c10::irange(input_size)) { |
| CAFFE_ENFORCE( |
| prev <= input_data[i], |
| "Segment ids must be sorted: ", |
| prev, |
| " vs ", |
| input_data[i]); |
| prev = input_data[i]; |
| output_data[input_data[i]] += 1; |
| } |
| |
| return true; |
| } |
| }; |
| |
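| // SegmentIdsToRangesOp converts sorted segment ids into (offset, length)
| // pairs. Illustrative example (hypothetical values):
| // SEGMENT_IDS = [0, 0, 2, 2, 2] produces RANGES = [[0, 2], [2, 0], [2, 3]].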
| template <class Context> |
| class SegmentIdsToRangesOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(SegmentIdsToRangesOp); |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0)); |
| } |
| |
| template <typename Index> |
| bool DoRunWithType() { |
| auto& input = Input(0); |
| CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector."); |
| auto* input_data = input.template data<Index>(); |
| auto input_size = input.numel(); |
| auto* output = Output(0); |
| // segment id starts from 0 |
| auto num_segments = input_size ? input_data[input_size - 1] + 1 : 0; |
| if (InputSize() > 1) { |
| CAFFE_ENFORCE_GE(Input(1).dim(), 1); |
| CAFFE_ENFORCE_LE( |
| num_segments, |
| Input(1).size(0), |
| "The number of segments inferred should *NOT* be larger " |
| "than the size of Input(1)'s first dimension"); |
| num_segments = Input(1).size(0); |
| } |
| CAFFE_ENFORCE(0 <= num_segments, "Indices must be in 0..K-1 range"); |
| output->Resize(num_segments, 2); |
| auto* output_data = output->template mutable_data<int32_t>(); |
| if (num_segments == 0) { |
| return true; |
| } |
| std::fill(output_data, output_data + num_segments * 2, 0); |
| Index prev = input_data[0]; |
| for (const auto i : c10::irange(input_size)) { |
| CAFFE_ENFORCE( |
| prev <= input_data[i], |
| "Segment ids must be sorted: ", |
| prev, |
| " vs ", |
| input_data[i]); |
| while (prev != input_data[i]) { |
| ++prev; |
| output_data[prev * 2] = i; |
| } |
| output_data[input_data[i] * 2 + 1] += 1; |
| } |
| |
| return true; |
| } |
| }; |
| |
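| // LengthsToWeightsOp expands each length into that many copies of
| // 1 / pow(length, power). Illustrative example (hypothetical values): with
| // the default power of 0.5, LENGTHS = [1, 4] produces
| // WEIGHTS = [1.0, 0.5, 0.5, 0.5, 0.5].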
| template <class Context> |
| class LengthsToWeightsOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| template <class... Args> |
| explicit LengthsToWeightsOp(Args&&... args) |
| : Operator<Context>(std::forward<Args>(args)...), |
| power_(this->template GetSingleArgument<float>("power", 0.5)) {} |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(0)); |
| } |
| |
| template <typename Index> |
| bool DoRunWithType() { |
| auto& input = Input(0); |
| CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector."); |
| auto* input_data = input.template data<Index>(); |
| auto input_size = input.numel(); |
| auto* output = Output(0); |
| |
| int64_t output_size = 0; |
| for (const auto i : c10::irange(input_size)) { |
| CAFFE_ENFORCE_GE(input_data[i], 0, "unexpected negative length value"); |
| output_size += input_data[i]; |
| } |
| |
| std::function<float(const int64_t& length, const float& power)> getWeight; |
| if (power_ == 0.5) { |
| getWeight = [](const int64_t& length, const float& /*power*/) { |
| return 1.0 / std::sqrt(length); |
| }; |
| } else if (power_ == 1) { |
| getWeight = [](const int64_t& length, const float& /*power*/) { |
| return 1.0 / length; |
| }; |
| } else { |
| getWeight = [](const int64_t& length, const float& power) { |
| return 1.0 / std::pow(length, power); |
| }; |
| } |
| |
| output->Resize(output_size); |
| auto* output_data = output->template mutable_data<float>(); |
| int64_t cnt = 0; |
| for (const auto i : c10::irange(input_size)) { |
| auto len = input_data[i]; |
| if (len == 0) { |
| continue; |
| } |
| CAFFE_ENFORCE_LE(cnt + len, output_size, "unexpected lengths value"); |
| |
| float weight_value = getWeight(len, power_); |
| std::fill(output_data + cnt, output_data + cnt + len, weight_value); |
| cnt += len; |
| } |
| |
| return true; |
| } |
| |
| private: |
| float power_; |
| }; |
| |
| template <class Context> |
| class HasElementsOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(HasElementsOp); |
| |
| bool RunOnDevice() override { |
| bool res = false; |
| for (const auto i : c10::irange(InputSize())) { |
| const auto& input = Input(i); |
| res = res || input.numel() > 0; |
| } |
| auto* output = Output(0); |
| output->Resize(std::vector<int64_t>{}); |
| *output->template mutable_data<bool>() = res; |
| return true; |
| } |
| }; |
| |
| // Return the size of a tensor |
| template <class Context> |
| class SizeOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(SizeOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| |
| auto* output = Output(0, vector<int64_t>(), at::dtype<int64_t>()); |
| auto* output_data = output->template mutable_data<int64_t>(); |
| |
| auto size = input.numel(); |
| math::Set<int64_t, Context>( |
| 1, static_cast<int64_t>(size), output_data, &context_); |
| |
| return true; |
| } |
| }; |
| |
| // returns a shape to be passed to Reshape |
| template <class Context> |
| class LengthsToShapeOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(LengthsToShapeOp); |
| |
| bool RunOnDevice() override { |
| auto& input = Input(0); |
| |
| CAFFE_ENFORCE(input.sizes().size() == 1, "Input must be a vector."); |
| auto* output = Output(0); |
| auto* input_data = input.template data<int32_t>(); |
| |
| auto size = input.numel(); |
| auto first = input_data[0]; |
| |
| for (const auto i : c10::irange(1, size)) { |
| CAFFE_ENFORCE( |
| input_data[i] == first, "All elements of input must be the same.");
| } |
| |
| output->Resize(2); |
| auto* output_data = output->template mutable_data<int32_t>(); |
| output_data[0] = size; |
| output_data[1] = first; |
| |
| return true; |
| } |
| }; |
| |
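| // GatherRangesOp gathers (start, length) ranges of a 1-D DATA tensor.
| // Illustrative example (hypothetical values): DATA = [10, 11, 12, 13, 14, 15],
| // RANGES = [[[0, 2], [4, 1]], [[2, 3], [0, 0]]] (shape 2 x 2 x 2) produces
| // OUTPUT = [10, 11, 14, 12, 13, 14] and LENGTHS = [3, 3].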
| template <class Context> |
| class GatherRangesOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(GatherRangesOp); |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call( |
| this, this->template Input<Tensor>(RANGES, CPU)); |
| } |
| |
| template <typename Index> |
| bool DoRunWithType() { |
| auto& data = Input(DATA); |
| auto& ranges = Input(RANGES); |
| auto* outputData = Output(0); |
| auto* outputLengths = Output(1); |
| |
| auto batchSize = ranges.size(0); |
| CAFFE_ENFORCE(data.dim() == 1, "Data has to be 1-D"); |
| CAFFE_ENFORCE(ranges.dim() == 3, "Ranges must be 3-D"); |
| CAFFE_ENFORCE(ranges.size(1) > 0, "There has to be at least one range"); |
| CAFFE_ENFORCE_EQ( |
| ranges.size(2), 2, "Ranges last dimension should be of size 2"); |
| |
| auto* rawData = static_cast<const char*>(data.raw_data()); |
| auto* rangesData = ranges.template data<Index>(); |
| |
| outputLengths->Resize(batchSize); |
| auto* outputLengthsPtr = outputLengths->template mutable_data<int32_t>(); |
| size_t start = 0; |
| size_t blockSize = ranges.size_from_dim(1); |
| for (const auto i : c10::irange(batchSize)) { |
| auto end = start + blockSize; |
| outputLengthsPtr[i] = accumulate(rangesData, start, end); |
| start = end; |
| } |
| |
| size_t outputSize = accumulate(rangesData, 0, ranges.numel()); |
| outputData->Resize(outputSize); |
| |
| auto outputRawData = |
| static_cast<char*>(outputData->raw_mutable_data(data.dtype())); |
| VLOG(1) << "Copying data"; |
| size_t outputOffsetBytes = 0; |
| auto itemsize = data.dtype().itemsize(); |
| for (int i = 0; i < ranges.numel(); i += 2) { |
| auto rangeStart = rangesData[i]; |
| auto rangeLength = rangesData[i + 1]; |
| if (!rangeLength) { |
| continue; |
| } |
| auto rangeSizeBytes = rangeLength * itemsize; |
| CAFFE_ENFORCE(outputOffsetBytes < outputSize * itemsize); |
| CAFFE_ENFORCE(rangeStart + rangeLength <= data.numel()); |
| context_.CopyItemsSameDevice( |
| data.dtype(), |
| rangeLength, |
| rawData + rangeStart * itemsize, |
| outputRawData + outputOffsetBytes); |
| outputOffsetBytes += rangeSizeBytes; |
| } |
| CAFFE_ENFORCE(outputOffsetBytes == outputSize * itemsize); |
| return true; |
| } |
| |
| INPUT_TAGS(DATA, RANGES, LENGTHS); |
| |
| private: |
| template <typename Index> |
| size_t accumulate(Index* ranges, size_t start, size_t end) { |
| size_t result = 0; |
| for (size_t i = start + 1; i < end; i += 2) { |
| result += ranges[i]; |
| } |
| return result; |
| } |
| }; |
| |
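| // LengthsGatherOp gathers whole groups of rows from ITEMS, where LENGTHS
| // partitions the rows of ITEMS into consecutive groups and INDICES selects
| // the groups. Illustrative example (hypothetical values): with
| // LENGTHS = [2, 3, 1] and INDICES = [2, 0], the output contains the single
| // row of group 2 followed by the two rows of group 0.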
| template <class Context> |
| class LengthsGatherOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(LengthsGatherOp); |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call( |
| this, this->template Input<Tensor>(INDICES, CPU)); |
| } |
| |
| template <typename Index> |
| bool DoRunWithType() { |
| auto& items = Input(ITEMS); |
| auto& lengths = Input(LENGTHS); |
| auto& indices = Input(INDICES); |
| auto* output = Output(0); |
| |
| CAFFE_ENFORCE_GE(items.dim(), 1, "ITEMS should be at least 1-D"); |
| CAFFE_ENFORCE_EQ(lengths.dim(), 1, "LENGTHS should be 1-D"); |
| CAFFE_ENFORCE_EQ(indices.dim(), 1, "INDICES should be 1-D"); |
| |
| const auto* lengths_data = lengths.template data<int32_t>(); |
| const auto* indices_data = indices.template data<Index>(); |
| |
| int64_t total_length = 0; |
| for (const auto i : c10::irange(indices.numel())) { |
| auto idx = indices_data[i]; |
| CAFFE_ENFORCE_LT(idx, lengths.numel()); |
| total_length += lengths_data[idx]; |
| } |
| auto shape = items.sizes().vec(); |
| shape[0] = total_length; |
| output->Resize(shape); |
| |
| offsets_.clear(); |
| int64_t running_offset = 0; |
| offsets_.reserve(lengths.numel()); |
| for (const auto i : c10::irange(lengths.numel())) { |
| offsets_.push_back(running_offset); |
| running_offset += lengths_data[i]; |
| } |
| CAFFE_ENFORCE_EQ( |
| items.size(0), |
| running_offset, |
| "LENGTHS must match the first dimension of ITEMS"); |
| |
| auto src_base = static_cast<const char*>(items.raw_data()); |
| auto block_size = items.size_from_dim(1); |
| auto block_bytesize = block_size * items.itemsize(); |
| auto out = static_cast<char*>(output->raw_mutable_data(items.dtype())); |
| |
| for (const auto i : c10::irange(indices.numel())) { |
| auto idx = indices_data[i]; |
| auto length = lengths_data[idx]; |
| context_.CopyItemsSameDevice( |
| items.dtype(), |
| length * block_size, |
| src_base + offsets_[idx] * block_bytesize, |
| out); |
| out += length * block_bytesize; |
| } |
| return true; |
| } |
| |
| std::vector<int64_t> offsets_; |
| |
| INPUT_TAGS(ITEMS, LENGTHS, INDICES); |
| }; |
| |
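| // AccumulateHistogramOp buckets the values of X into num_buckets equal bins
| // over [lower_bound, upper_bound), plus one bin for values below lower_bound
| // and one for values >= upper_bound, and keeps a running accumulated count.
| // Illustrative example (hypothetical values): lower_bound = 0,
| // upper_bound = 1, num_buckets = 2 gives four output bins:
| // (-inf, 0), [0, 0.5), [0.5, 1), [1, inf).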
| template <typename T, class Context> |
| class AccumulateHistogramOp : public Operator<Context> { |
| public: |
| template <class... Args> |
| explicit AccumulateHistogramOp(Args&&... args) |
| : Operator<Context>(std::forward<Args>(args)...), |
| lower_bound_( |
| this->template GetSingleArgument<float>("lower_bound", 0.0)), |
| upper_bound_( |
| this->template GetSingleArgument<float>("upper_bound", 1.0)), |
| num_buckets_(this->template GetSingleArgument<int>("num_buckets", 1)) { |
| CAFFE_ENFORCE_GT(num_buckets_, 0); |
| // two more buckets, for values < lower_bound and >= upper_bound respectively
| num_output_buckets_ = num_buckets_ + 2; |
| accumulate_hist_ = std::vector<int64_t>(num_output_buckets_, 0); |
| } |
| |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| |
| bool RunOnDevice() override { |
| auto& X = Input(X_IN); |
| auto* X_data = X.template data<T>(); |
| int N = X.numel(); |
| auto* cur_hist = Output(CUR_HIST); |
| auto* acc_hist = Output(ACC_HIST); |
| cur_hist->Resize(num_output_buckets_); |
| acc_hist->Resize(num_output_buckets_); |
| auto* cur_hist_data = cur_hist->template mutable_data<int64_t>(); |
| auto* acc_hist_data = acc_hist->template mutable_data<int64_t>(); |
| auto segment = (upper_bound_ - lower_bound_) / num_buckets_; |
| math::Set<int64_t, Context>( |
| num_output_buckets_, 0, cur_hist_data, &context_); |
| |
| for (const auto i : c10::irange(N)) { |
| int bucket_index = -1; |
| if (X_data[i] < lower_bound_) { |
| bucket_index = 0; |
| } else if (X_data[i] >= upper_bound_) { |
| bucket_index = num_buckets_ + 1; |
| } else { |
| bucket_index = (int)((X_data[i] - lower_bound_) / segment) + 1; |
| } |
| cur_hist_data[bucket_index] += 1; |
| accumulate_hist_[bucket_index] += 1; |
| } |
| |
| for (const auto i : c10::irange(num_output_buckets_)) { |
| acc_hist_data[i] = accumulate_hist_[i]; |
| } |
| |
| return true; |
| } |
| |
| private: |
| float lower_bound_; |
| float upper_bound_; |
| int num_buckets_; |
| int num_output_buckets_; |
| std::vector<int64_t> accumulate_hist_; |
| |
| INPUT_TAGS(X_IN); |
| OUTPUT_TAGS(CUR_HIST, ACC_HIST); |
| }; |
| |
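| // RangeOp mimics numpy.arange: with one input it is arange(stop), with two
| // arange(start, stop), and with three arange(start, stop, step).
| // Illustrative example (hypothetical values): start = 2, stop = 8, step = 3
| // produces [2, 5].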
| template <class Context> |
| class RangeOp : public Operator<Context> { |
| public: |
| USE_OPERATOR_CONTEXT_FUNCTIONS; |
| USE_SIMPLE_CTOR_DTOR(RangeOp) |
| |
| bool RunOnDevice() override { |
| return DispatchHelper<TensorTypes<int32_t, int64_t, float, double>>::call( |
| this, Input(0)); |
| } |
| |
| template <typename T> |
| T readScalarInput(const int index) { |
| if (std::is_same<Context, TensorCPU>::value) { |
| return Input(index).template data<T>()[0]; |
| } else { |
| local_.CopyFrom(Input(index)); |
| return local_.template data<T>()[0]; |
| } |
| } |
| |
| template <typename T> |
| bool DoRunWithType() { |
| T stop = 0; |
| T start = 0; |
| T step = 1; |
| |
| for (const auto i : c10::irange(InputSize())) { |
| CAFFE_ENFORCE_EQ( |
| Input(i).numel(), 1, "All inputs must be scalar/1D tensor."); |
| } |
| |
| switch (InputSize()) { |
| case 1: |
| stop = readScalarInput<T>(0); |
| break; |
| case 2: |
| start = readScalarInput<T>(0); |
| stop = readScalarInput<T>(1); |
| break; |
| case 3: |
| step = readScalarInput<T>(2); |
| start = readScalarInput<T>(0); |
| stop = readScalarInput<T>(1); |
| break; |
| } |
| CAFFE_ENFORCE_NE(step, 0, "Step size cannot be 0."); |
| int length; |
| auto diff = stop - start; |
| if (std::is_integral<T>::value) { |
| // Avoid casting to and from floats in case it introduces rounding and |
| // avoid mod because the compiler doesn't strip unused code until later. |
| length = diff / step; |
| if (length * step < diff) { |
| length += 1; |
| } |
| } else { |
| length = static_cast<int>(ceil(diff / step)); |
| } |
| |
| // Match numpy's behavior here. |
| if (length <= 0) { |
| Output(0, {0}, at::dtype<T>()); |
| return true; |
| } else { |
| auto* output = Output(0, {length}, at::dtype<T>()); |
| return DoRunOnDevice<T>(start, step, output); |
| } |
| } |
| |
| template <typename T> |
| bool DoRunOnDevice(const T& start, const T& step, Tensor* output); |
| |
| private: |
| // local CPU tensor for copying constants. |
| Tensor local_{CPU}; |
| }; |
| |
| class ThrowExceptionOp : public Operator<CPUContext> { |
| public: |
| template <class... Args> |
| explicit ThrowExceptionOp(Args&&... args) |
| : Operator<CPUContext>(std::forward<Args>(args)...), |
| message_(GetSingleArgument<std::string>( |
| "message", |
| "Exception from ThrowExceptionOp")) {} |
| |
| bool RunOnDevice() override { |
| CAFFE_THROW(message_); |
| } |
| |
| private: |
| const std::string message_; |
| }; |
| |
| class ThrowChildThreadExceptionOp : public Operator<CPUContext> { |
| public: |
| template <class... Args> |
| explicit ThrowChildThreadExceptionOp(Args&&... args) |
| : Operator<CPUContext>(std::forward<Args>(args)...), |
| message_(GetSingleArgument<std::string>( |
| "message", |
| "Exception from ThrowChildThreadExceptionOp")) {} |
| |
| bool RunOnDevice() override { |
| std::thread t([this]() { CAFFE_THROW(this->message_); }); |
| |
| t.join(); |
| return true; |
| } |
| |
| private: |
| const std::string message_; |
| }; |
| |
| class LogFatalOp : public Operator<CPUContext> { |
| public: |
| template <class... Args> |
| explicit LogFatalOp(Args&&... args) |
| : Operator<CPUContext>(std::forward<Args>(args)...), |
| message_(GetSingleArgument<std::string>( |
| "message", |
| "Logging from LogFatalOp")) {} |
| |
| bool RunOnDevice() override { |
| LOG(FATAL) << message_; |
| return true; |
| } |
| |
| private: |
| const std::string message_; |
| }; |
| |
| class FailOp : public Operator<CPUContext> { |
| public: |
| template <class... Args> |
| explicit FailOp(Args&&... args) |
| : Operator<CPUContext>(std::forward<Args>(args)...) {} |
| |
| bool RunOnDevice() override { |
| return false; |
| } |
| }; |
| |
| } // namespace caffe2 |
| |
| #endif // CAFFE2_OPERATORS_UTILITY_OPS_H_ |