| #include "caffe2/operators/gelu_op.h" |
| |
| #include <algorithm> |
| #include <cmath> |
| #include <functional> |
| #include <numeric> |
| #include <string> |
| #include <vector> |
| |
| #ifdef _MSC_VER |
| #ifndef _USE_MATH_DEFINES |
| #define _USE_MATH_DEFINES |
| #endif |
| #include <math.h> |
| #endif // _MSC_VER |
| |
| #include "caffe2/utils/eigen_utils.h" |
| #include "caffe2/utils/math.h" |
| |
namespace caffe2 {

template <>
template <typename T>
bool GeluFunctor<CPUContext>::
operator()(const int N, const T* X, T* Y, CPUContext* context) const {
  if (fast_gelu) {
    // y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3)))
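    // kAlpha = sqrt(2 / Pi): M_2_SQRTPI is 2 / sqrt(Pi) and M_SQRT1_2 is
    // 1 / sqrt(2), so their product is sqrt(2 / Pi).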
    constexpr T kAlpha = M_2_SQRTPI * M_SQRT1_2;
    ConstEigenVectorArrayMap<T> X_arr(X, N);
    EigenVectorArrayMap<T> Y_arr(Y, N);
    Y_arr = X_arr *
        (((X_arr + X_arr.cube() * gelu_utils::kFastCoeff) * kAlpha).tanh() +
         T(1)) *
        static_cast<T>(0.5);
  } else {
    // y = x * P(X <= x) where X ~ N(0, 1)
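    // CdfNorm writes Phi(x) = 0.5 * (1 + erf(x / sqrt(2))) into Y; the
    // in-place Mul then forms y = x * Phi(x).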
    math::CdfNorm<T, CPUContext>(N, X, Y, context);
    math::Mul<T, CPUContext>(N, X, Y, Y, context);
  }
  return true;
}

template <>
template <typename T>
bool GeluGradientFunctor<CPUContext>::Forward(
    const std::vector<int>& dY_dims,
    const std::vector<int>& /* X_dims */,
    const T* dY,
    const T* X,
    T* dX,
    CPUContext* context) const {
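  // N is the total element count, i.e. the product of dY's dimensions.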
  const int N = std::accumulate(
      // NOLINTNEXTLINE(modernize-use-transparent-functors)
      dY_dims.cbegin(), dY_dims.cend(), 1, std::multiplies<int>());
  ConstEigenVectorArrayMap<T> dY_arr(dY, N);
  ConstEigenVectorArrayMap<T> X_arr(X, N);
  EigenVectorArrayMap<T> dX_arr(dX, N);
  if (fast_gelu) {
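    // With u = kAlpha * (x + kFastCoeff * x^3), y = 0.5 * x * (1 + tanh(u)),
    // so dy/dx = 0.5 * (1 + tanh(u))
    //          + 0.5 * x * (1 - tanh(u)^2) * (kAlpha + kBeta * x^2),
    // where kBeta = 3 * kAlpha * kFastCoeff. dX is used as scratch space to
    // hold tanh(u) before the final expression overwrites it.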
    constexpr T kAlpha = M_2_SQRTPI * M_SQRT1_2;
    constexpr T kBeta = kAlpha * gelu_utils::kFastCoeff * T(3);
    dX_arr = ((X_arr + X_arr.cube() * gelu_utils::kFastCoeff) * kAlpha).tanh();
    dX_arr =
        (T(1) + dX_arr +
         X_arr * (T(1) - dX_arr.square()) * (kBeta * X_arr.square() + kAlpha)) *
        dY_arr * static_cast<T>(0.5);
  } else {
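    // Exact GELU: y = x * Phi(x), so dy/dx = Phi(x) + x * phi(x), where
    // phi(x) = exp(-x^2 / 2) / sqrt(2 * Pi) is the standard normal density;
    // here kAlpha = 1 / sqrt(2 * Pi).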
    constexpr T kAlpha = M_2_SQRTPI * M_SQRT1_2 * T(0.5);
    math::CdfNorm<T, CPUContext>(N, X, dX, context);
    dX_arr = (dX_arr +
              X_arr * (-X_arr.square() * static_cast<T>(0.5)).exp() * kAlpha) *
        dY_arr;
  }
  return true;
}

REGISTER_CPU_OPERATOR(Gelu, GeluOp<CPUContext>);
REGISTER_CPU_OPERATOR(GeluGradient, GeluGradientOp<CPUContext>);

namespace {

OpSchema::Cost CostInferenceForGelu(
    const OperatorDef& def,
    const vector<TensorShape>& in) {
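  // Gelu is elementwise; PointwiseCostInference<2> charges two FLOPs per
  // output element, and there are no trainable parameters to count.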
  struct OpSchema::Cost cost = PointwiseCostInference<2>(def, in);
  cost.params_bytes = 0;
  return cost;
}

} // namespace

// Input: X, output: Y
OPERATOR_SCHEMA(Gelu)
    .NumInputs(1)
    .NumOutputs(1)
    .Arg(
        "fast_gelu",
        "If true, use the tanh approximation "
        "y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3))); "
        "otherwise compute the exact y = x * P(X <= x) where X ~ N(0, 1).")
    .CostInferenceFunction(CostInferenceForGelu)
    .IdenticalTypeAndShape()
| .SetDoc(R"DOC( |
| Relu takes one input data (Tensor) and produces one output data |
| (Tensor) where the rectified linear function, y = xP(X <= x) where X ~ N(0, 1), |
| is applied to the tensor elementwise. |
| )DOC") |
| .Input(0, "X", "1D input tensor") |
| .Output(0, "Y", "1D input tensor"); |

OPERATOR_SCHEMA(GeluGradient)
    .NumInputs(2)
    .NumOutputs(1)
    .IdenticalTypeAndShapeOfInput(1);

namespace {

class GetGeluGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  std::vector<OperatorDef> GetGradientDefs() override {
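    // dX = GeluGradient(dY, X): the gradient op consumes the output
    // gradient GO(0) and the forward input I(0), and produces the input
    // gradient GI(0).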
    return SingleGradientDef(
        "GeluGradient",
        "",
        std::vector<std::string>{GO(0), I(0)},
        std::vector<std::string>{GI(0)});
  }
};

} // namespace

REGISTER_GRADIENT(Gelu, GetGeluGradient);

} // namespace caffe2

C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
    Gelu,
    "_caffe2::Gelu(Tensor input, bool fast_gelu = False) -> (Tensor output)",
    caffe2::GeluOp<caffe2::CPUContext>);
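
// Usage sketch (an assumption, not confirmed by this file): once exported,
// the operator is typically callable from Python as
//   torch.ops._caffe2.Gelu(x, fast_gelu)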