#include "caffe2/utils/math/reduce.h"
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <functional>
#include <limits>
#include <numeric>
#include <vector>
#ifdef CAFFE2_USE_ACCELERATE
#include <Accelerate/Accelerate.h>
#endif // CAFFE2_USE_ACCELERATE
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL
#include <c10/util/accumulate.h>
#include "caffe2/core/context.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
#include "caffe2/utils/math/broadcast.h"
#include "caffe2/utils/math/elementwise.h"
#include "caffe2/utils/math/utils.h"
namespace caffe2 {
namespace math {
namespace {
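// Rowwise reductions over a row-major rows x cols matrix. X is mapped as a
// cols x rows column-major Eigen matrix, so each logical row of X becomes an
// Eigen column; the colwise reduction then yields one value per row, which is
// scaled by alpha.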
#define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenFunc) \
template <typename T> \
void Rowwise##Func( \
const int rows, \
const int cols, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* /* context */) { \
EigenVectorMap<T>(Y, rows) = ConstEigenMatrixMap<T>(X, cols, rows) \
.colwise() \
.EigenFunc() \
.transpose() * \
alpha; \
}
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm)
#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
#ifndef CAFFE2_USE_EIGEN_FOR_BLAS
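// With a real BLAS available, the rowwise L1/L2 reductions for float/double
// delegate to the corresponding cblas_?asum / cblas_?nrm2 kernel per row.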
#define DELEGATE_ROWWISE_REDUCE_FUNCTION(T, Func, BLASFunc) \
template <> \
void Rowwise##Func( \
const int rows, \
const int cols, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* /* context */) { \
for (int i = 0; i < rows; ++i) { \
Y[i] = BLASFunc(cols, X + i * cols, 1) * alpha; \
} \
}
DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL1, cblas_sasum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL1, cblas_dasum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(float, ReduceL2, cblas_snrm2)
DELEGATE_ROWWISE_REDUCE_FUNCTION(double, ReduceL2, cblas_dnrm2)
#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
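// Colwise reductions: seed Y with the first row of X, fold the remaining rows
// in with the elementwise Min/Max/Add primitive, then scale by alpha.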
#define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, MathFunc) \
template <typename T> \
void Colwise##Func( \
const int rows, \
const int cols, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
std::memcpy(Y, X, sizeof(T) * cols); \
for (int i = 1; i < rows; ++i) { \
MathFunc<T, CPUContext>(cols, Y, X + i * cols, Y, context); \
} \
Scale<T, T, CPUContext>(cols, alpha, Y, Y, context); \
}
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, Min)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, Max)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, Add)
#undef DELEGATE_COLWISE_REDUCE_FUNCTION
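// Colwise mean: a colwise sum with the 1/rows factor folded into alpha.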
template <typename T>
void ColwiseReduceMean(
const int rows,
const int cols,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ColwiseReduceSum<T>(rows, cols, alpha / static_cast<T>(rows), X, Y, context);
}
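// Colwise L1 norm: accumulate absolute values across rows, then scale.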
template <typename T>
void ColwiseReduceL1(
const int rows,
const int cols,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
EigenVectorArrayMap<T> Y_arr(Y, cols);
Y_arr = X_arr.col(0).abs();
for (int i = 1; i < rows; ++i) {
Y_arr += X_arr.col(i).abs();
}
Scale<T, T, CPUContext>(cols, alpha, Y, Y, context);
}
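// Colwise L2 norm: accumulate squares across rows, then take sqrt and scale.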
template <typename T>
void ColwiseReduceL2(
const int rows,
const int cols,
const T alpha,
const T* X,
T* Y,
CPUContext* /* context */) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
EigenVectorArrayMap<T> Y_arr(Y, cols);
Y_arr = X_arr.col(0).square();
for (int i = 1; i < rows; ++i) {
Y_arr += X_arr.col(i).square();
}
Y_arr = Y_arr.sqrt() * alpha;
}
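// The BothEnds* reductions treat X as an M x N x K tensor and reduce over the
// outer (M) and inner (K) dimensions, producing a length-N output. Each
// K-sized Eigen column of the (K, N) map covers one (m, n) slice.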
template <typename T>
void BothEndsReduceMin(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorArrayMap<T> Y_arr(Y, N);
Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().minCoeff();
for (int i = 1; i < M; ++i) {
ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
for (int j = 0; j < N; ++j) {
Y[j] = std::min(Y[j], X_arr.col(j).minCoeff());
}
}
Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceMax(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorArrayMap<T> Y_arr(Y, N);
Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().maxCoeff();
for (int i = 1; i < M; ++i) {
ConstEigenArrayMap<T> X_arr(X + i * N * K, K, N);
for (int j = 0; j < N; ++j) {
Y[j] = std::max(Y[j], X_arr.col(j).maxCoeff());
}
}
Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceSum(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorArrayMap<T> Y_arr(Y, N);
Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().sum();
for (int i = 1; i < M; ++i) {
Y_arr +=
ConstEigenArrayMap<T>(X + i * N * K, K, N).colwise().sum().transpose();
}
Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceMean(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorArrayMap<T> Y_arr(Y, N);
Y_arr = ConstEigenArrayMap<T>(X, K, N).colwise().sum();
for (int i = 1; i < M; ++i) {
Y_arr +=
ConstEigenArrayMap<T>(X + i * N * K, K, N).colwise().sum().transpose();
}
Scale<T, T, CPUContext>(N, alpha / static_cast<T>(M * K), Y, Y, context);
}
template <typename T>
void BothEndsReduceL1(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
EigenVectorMap<T> Y_vec(Y, N);
Y_vec = ConstEigenMatrixMap<T>(X, K, N).colwise().template lpNorm<1>();
for (int i = 1; i < M; ++i) {
Y_vec += ConstEigenMatrixMap<T>(X + i * N * K, K, N)
.colwise()
.template lpNorm<1>()
.transpose();
}
Scale<T, T, CPUContext>(N, alpha, Y, Y, context);
}
template <typename T>
void BothEndsReduceL2(
const int M,
const int N,
const int K,
const T alpha,
const T* X,
T* Y,
CPUContext* /* context */) {
ConstEigenArrayMap<T> X0_arr(X, K, N);
EigenVectorArrayMap<T> Y_arr(Y, N);
for (int i = 0; i < N; ++i) {
Y_arr(i) = X0_arr.col(i).square().sum();
}
for (int i = 1; i < M; ++i) {
ConstEigenArrayMap<T> Xi_arr(X + i * N * K, K, N);
for (int j = 0; j < N; ++j) {
Y_arr(j) += Xi_arr.col(j).square().sum();
}
}
Y_arr = Y_arr.sqrt() * alpha;
}
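// Fast path for ReduceTensorImpl: when can_use_broadcast_fastpath (see
// broadcast.h) accepts the output shape, the flat X index maps onto Y by
// simply cycling Y_index modulo Y_size, avoiding per-element
// multi-dimensional index arithmetic.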
template <typename T, class Reducer>
void ReduceTensorImplFastpath(
const int X_size,
const int Y_size,
const Reducer& reducer,
const T* X,
T* Y) {
int Y_index = 0;
for (int X_index = 0; X_index < X_size; ++X_index) {
Y[Y_index] = reducer(Y[Y_index], X[X_index]);
Y_index++;
if (Y_index >= Y_size) {
Y_index = 0;
}
}
}
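// Generic N-dimensional reduction: initialize Y with the reducer's identity
// `init`, then fold every element of X into its target output slot using
// odometer-style index iteration over X_dims.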
template <typename T, class Reducer>
void ReduceTensorImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const Reducer& reducer,
const T init,
const T* X,
T* Y,
CPUContext* context,
bool allow_broadcast_fastpath) {
const auto X_size = c10::multiply_integers(X_dims, X_dims + ndim);
const auto Y_size = c10::multiply_integers(Y_dims, Y_dims + ndim);
Set<T, CPUContext>(Y_size, init, Y, context);
if (allow_broadcast_fastpath && can_use_broadcast_fastpath(ndim, Y_dims)) {
ReduceTensorImplFastpath(X_size, Y_size, reducer, X, Y);
return;
}
std::vector<int> index(ndim, 0);
for (int X_index = 0; X_index < X_size; ++X_index) {
const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
Y[Y_index] = reducer(Y[Y_index], X[X_index]);
utils::IncreaseIndexInDims(ndim, X_dims, index.data());
}
}
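// The Reduce*Impl wrappers instantiate ReduceTensorImpl with the matching
// binary op and identity, then apply alpha as a post-scale. ReduceMean folds
// the 1/count factor into the scale, and ReduceL2 takes a sqrt first.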
template <typename T>
void ReduceMinImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context,
bool allow_broadcast_fastpath) {
ReduceTensorImpl(
ndim,
X_dims,
Y_dims,
[](const T a, const T b) { return std::min(a, b); },
std::numeric_limits<T>::max(),
X,
Y,
context,
allow_broadcast_fastpath);
const auto Y_size = c10::multiply_integers(Y_dims, Y_dims + ndim);
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceMaxImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context,
bool allow_broadcast_fastpath) {
ReduceTensorImpl(
ndim,
X_dims,
Y_dims,
[](const T a, const T b) { return std::max(a, b); },
std::numeric_limits<T>::lowest(),
X,
Y,
context,
allow_broadcast_fastpath);
const auto Y_size = c10::multiply_integers(Y_dims, Y_dims + ndim);
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceSumImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context,
bool allow_broadcast_fastpath) {
ReduceTensorImpl(ndim, X_dims, Y_dims, std::plus<T>(), T(0), X, Y, context, allow_broadcast_fastpath);
const auto Y_size = c10::multiply_integers(Y_dims, Y_dims + ndim);
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceMeanImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context,
bool allow_broadcast_fastpath) {
ReduceTensorImpl(ndim, X_dims, Y_dims, std::plus<T>(), T(0), X, Y, context, allow_broadcast_fastpath);
const auto X_size = c10::multiply_integers(X_dims, X_dims + ndim);
const auto Y_size = c10::multiply_integers(Y_dims, Y_dims + ndim);
Scale<T, T, CPUContext>(
Y_size,
alpha * static_cast<T>(Y_size) / static_cast<T>(X_size),
Y,
Y,
context);
}
template <typename T>
void ReduceL1Impl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context,
bool allow_broadcast_fastpath) {
ReduceTensorImpl(
ndim,
X_dims,
Y_dims,
[](const T a, const T b) { return a + std::abs(b); },
T(0),
X,
Y,
context,
allow_broadcast_fastpath);
const auto Y_size = c10::multiply_integers(Y_dims, Y_dims + ndim);
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
template <typename T>
void ReduceL2Impl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context,
bool allow_broadcast_fastpath) {
ReduceTensorImpl(
ndim,
X_dims,
Y_dims,
[](const T a, const T b) { return a + b * b; },
T(0),
X,
Y,
context,
allow_broadcast_fastpath);
const auto Y_size = c10::multiply_integers(Y_dims, Y_dims + ndim);
EigenVectorArrayMap<T> Y_arr(Y, Y_size);
Y_arr = Y_arr.sqrt() * alpha;
}
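// Per-row mean and biased variance, computed in two passes per row.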
template <typename T>
void RowwiseMoments(
const int rows,
const int cols,
const T* X,
T* mean,
T* var) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
for (int i = 0; i < rows; ++i) {
const T m = X_arr.col(i).mean();
mean[i] = m;
var[i] = (X_arr.col(i) - m).square().mean();
}
}
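// Per-column mean and biased variance via Welford's online update across
// rows, which is more numerically stable than accumulating raw sums of
// squares.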
template <typename T>
void ColwiseMoments(
const int rows,
const int cols,
const T* X,
T* mean,
T* var) {
ConstEigenArrayMap<T> X_arr(X, cols, rows);
EigenVectorArrayMap<T> mean_arr(mean, cols);
EigenVectorArrayMap<T> var_arr(var, cols);
EArrXt<T> delta_arr(cols);
mean_arr.setZero();
var_arr.setZero();
for (int i = 0; i < rows; ++i) {
delta_arr = X_arr.col(i) - mean_arr;
mean_arr += delta_arr / static_cast<T>(i + 1);
var_arr += delta_arr * (X_arr.col(i) - mean_arr);
}
var_arr /= static_cast<T>(rows);
}
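// Moments over the outer (M) and inner (K) dimensions of an M x N x K
// tensor: accumulate per-slice sums and sums of squares, then use
// var = E[X^2] - E[X]^2 (biased).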
template <typename T>
void BothEndsMoments(
const int M,
const int N,
const int K,
const T* X,
T* mean,
T* var) {
ConstEigenArrayMap<T> X_arr(X, K, M * N);
EigenVectorArrayMap<T> mean_arr(mean, N);
EigenVectorArrayMap<T> var_arr(var, N);
for (int i = 0; i < N; ++i) {
mean_arr(i) = X_arr.col(i).sum();
var_arr(i) = X_arr.col(i).square().sum();
}
for (int i = 1; i < M; ++i) {
for (int j = 0; j < N; ++j) {
const int c = i * N + j;
mean_arr(j) += X_arr.col(c).sum();
var_arr(j) += X_arr.col(c).square().sum();
}
}
const T scale = T(1) / static_cast<T>(M * K);
mean_arr *= scale;
var_arr = var_arr * scale - mean_arr.square();
}
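// Moments dispatch: handle the empty and identity-shape cases, then try the
// rowwise, colwise, and both-ends specializations before falling back to a
// generic sum / sum-of-squares accumulation over all of X.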
template <typename T>
void MomentsImpl(
const int ndim,
const int* X_dims,
const int* Y_dims,
const T* X,
T* mean,
T* var,
CPUContext* /* context */,
bool allow_broadcast_fastpath) {
const auto X_size = c10::multiply_integers(X_dims, X_dims + ndim);
const auto Y_size = c10::multiply_integers(Y_dims, Y_dims + ndim);
if (X_size == 0) {
std::memset(mean, 0, sizeof(T) * Y_size);
std::memset(var, 0, sizeof(T) * Y_size);
return;
}
if (std::equal(X_dims, X_dims + ndim, Y_dims)) {
std::memcpy(mean, X, sizeof(T) * Y_size);
std::memset(var, 0, sizeof(T) * Y_size);
return;
}
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int rows;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int cols;
if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
RowwiseMoments<T>(rows, cols, X, mean, var);
return;
}
if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) {
ColwiseMoments<T>(rows, cols, X, mean, var);
return;
}
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int pre;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int mid;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
int nxt;
if (utils::IsBothEndsReduce(ndim, X_dims, Y_dims, &pre, &mid, &nxt)) {
BothEndsMoments<T>(pre, mid, nxt, X, mean, var);
return;
}
std::memset(mean, 0, sizeof(T) * Y_size);
std::memset(var, 0, sizeof(T) * Y_size);
std::vector<int> index(ndim, 0);
for (int X_index = 0; X_index < X_size; ++X_index) {
const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
mean[Y_index] += X[X_index];
var[Y_index] += X[X_index] * X[X_index];
utils::IncreaseIndexInDims(ndim, X_dims, index.data());
}
const T scale = static_cast<T>(Y_size) / static_cast<T>(X_size);
EigenVectorArrayMap<T> mean_arr(mean, Y_size);
EigenVectorArrayMap<T> var_arr(var, Y_size);
mean_arr *= scale;
var_arr = var_arr * scale - mean_arr.square();
}
} // namespace
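// Full reductions of a length-N buffer to a single scalar via Eigen.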
#define DELEGATE_GLOBAL_REDUCE_FUNCTION(T, Func, EigenFunc) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
const int N, \
const T* X, \
T* Y, \
Tensor* /* scratch_ptr */, \
CPUContext* /* context */) { \
*Y = ConstEigenVectorArrayMap<T>(X, N).EigenFunc(); \
}
DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMin, minCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMin, minCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMin, minCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(float, ReduceMax, maxCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int32_t, ReduceMax, maxCoeff)
DELEGATE_GLOBAL_REDUCE_FUNCTION(std::int64_t, ReduceMax, maxCoeff)
#undef DELEGATE_GLOBAL_REDUCE_FUNCTION
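// Main dispatch macro for the shape-aware reductions. It special-cases an
// empty input (Y is filled with alpha * identity), alpha == 0, and the
// identity shape, then routes to the rowwise / colwise / both-ends kernels
// before falling back to the generic Func##Impl. For example, a call with
// ndim = 2, X_dims = {4, 8}, Y_dims = {4, 1} reduces the trailing dimension
// and takes the rowwise path.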
#define DELEGATE_REDUCE_FUNCTION(T, Func, kInit, kIsNorm) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
const int ndim, \
const int* X_dims, \
const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context, \
bool allow_broadcast_fastpath) { \
const int X_size = \
std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>()); \
const int Y_size = \
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>()); \
if (X_size == 0) { \
Set<T, CPUContext>(Y_size, alpha * kInit, Y, context); \
return; \
} \
if (alpha == T(0)) { \
std::memset(Y, 0, sizeof(T) * Y_size); \
return; \
} \
if (std::equal(X_dims, X_dims + ndim, Y_dims)) { \
if (kIsNorm) { \
EigenVectorArrayMap<T>(Y, Y_size) = \
ConstEigenVectorArrayMap<T>(X, X_size).abs() * alpha; \
} else { \
Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context); \
} \
return; \
} \
int rows; \
int cols; \
if (utils::IsRowwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) { \
Rowwise##Func<T>(rows, cols, alpha, X, Y, context); \
return; \
} \
if (utils::IsColwiseReduce(ndim, X_dims, Y_dims, &rows, &cols)) { \
Colwise##Func<T>(rows, cols, alpha, X, Y, context); \
return; \
} \
int M; \
int N; \
int K; \
if (utils::IsBothEndsReduce(ndim, X_dims, Y_dims, &M, &N, &K)) { \
BothEnds##Func<T>(M, N, K, alpha, X, Y, context); \
return; \
} \
Func##Impl<T>(ndim, X_dims, Y_dims, alpha, X, Y, \
context, allow_broadcast_fastpath); \
}
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(
float,
ReduceMin,
std::numeric_limits<float>::max(),
false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(
double,
ReduceMin,
std::numeric_limits<double>::max(),
false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceMin,
std::numeric_limits<std::int32_t>::max(),
false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceMin,
std::numeric_limits<std::int64_t>::max(),
false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(
float,
ReduceMax,
std::numeric_limits<float>::lowest(),
false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(
double,
ReduceMax,
std::numeric_limits<double>::lowest(),
false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceMax,
std::numeric_limits<std::int32_t>::lowest(),
false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceMax,
std::numeric_limits<std::int64_t>::lowest(),
false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(float, ReduceSum, 0.0f, false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(double, ReduceSum, 0.0, false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceSum, 0, false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceSum, 0LL, false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(float, ReduceMean, 0.0f, false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(double, ReduceMean, 0.0, false)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(float, ReduceL1, 0.0f, true)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(double, ReduceL1, 0.0, true)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(std::int32_t, ReduceL1, 0, true)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(std::int64_t, ReduceL1, 0LL, true)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(float, ReduceL2, 0.0f, true)
// NOLINTNEXTLINE(modernize-use-transparent-functors)
DELEGATE_REDUCE_FUNCTION(double, ReduceL2, 0.0, true)
#undef DELEGATE_REDUCE_FUNCTION
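// Moments<T, CPUContext> specializations simply forward to MomentsImpl.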
#define CAFFE2_SPECIALIZED_MOMENTS(T) \
template <> \
C10_EXPORT void Moments<T, CPUContext>( \
const int ndim, \
const int* X_dims, \
const int* Y_dims, \
const T* X, \
T* mean, \
T* var, \
CPUContext* context, \
bool allow_broadcast_fastpath) { \
MomentsImpl<T>(ndim, X_dims, Y_dims, X, mean, var, \
context, allow_broadcast_fastpath); \
}
CAFFE2_SPECIALIZED_MOMENTS(float)
CAFFE2_SPECIALIZED_MOMENTS(double)
#undef CAFFE2_SPECIALIZED_MOMENTS
} // namespace math
} // namespace caffe2