| // Implements the math functions for CPU. |
| // The implementation in this file allows us to route the underlying numerical |
| // computation library to different backends. Notably: |
| // (1) For all BLAS-related functions, one can explicitly request a BLAS backend |
// such as MKL, OpenBLAS, or ATLAS. To see the set of supported backends
| // currently provided, check //third_party/blas/. |
| // (2) If one chooses to link against MKL, we utilize MKL's vector math library |
| // (VML) for a few functions such as Exp and Log. |
// (3) Fallback implementations are provided in Eigen for cross-platform
// support. Since Eigen is a header-only library that supports a wide range
// of platforms, it allows Caffe2 to be ported quickly to platforms where
// BLAS may not be present.
| |
| #include "caffe2/utils/math.h" |
| |
| #include <algorithm> |
| #include <array> |
| #include <atomic> |
| #include <chrono> |
| #include <cmath> |
| #include <cstdlib> |
| #include <cstring> |
| #include <functional> |
| #include <limits> |
| #include <numeric> |
| #include <random> |
| #include <tuple> |
| #include <unordered_set> |
| #include <vector> |
| |
| #include "caffe2/core/context.h" |
| #include "caffe2/utils/cpu_neon.h" |
| #include "caffe2/utils/eigen_utils.h" |
| #include "caffe2/utils/fixed_divisor.h" |
| #include "caffe2/utils/math/broadcast.h" |
| |
| #include "Eigen/Core" |
| #include "Eigen/Dense" |
| |
| #ifdef CAFFE2_USE_MKL |
| #include <mkl.h> |
| #endif // CAFFE2_USE_MKL |
| |
| #ifdef CAFFE2_USE_HPTT |
| #include <hptt.h> |
| #endif // CAFFE2_USE_HPTT |
| |
| #if defined(_MSC_VER) |
| #include <process.h> |
| #endif |
| |
| namespace caffe2 { |
| |
| namespace math { |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // BLAS alternatives. |
| // Depending on whether we have specified an external BLAS library or not, we |
| // will delegate the Caffe math functions that are BLAS-related to either the |
| // CBLAS call or the Eigen implementation. |
| //////////////////////////////////////////////////////////////////////////////// |
| #ifdef CAFFE2_USE_EIGEN_FOR_BLAS |
| |
| // Caffe2 gemm provides a simpler interface to the gemm functions, with the |
| // limitation that the data has to be contiguous in memory. |
| // |
| // The gemm call implements the following operation: |
| // |
| // C = alpha * op(A) * op(B) + beta * C |
| // |
// where op(A) has size M x K, op(B) has size K x N, and C has size M x N. A,
// B, and C are matrices, and alpha and beta are scalars. Note that the most
// common use case of gemm involves setting alpha to 1 and beta to 0.
//
// op(A) and op(B) are the transformations applied to A and B before the
// matrix multiply: op(X) equals X when the corresponding argument (trans_A or
// trans_B) is CblasNoTrans, and X^T (the transpose) when it is CblasTrans.
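//
// A minimal usage sketch (illustrative only, not part of this file's API; it
// assumes a CPUContext named `ctx` and contiguous row-major buffers):
//
//   // C (2x4) = 1.0 * A (2x3) * B (3x4) + 0.0 * C
//   std::vector<float> A(2 * 3), B(3 * 4), C(2 * 4);
//   caffe2::math::Gemm<float, caffe2::CPUContext>(
//       CblasNoTrans, CblasNoTrans, 2, 4, 3, 1.0f, A.data(), B.data(), 0.0f,
//       C.data(), &ctx);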
| template <> |
| C10_EXPORT void Gemm<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CPUContext* context, |
| TensorProto::DataType math_type) { |
| auto C_mat = EigenMatrixMap<float>(C, N, M); |
| if (beta == 0) { |
| C_mat.setZero(); |
| } else { |
| C_mat *= beta; |
| } |
| switch (trans_A) { |
| case CblasNoTrans: { |
| switch (trans_B) { |
| case CblasNoTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenMatrixMap<float>(B, N, K) * |
| ConstEigenMatrixMap<float>(A, K, M)); |
| return; |
| case CblasTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenMatrixMap<float>(B, K, N).transpose() * |
| ConstEigenMatrixMap<float>(A, K, M)); |
| return; |
| default: |
| LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B"; |
| return; // The line above calls `abort()`. Should never reach here. |
| } |
| } |
| case CblasTrans: { |
| switch (trans_B) { |
| case CblasNoTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenMatrixMap<float>(B, N, K) * |
| ConstEigenMatrixMap<float>(A, M, K).transpose()); |
| return; |
| case CblasTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenMatrixMap<float>(B, K, N).transpose() * |
| ConstEigenMatrixMap<float>(A, M, K).transpose()); |
| return; |
| default: |
| LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B"; |
| return; // The line above calls `abort()`. Should never reach here. |
| } |
| } |
| default: |
| LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_A"; |
| } |
| } |
| |
| template <> |
| C10_EXPORT void GemmEx<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const int lda, |
| const float* B, |
| const int ldb, |
| const float beta, |
| float* C, |
| const int ldc, |
| CPUContext*) { |
| EigenOuterStridedMatrixMap<float> C_mat(C, N, M, EigenOuterStride(ldc)); |
| if (beta == 0) { |
| C_mat.setZero(); |
| } else { |
| C_mat *= beta; |
| } |
| switch (trans_A) { |
| case CblasNoTrans: { |
| switch (trans_B) { |
| case CblasNoTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenOuterStridedMatrixMap<float>( |
| B, N, K, EigenOuterStride(ldb)) * |
| ConstEigenOuterStridedMatrixMap<float>( |
| A, K, M, EigenOuterStride(lda))); |
| return; |
| case CblasTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenOuterStridedMatrixMap<float>( |
| B, K, N, EigenOuterStride(ldb)) |
| .transpose() * |
| ConstEigenOuterStridedMatrixMap<float>( |
| A, K, M, EigenOuterStride(lda))); |
| return; |
| default: |
| LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B"; |
| return; // The line above calls `abort()`. Should never reach here. |
| } |
| } |
| case CblasTrans: { |
| switch (trans_B) { |
| case CblasNoTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenOuterStridedMatrixMap<float>( |
| B, N, K, EigenOuterStride(ldb)) * |
| ConstEigenOuterStridedMatrixMap<float>( |
| A, M, K, EigenOuterStride(lda)) |
| .transpose()); |
| return; |
| case CblasTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenOuterStridedMatrixMap<float>( |
| B, K, N, EigenOuterStride(ldb)) |
| .transpose() * |
| ConstEigenOuterStridedMatrixMap<float>( |
| A, M, K, EigenOuterStride(lda)) |
| .transpose()); |
| return; |
| default: |
| LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B"; |
| return; // The line above calls `abort()`. Should never reach here. |
| } |
| } |
| default: |
| LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_A"; |
| } |
| } |
| |
| template <> |
| C10_EXPORT void Gemv<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const int M, |
| const int N, |
| const float alpha, |
| const float* A, |
| const float* x, |
| const float beta, |
| float* y, |
| CPUContext* context, |
| TensorProto::DataType math_type) { |
| EigenVectorMap<float> y_vec(y, trans_A == CblasNoTrans ? M : N); |
| if (beta == 0) { |
    // Caffe2 often lazily initializes output buffers, so y may contain NaNs.
    // When beta is 0 we therefore explicitly zero y instead of scaling it.
| y_vec.setZero(); |
| } else { |
| y_vec *= beta; |
| } |
| switch (trans_A) { |
| case CblasNoTrans: { |
| y_vec.noalias() += alpha * |
| (ConstEigenMatrixMap<float>(A, N, M).transpose() * |
| ConstEigenVectorMap<float>(x, N)); |
| return; |
| } |
| case CblasTrans: { |
| y_vec.noalias() += alpha * |
| (ConstEigenMatrixMap<float>(A, N, M) * |
| ConstEigenVectorMap<float>(x, M)); |
| return; |
| } |
| default: |
| LOG(FATAL) << "Gemv float found an unexpected CBLAS_TRANSPOSE input."; |
| } |
| } |
| |
| #define CAFFE2_SPECIALIZED_DOT(T) \ |
| template <> \ |
| C10_EXPORT void Dot<T, CPUContext>( \ |
| const int N, const T* a, const T* b, T* y, CPUContext* context) { \ |
| *y = ConstEigenVectorMap<T>(a, N).dot(ConstEigenVectorMap<T>(b, N)); \ |
| } |
| CAFFE2_SPECIALIZED_DOT(float) |
| #undef CAFFE2_SPECIALIZED_DOT |
| |
| #else // CAFFE2_USE_EIGEN_FOR_BLAS |
| |
| template <> |
| C10_EXPORT void Gemm<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CPUContext* /*context*/, |
| TensorProto::DataType /*math_type*/) { |
| // MKL expects ld? >= 1 |
| const int lda = std::max((trans_A == CblasNoTrans) ? K : M, 1); |
| const int ldb = std::max((trans_B == CblasNoTrans) ? N : K, 1); |
| cblas_sgemm( |
| CblasRowMajor, |
| trans_A, |
| trans_B, |
| M, |
| N, |
| K, |
| alpha, |
| A, |
| lda, |
| B, |
| ldb, |
| beta, |
| C, |
| N); |
| } |
| |
| template <> |
| C10_EXPORT void GemmEx<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const int lda, |
| const float* B, |
| const int ldb, |
| const float beta, |
| float* C, |
| const int ldc, |
| CPUContext* /*context*/) { |
| cblas_sgemm( |
| CblasRowMajor, |
| trans_A, |
| trans_B, |
| M, |
| N, |
| K, |
| alpha, |
| A, |
| lda, |
| B, |
| ldb, |
| beta, |
| C, |
| ldc); |
| } |
| |
| template <> |
| C10_EXPORT void Gemv<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const int M, |
| const int N, |
| const float alpha, |
| const float* A, |
| const float* x, |
| const float beta, |
| float* y, |
| CPUContext* /*context*/, |
| TensorProto::DataType /*math_type*/) { |
| cblas_sgemv(CblasRowMajor, trans_A, M, N, alpha, A, N, x, 1, beta, y, 1); |
| } |
| |
| #define CAFFE2_SPECIALIZED_DOT(T, prefix) \ |
| template <> \ |
| C10_EXPORT void Dot<T, CPUContext>( \ |
| const int N, const T* a, const T* b, T* y, CPUContext*) { \ |
| *y = cblas_##prefix##dot(N, a, 1, b, 1); \ |
| } |
| CAFFE2_SPECIALIZED_DOT(float, s) |
| #undef CAFFE2_SPECIALIZED_DOT |
| |
| #endif // CAFFE2_USE_EIGEN_FOR_BLAS |
| |
| template <> |
| C10_EXPORT void GemmBatched<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float** A, |
| const float** B, |
| const float beta, |
| float** C, |
| CPUContext* context, |
| TensorProto::DataType /* math_type */) { |
| #ifdef CAFFE2_USE_MKL |
| (void)context; |
| // MKL expects ld? >= 1 |
| const int lda = std::max((trans_A == CblasNoTrans) ? K : M, 1); |
| const int ldb = std::max((trans_B == CblasNoTrans) ? N : K, 1); |
| const int ldc = std::max(N, 1); |
| cblas_sgemm_batch( |
| CblasRowMajor, |
| &trans_A, |
| &trans_B, |
| &M, |
| &N, |
| &K, |
| &alpha, |
| A, |
| &lda, |
| B, |
| &ldb, |
| &beta, |
| C, |
| &ldc, |
| 1, |
| &batch_size); |
| #else // CAFFE2_USE_MKL |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| math::Gemm<float, CPUContext>( |
| trans_A, trans_B, M, N, K, alpha, A[i], B[i], beta, C[i], context); |
| } |
| #endif // CAFFE2_USE_MKL |
| } |
| |
| template <> |
| C10_EXPORT void GemmStridedBatched<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const int A_stride, |
| const float* B, |
| const int B_stride, |
| const float beta, |
| float* C, |
| const int C_stride, |
| CPUContext* context, |
| TensorProto::DataType /* math_type */) { |
| #ifdef CAFFE2_USE_MKL |
| (void)context; |
| // MKL expects ld? >= 1 |
| const int lda = std::max((trans_A == CblasNoTrans) ? K : M, 1); |
| const int ldb = std::max((trans_B == CblasNoTrans) ? N : K, 1); |
| const int ldc = std::max(N, 1); |
| std::vector<const float*> A_array(batch_size); |
| std::vector<const float*> B_array(batch_size); |
| std::vector<float*> C_array(batch_size); |
| for (int i = 0; i < batch_size; ++i) { |
| A_array[i] = A + i * A_stride; |
| B_array[i] = B + i * B_stride; |
| C_array[i] = C + i * C_stride; |
| } |
| cblas_sgemm_batch( |
| CblasRowMajor, |
| &trans_A, |
| &trans_B, |
| &M, |
| &N, |
| &K, |
| &alpha, |
| A_array.data(), |
| &lda, |
| B_array.data(), |
| &ldb, |
| &beta, |
| C_array.data(), |
| &ldc, |
| 1, |
| &batch_size); |
| #else // CAFFE2_USE_MKL |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| math::Gemm<float, CPUContext>( |
| trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context); |
| A += A_stride; |
| B += B_stride; |
| C += C_stride; |
| } |
| #endif |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // Common math functions being used in Caffe that do not have a BLAS or MKL |
| // equivalent. For all these functions, we will simply implement them either via |
| // Eigen or via custom code. |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| namespace { |
| |
| template <typename T> |
| void BroadcastImpl( |
| const int X_ndim, |
| const int* X_dims, |
| const int Y_ndim, |
| const int* Y_dims, |
| const T alpha, |
| const T* X, |
| T* Y, |
| CPUContext* context, |
| bool allow_broadcast_fastpath) { |
| CAFFE_ENFORCE_LE(X_ndim, Y_ndim); |
| std::vector<int> X_dims_vector(Y_ndim); |
| const int d = Y_ndim - X_ndim; |
| std::fill(X_dims_vector.begin(), X_dims_vector.begin() + d, 1); |
| for (int i = d; i < Y_ndim; ++i) { |
| CAFFE_ENFORCE(X_dims[i - d] == 1 || X_dims[i - d] == Y_dims[i]); |
| X_dims_vector[i] = X_dims[i - d]; |
| } |
| X_dims = X_dims_vector.data(); |
| CAFFE_ENFORCE_EQ(X_dims_vector.size(), Y_ndim); |
| const int X_size = |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::accumulate(X_dims, X_dims + Y_ndim, 1, std::multiplies<int>()); |
| const int Y_size = |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::accumulate(Y_dims, Y_dims + Y_ndim, 1, std::multiplies<int>()); |
| if (allow_broadcast_fastpath && can_use_broadcast_fastpath(Y_ndim, X_dims)) { |
| int X_index = 0; |
| for (int Y_index = 0; Y_index < Y_size; ++Y_index) { |
| Y[Y_index] = X[X_index]; |
| X_index++; |
| if (X_index >= X_size) { |
| X_index = 0; |
| } |
| } |
| } else { |
| std::vector<int> index(Y_ndim, 0); |
| for (int Y_index = 0; Y_index < Y_size; ++Y_index) { |
| const int X_index = utils::GetIndexFromDims(Y_ndim, X_dims, index.data()); |
| Y[Y_index] = X[X_index]; |
| utils::IncreaseIndexInDims(Y_ndim, Y_dims, index.data()); |
| } |
| } |
| Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); |
| } |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_BROADCAST(T) \ |
| template <> \ |
| C10_EXPORT void Broadcast<T, CPUContext>( \ |
| const int X_ndim, \ |
| const int* X_dims, \ |
| const int Y_ndim, \ |
| const int* Y_dims, \ |
| const T alpha, \ |
| const T* X, \ |
| T* Y, \ |
| CPUContext* context, \ |
| bool allow_broadcast_fastpath) { \ |
| BroadcastImpl<T>(X_ndim, X_dims, Y_ndim, Y_dims, alpha, X, Y, \ |
| context, allow_broadcast_fastpath); \ |
| } |
| CAFFE2_SPECIALIZED_BROADCAST(std::int32_t) |
| CAFFE2_SPECIALIZED_BROADCAST(std::int64_t) |
| CAFFE2_SPECIALIZED_BROADCAST(float) |
| CAFFE2_SPECIALIZED_BROADCAST(double) |
| #undef CAFFE2_SPECIALIZED_BROADCAST |
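
// A minimal Broadcast usage sketch (illustrative only; assumes a CPUContext
// `ctx` and preallocated buffers). X with shape {3, 1} is broadcast into Y
// with shape {2, 3, 4}: each X value is replicated along the new leading axis
// and the size-1 axis, then scaled by alpha:
//
//   const int X_dims[] = {3, 1};
//   const int Y_dims[] = {2, 3, 4};
//   caffe2::math::Broadcast<float, caffe2::CPUContext>(
//       2, X_dims, 3, Y_dims, 1.0f, X, Y, &ctx,
//       /*allow_broadcast_fastpath=*/false);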
| |
| #define CAFFE2_SPECIALIZED_INV_STD(T) \ |
| template <> \ |
| void InvStd<T, CPUContext>( \ |
| const int N, \ |
| const T epsilon, \ |
| const T* var, \ |
| T* inv_std, \ |
| CPUContext* context) { \ |
| EigenVectorArrayMap<T>(inv_std, N) = \ |
| (ConstEigenVectorArrayMap<T>(var, N) + epsilon).rsqrt(); \ |
| } |
| CAFFE2_SPECIALIZED_INV_STD(float) |
| #undef CAFFE2_SPECIALIZED_INV_STD |
| |
| #define CAFFE2_SPECIALIZED_ROWWISEMAX(T) \ |
| template <> \ |
| C10_EXPORT void RowwiseMax<T, CPUContext>( \ |
| const int N, const int D, const T* x, T* y, CPUContext*) { \ |
| EigenVectorMap<T>(y, N) = \ |
| ConstEigenMatrixMap<T>(x, D, N).colwise().maxCoeff(); \ |
| } |
| CAFFE2_SPECIALIZED_ROWWISEMAX(float) |
| #undef CAFFE2_SPECIALIZED_ROWWISEMAX |
| |
| #define CAFFE2_SPECIALIZED_COLWISEMAX(T) \ |
| template <> \ |
| C10_EXPORT void ColwiseMax<T, CPUContext>( \ |
| const int N, const int D, const T* x, T* y, CPUContext*) { \ |
| EigenVectorMap<T>(y, D) = \ |
| ConstEigenMatrixMap<T>(x, D, N).rowwise().maxCoeff(); \ |
| } |
| CAFFE2_SPECIALIZED_COLWISEMAX(float) |
| #undef CAFFE2_SPECIALIZED_COLWISEMAX |
| |
| #define CAFFE2_SPECIALIZED_MAXIMUM(T) \ |
| template <> \ |
| C10_EXPORT void Maximum<T, CPUContext>( \ |
| const int N, const float alpha, const T* x, T* y, CPUContext* context) { \ |
| std::transform( \ |
| x, x + N, y, [&alpha](const T& x_i) { return std::max(x_i, alpha); }); \ |
| } |
| CAFFE2_SPECIALIZED_MAXIMUM(float) |
| #undef CAFFE2_SPECIALIZED_MAXIMUM |
| |
// The implementations below use Eigen, which is column-major, so note the
// row/column swap relative to the row-major interface.
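//
// For example, with the functions generated below,
// RowwiseAdd<float, CPUContext, true>(rows, cols, A, B, C) computes
// C(i, j) = A(j) + B(i, j), broadcasting the length-`cols` vector A across
// rows, while ColwiseAdd broadcasts a length-`rows` vector across columns,
// i.e. C(i, j) = A(i) + B(i, j). The Eigen expressions operate on the
// transposed, column-major view of the same row-major memory.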
| |
| #define DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \ |
| template <> \ |
| C10_EXPORT void Rowwise##Func<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| if (C == B) { \ |
| EigenArrayMap<T>(C, cols, rows).colwise() expr## = \ |
| ConstEigenVectorArrayMap<T>(A, cols); \ |
| } else { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| ConstEigenArrayMap<T>(B, cols, rows) \ |
| .colwise() expr ConstEigenVectorArrayMap<T>(A, cols); \ |
| } \ |
| } \ |
| template <> \ |
| C10_EXPORT void Colwise##Func<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| if (C == B) { \ |
| EigenArrayMap<T>(C, cols, rows).rowwise() expr## = \ |
| ConstEigenVectorArrayMap<T>(A, rows).transpose(); \ |
| } else { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| ConstEigenArrayMap<T>(B, cols, rows) \ |
| .rowwise() expr ConstEigenVectorArrayMap<T>(A, rows) \ |
| .transpose(); \ |
| } \ |
| } |
| |
| #define DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) \ |
| template <> \ |
| C10_EXPORT void Rowwise##Func<T, CPUContext, false>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| if (C == A) { \ |
| EigenArrayMap<T>(C, cols, rows).colwise() expr## = \ |
| ConstEigenVectorArrayMap<T>(B, cols); \ |
| } else { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| ConstEigenArrayMap<T>(A, cols, rows) \ |
| .colwise() expr ConstEigenVectorArrayMap<T>(B, cols); \ |
| } \ |
| } \ |
| template <> \ |
| C10_EXPORT void Colwise##Func<T, CPUContext, false>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| if (C == A) { \ |
| EigenArrayMap<T>(C, cols, rows).rowwise() expr## = \ |
| ConstEigenVectorArrayMap<T>(B, rows).transpose(); \ |
| } else { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| ConstEigenArrayMap<T>(A, cols, rows) \ |
| .rowwise() expr ConstEigenVectorArrayMap<T>(B, rows) \ |
| .transpose(); \ |
| } \ |
| } |
| |
| #define DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(T, Func, expr) \ |
| DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \ |
| DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) |
| |
| #define DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Func, expr) \ |
| DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(float, Func, expr) \ |
| DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(double, Func, expr) \ |
| DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, Func, expr) \ |
| DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, Func, expr) |
| |
| DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Add, +) |
| DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Mul, *) |
| |
| #undef DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION |
| #undef DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION |
| |
| #define DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void RowwiseSub<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| (-ConstEigenArrayMap<T>(B, cols, rows)).colwise() + \ |
| ConstEigenVectorArrayMap<T>(A, cols); \ |
| } \ |
| template <> \ |
| C10_EXPORT void ColwiseSub<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| (-ConstEigenArrayMap<T>(B, cols, rows)).rowwise() + \ |
| ConstEigenVectorArrayMap<T>(A, rows).transpose(); \ |
| } \ |
| DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Sub, -) |
| |
| DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(float) |
| DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(double) |
| DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int32_t) |
| DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int64_t) |
| |
| #undef DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION |
| |
| #define DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void RowwiseDiv<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| ConstEigenArrayMap<T>(B, cols, rows).inverse().colwise() * \ |
| ConstEigenVectorArrayMap<T>(A, cols); \ |
| } \ |
| template <> \ |
| C10_EXPORT void ColwiseDiv<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| ConstEigenArrayMap<T>(B, cols, rows).inverse().rowwise() * \ |
| ConstEigenVectorArrayMap<T>(A, rows).transpose(); \ |
| } \ |
| DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Div, /) |
| |
| DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(float) |
| DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(double) |
| DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int32_t, Div, /) |
| DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int64_t, Div, /) |
| |
| #undef DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION |
| |
| #undef DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION |
| #undef DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION |
| |
| template <> |
| C10_EXPORT void Not<bool, CPUContext>( |
| const int N, |
| const bool* x, |
| bool* y, |
| CPUContext* /*context*/) { |
| for (int i = 0; i < N; ++i) { |
| y[i] = !x[i]; |
| } |
| } |
| |
| #undef C10_DEFINE_BINARY_OP |
| #undef CAFFE2_INSTANTIATE_BINARY_OP |
| |
| #define CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(T) \ |
| template <> \ |
| C10_EXPORT void AddStripedBatch( \ |
| const int N, \ |
| const T* first, \ |
| T* y, \ |
| const int stripe, \ |
| const int batch, \ |
| CPUContext* context) { \ |
| for (int j = 0; j < batch; j++) { \ |
| Add<T, CPUContext>(N, first + j * stripe, y, y, context); \ |
| } \ |
| } |
| |
| CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float); |
| #undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH |
| |
| namespace { |
| |
| template <typename TIn, typename TOut, class BinaryOperator, bool kBroadcast1st> |
| void RowwiseBinaryOp( |
| const int rows, |
| const int cols, |
| const BinaryOperator& op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| for (int i = 0; i < rows; ++i) { |
| for (int j = 0; j < cols; ++j) { |
| const int C_index = i * cols + j; |
| const int A_index = kBroadcast1st ? j : C_index; |
| const int B_index = kBroadcast1st ? C_index : j; |
| C[C_index] = op(A[A_index], B[B_index]); |
| } |
| } |
| } |
| |
| template <typename TIn, typename TOut, class BinaryOperator, bool kBroadcast1st> |
| void ColwiseBinaryOp( |
| const int rows, |
| const int cols, |
| const BinaryOperator& op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| for (int i = 0; i < rows; ++i) { |
| for (int j = 0; j < cols; ++j) { |
| const int C_index = i * cols + j; |
| const int A_index = kBroadcast1st ? i : C_index; |
| const int B_index = kBroadcast1st ? C_index : i; |
| C[C_index] = op(A[A_index], B[B_index]); |
| } |
| } |
| } |
| |
| template <typename TIn, typename TOut, class BinaryOperator> |
| void BroadcastBinaryOpImpl( |
| const int ndim, |
| const int* A_dims, |
| const int* B_dims, |
| const int* C_dims, |
| const BinaryOperator& op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| std::vector<int> index(ndim, 0); |
| const int C_size = |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::accumulate(C_dims, C_dims + ndim, 1, std::multiplies<int>()); |
| for (int C_index = 0; C_index < C_size; ++C_index) { |
| const int A_index = utils::GetIndexFromDims(ndim, A_dims, index.data()); |
| const int B_index = utils::GetIndexFromDims(ndim, B_dims, index.data()); |
| C[C_index] = op(A[A_index], B[B_index]); |
| utils::IncreaseIndexInDims(ndim, C_dims, index.data()); |
| } |
| } |
| |
| } // namespace |
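
// A small worked example of the generic path above (illustrative): with
// ndim = 2, A_dims = {3, 1}, B_dims = {1, 4}, and C_dims = {3, 4}, the loop in
// BroadcastBinaryOpImpl visits C in row-major order; for the multi-index
// (i, j), GetIndexFromDims maps it onto A as (i, 0) and onto B as (0, j), so
// C(i, j) = op(A(i), B(j)).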
| |
| #define DELEGATE_2D_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ |
| template <> \ |
| C10_EXPORT void Rowwise##Func<TIn, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CPUContext*) { \ |
| RowwiseBinaryOp<TIn, TOut, Op<TIn>, true>(rows, cols, Op<TIn>(), A, B, C); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Rowwise##Func<TIn, CPUContext, false>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CPUContext*) { \ |
| RowwiseBinaryOp<TIn, TOut, Op<TIn>, false>( \ |
| rows, cols, Op<TIn>(), A, B, C); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Colwise##Func<TIn, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CPUContext*) { \ |
| ColwiseBinaryOp<TIn, TOut, Op<TIn>, true>(rows, cols, Op<TIn>(), A, B, C); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Colwise##Func<TIn, CPUContext, false>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CPUContext*) { \ |
| ColwiseBinaryOp<TIn, TOut, Op<TIn>, false>( \ |
| rows, cols, Op<TIn>(), A, B, C); \ |
| } |
| |
| #define DEFINE_2D_COMPARE_FUNCTION(Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) |
| |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_2D_COMPARE_FUNCTION(EQ, std::equal_to) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_2D_COMPARE_FUNCTION(NE, std::not_equal_to) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_2D_COMPARE_FUNCTION(LT, std::less) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_2D_COMPARE_FUNCTION(LE, std::less_equal) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_2D_COMPARE_FUNCTION(GT, std::greater) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_2D_COMPARE_FUNCTION(GE, std::greater_equal) |
| |
| #undef DEFINE_2D_COMPARE_FUNCTION |
| |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor) |
| |
| #define DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) |
| |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) |
| |
| #undef DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION |
| |
| #undef DELEGATE_2D_BROADCAST_BINARY_FUNCTION |
| |
| #define DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void RowwiseDiv<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| RowwiseBinaryOp<T, T, std::divides<T>, true>( \ |
| rows, cols, std::divides<T>(), A, B, C); \ |
| } \ |
| template <> \ |
| C10_EXPORT void ColwiseDiv<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| ColwiseBinaryOp<T, T, std::divides<T>, true>( \ |
| rows, cols, std::divides<T>(), A, B, C); \ |
| } |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int32_t) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int64_t) |
| #undef DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION |
| |
| #define DELEGATE_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ |
| template <> \ |
| C10_EXPORT void Func<TIn, CPUContext>( \ |
| const int A_ndim, \ |
| const int* A_dims, \ |
| const int B_ndim, \ |
| const int* B_dims, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CPUContext* context) { \ |
| const int ndim = std::max(A_ndim, B_ndim); \ |
| std::vector<int> A_dims_array(ndim); \ |
| std::vector<int> B_dims_array(ndim); \ |
| std::vector<int> C_dims_array(ndim); \ |
| utils::ComputeBroadcastBinaryOpDims( \ |
| A_ndim, \ |
| A_dims, \ |
| B_ndim, \ |
| B_dims, \ |
| A_dims_array.data(), \ |
| B_dims_array.data(), \ |
| C_dims_array.data()); \ |
| if (A_dims_array == B_dims_array) { \ |
| const int size = std::accumulate( \ |
| C_dims_array.cbegin(), \ |
| C_dims_array.cend(), \ |
| 1, \ |
| std::multiplies<int>()); \ |
| Func<TIn, CPUContext>(size, A, B, C, context); \ |
| return; \ |
| } \ |
| int rows; \ |
| int cols; \ |
| bool broadcast_1st; \ |
| if (utils::IsRowwiseBroadcastBinaryOp( \ |
| ndim, \ |
| A_dims_array.data(), \ |
| B_dims_array.data(), \ |
| &rows, \ |
| &cols, \ |
| &broadcast_1st)) { \ |
| if (broadcast_1st) { \ |
| Rowwise##Func<TIn, CPUContext, true>(rows, cols, A, B, C, context); \ |
| } else { \ |
| Rowwise##Func<TIn, CPUContext, false>(rows, cols, A, B, C, context); \ |
| } \ |
| return; \ |
| } \ |
| if (utils::IsColwiseBroadcastBinaryOp( \ |
| ndim, \ |
| A_dims_array.data(), \ |
| B_dims_array.data(), \ |
| &rows, \ |
| &cols, \ |
| &broadcast_1st)) { \ |
| if (broadcast_1st) { \ |
| Colwise##Func<TIn, CPUContext, true>(rows, cols, A, B, C, context); \ |
| } else { \ |
| Colwise##Func<TIn, CPUContext, false>(rows, cols, A, B, C, context); \ |
| } \ |
| return; \ |
| } \ |
| int pre; \ |
| int mid; \ |
| int nxt; \ |
| if (utils::IsBothEndsBroadcastBinaryOp( \ |
| ndim, \ |
| A_dims_array.data(), \ |
| B_dims_array.data(), \ |
| &pre, \ |
| &mid, \ |
| &nxt, \ |
| &broadcast_1st)) { \ |
| const int stride = mid * nxt; \ |
| for (int i = 0; i < pre; ++i) { \ |
| if (broadcast_1st) { \ |
| Colwise##Func<TIn, CPUContext, true>( \ |
| mid, nxt, A, B + i * stride, C + i * stride, context); \ |
| } else { \ |
| Colwise##Func<TIn, CPUContext, false>( \ |
| mid, nxt, A + i * stride, B, C + i * stride, context); \ |
| } \ |
| } \ |
| return; \ |
| } \ |
| BroadcastBinaryOpImpl( \ |
| ndim, \ |
| A_dims_array.data(), \ |
| B_dims_array.data(), \ |
| C_dims_array.data(), \ |
| Op<TIn>(), \ |
| A, \ |
| B, \ |
| C); \ |
| } |
| |
| #define DEFINE_BROADCAST_COMPARE_FUNCTION(Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) |
| |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_BROADCAST_COMPARE_FUNCTION(EQ, std::equal_to) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_BROADCAST_COMPARE_FUNCTION(NE, std::not_equal_to) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_BROADCAST_COMPARE_FUNCTION(LT, std::less) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_BROADCAST_COMPARE_FUNCTION(LE, std::less_equal) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_BROADCAST_COMPARE_FUNCTION(GT, std::greater) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_BROADCAST_COMPARE_FUNCTION(GE, std::greater_equal) |
| |
| #undef DEFINE_BROADCAST_COMPARE_FUNCTION |
| |
| #define DEFINE_BROADCAST_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(float, float, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(double, double, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) |
| |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_BROADCAST_BINARY_FUNCTION(Add, std::plus) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_BROADCAST_BINARY_FUNCTION(Sub, std::minus) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_BROADCAST_BINARY_FUNCTION(Mul, std::multiplies) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_BROADCAST_BINARY_FUNCTION(Div, std::divides) |
| |
| #undef DEFINE_BROADCAST_BINARY_FUNCTION |
| |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor) |
| |
| #define DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) |
| |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) |
| |
#undef DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION
| |
| #undef DELEGATE_BROADCAST_BINARY_FUNCTION |
| |
| namespace { |
| // incrementIfNotMax increments the number if the value is not max for that |
| // datatype. This ensures that the value never overflows. |
| template <typename T> |
| inline T incrementIfNotMax(T a) { |
| if (a == std::numeric_limits<T>::max()) { |
| return a; |
| } |
| return a + 1; |
| } |
| } // namespace |
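
// Note on the integer specializations below: at::uniform_int_from_to_distribution
// takes (range, base) and samples from [base, base + range), so passing
// incrementIfNotMax(b - a) as the range makes RandUniform draw from the closed
// interval [a, b], saturating instead of overflowing when b - a is already the
// maximum representable value. A hypothetical call (assumes a CPUContext `ctx`):
//
//   std::vector<int32_t> r(16);
//   caffe2::math::RandUniform<int32_t, caffe2::CPUContext>(
//       r.size(), -5, 5, r.data(), &ctx);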
| |
| #define CAFFE2_RAND_UNIFORM_REAL(T) \ |
| template <> \ |
| C10_EXPORT void RandUniform<T, CPUContext>( \ |
| const size_t n, const T a, const T b, T* r, CPUContext* context) { \ |
| at::uniform_real_distribution<T> distribution(a, b); \ |
| for (size_t i = 0; i < n; ++i) { \ |
| r[i] = distribution(context->RandGenerator()); \ |
| } \ |
| } |
| CAFFE2_RAND_UNIFORM_REAL(float); |
| CAFFE2_RAND_UNIFORM_REAL(double); |
| #undef CAFFE2_RAND_UNIFORM_REAL |
| |
| #define CAFFE2_RAND_UNIFORM_CHAR(T) \ |
| template <> \ |
| C10_EXPORT void RandUniform<T, CPUContext>( \ |
| const size_t n, const T a, const T b, T* r, CPUContext* context) { \ |
| at::uniform_int_from_to_distribution<short> distribution( \ |
| incrementIfNotMax(b - a), a); \ |
| for (size_t i = 0; i < n; ++i) { \ |
| r[i] = static_cast<T>(distribution(context->RandGenerator())); \ |
| } \ |
| } |
| CAFFE2_RAND_UNIFORM_CHAR(int8_t); |
| CAFFE2_RAND_UNIFORM_CHAR(uint8_t); |
| #undef CAFFE2_RAND_UNIFORM_CHAR |
| |
| #define CAFFE2_RAND_UNIFORM_INT(T) \ |
| template <> \ |
| C10_EXPORT void RandUniform<T, CPUContext>( \ |
| const size_t n, const T a, const T b, T* r, CPUContext* context) { \ |
| at::uniform_int_from_to_distribution<T> distribution( \ |
| incrementIfNotMax( \ |
| static_cast<uint64_t>(b) - static_cast<uint64_t>(a)), \ |
| a); \ |
| for (size_t i = 0; i < n; ++i) { \ |
| r[i] = distribution(context->RandGenerator()); \ |
| } \ |
| } |
| |
| CAFFE2_RAND_UNIFORM_INT(int16_t); |
| CAFFE2_RAND_UNIFORM_INT(int32_t); |
| CAFFE2_RAND_UNIFORM_INT(int64_t); |
| CAFFE2_RAND_UNIFORM_INT(uint16_t); |
| CAFFE2_RAND_UNIFORM_INT(uint32_t); |
| CAFFE2_RAND_UNIFORM_INT(uint64_t); |
| #undef CAFFE2_RAND_UNIFORM_INT |
| |
// The generated values are not uniformly distributed between a and b; a
// normal distribution is used to draw numbers with mean = sum / n.
// Ideally the algorithm would generate n numbers between 0 and 1, sum them up
// as scaled_sum, and use sum / scaled_sum to rescale the values into [a, b].
// That algorithm is non-trivial because the adjustment would differ for each
// value.
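//
// A hypothetical RandFixedSum call (illustrative only; assumes a CPUContext
// `ctx`): fill 10 floats, each in [0, 5], that sum to exactly 20:
//
//   std::vector<float> r(10);
//   caffe2::math::RandFixedSum<float, caffe2::CPUContext>(
//       r.size(), 0.0f, 5.0f, 20.0f, r.data(), &ctx);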
| #define CAFFE2_RAND_FIXED_SUM(T) \ |
| template <> \ |
| C10_EXPORT void RandFixedSum<T, CPUContext>( \ |
| const size_t n, \ |
| const T a, \ |
| const T b, \ |
| const T sum, \ |
| T* r, \ |
| CPUContext* context) { \ |
| CAFFE_ENFORCE_GE(a, 0); \ |
| CAFFE_ENFORCE_GE(sum / (double)n, a); \ |
| CAFFE_ENFORCE_LE(sum / (double)n, b); \ |
| T current_sum = 0; \ |
| T remaining_sum = sum; \ |
| for (size_t i = 0; i < n; ++i) { \ |
| auto remaining_numbers = n - 1 - i; \ |
| double mean = (sum - current_sum) / (remaining_numbers + 1); \ |
| double stdev = std::min(mean - a, b - mean); \ |
| at::normal_distribution<double> distribution{mean, stdev / 4.0}; \ |
| T value, remaining_sum_test; \ |
| do { \ |
| value = distribution(context->RandGenerator()); \ |
| remaining_sum_test = remaining_sum - value; \ |
| } while (value < a || remaining_sum_test < a * remaining_numbers || \ |
| value > b || remaining_sum_test > b * remaining_numbers); \ |
| r[i] = value; \ |
| CAFFE_ENFORCE(a <= value && value <= b); \ |
| current_sum += value; \ |
| remaining_sum -= value; \ |
| CAFFE_ENFORCE_GE(remaining_sum, a* remaining_numbers); \ |
| CAFFE_ENFORCE_LE(remaining_sum, b* remaining_numbers); \ |
| } \ |
| r[n - 1] += remaining_sum; \ |
| current_sum += remaining_sum; \ |
| CAFFE_ENFORCE(a <= r[n - 1] && r[n - 1] <= b); \ |
| CAFFE_ENFORCE_EQ(current_sum, sum); \ |
| } |
| CAFFE2_RAND_FIXED_SUM(float); |
| CAFFE2_RAND_FIXED_SUM(double); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare,bugprone-integer-division) |
| CAFFE2_RAND_FIXED_SUM(int8_t); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare,bugprone-integer-division) |
| CAFFE2_RAND_FIXED_SUM(int16_t); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare,bugprone-integer-division) |
| CAFFE2_RAND_FIXED_SUM(int32_t); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare,bugprone-integer-division) |
| CAFFE2_RAND_FIXED_SUM(int64_t); |
| // NOLINTNEXTLINE(bugprone-integer-division) |
| CAFFE2_RAND_FIXED_SUM(uint8_t); |
| // NOLINTNEXTLINE(bugprone-integer-division) |
| CAFFE2_RAND_FIXED_SUM(uint16_t); |
| // NOLINTNEXTLINE(bugprone-integer-division) |
| CAFFE2_RAND_FIXED_SUM(uint32_t); |
| // NOLINTNEXTLINE(bugprone-integer-division) |
| CAFFE2_RAND_FIXED_SUM(uint64_t); |
| #undef CAFFE2_RAND_FIXED_SUM |
| |
| template <class Type, class Val_t, class Ind_t, class Context_t, bool cdf_app> |
| Ind_t generate_stack_distance( |
| std::vector<Ind_t>& cum_val, |
| std::vector<Val_t>& cum_dis, |
| std::vector<Ind_t>& cum_map, |
| Ind_t max_i, |
| Ind_t i, |
| Context_t* context) { |
  /* Description:
     Inverse transform sampling is used to generate values of a random variable
     X described by the cumulative distribution F (cum_val, cum_dis). Note that
     we may choose to use the inverse map of F (cum_map) as an approximation to
     avoid searching. The probability is also rescaled so that the generated
     values stay within max_i references, because a stack distance cannot be
     greater than the number of already generated references (max_i).
  */
| Ind_t j, k, n; |
| Val_t u, f, fi; |
| |
  // 1. generate a random number u in [0,1] from a uniform distribution U
| math::RandUniform<Val_t, Context_t>(1, 0, 1, &u, context); |
| |
| // scale the random number u to be within range [0,f(i)], if needed |
| if (i < max_i) { |
| // approach 2: allows gaps in the distribution |
| j = (std::upper_bound(cum_val.begin(), cum_val.end(), i) - |
| cum_val.begin()) - |
| 1; |
| fi = cum_dis[j]; |
| u *= fi; |
| } |
| // 2. compute the stack distance value of x, s.t. F(x)=u |
| // notice that the cumulative distribution F increases monotonically up to 1 |
| if (cdf_app) { |
| // look up cum_val corresponding to u <= cum_dis[j] |
| k = cum_map.size(); |
| n = (Ind_t)round(u * k); |
| j = cum_map[n]; |
| return cum_val[j]; |
| } else { |
| // iterate until you find the cum_val corresponding to u <= cum_dis[j] |
| for (j = 0; j < Ind_t(cum_dis.size()); j++) { |
| f = cum_dis[j]; |
| if (u <= f) { |
| return cum_val[j]; |
| } |
| } |
| return cum_val[j - 1]; |
| } |
| } |
| |
| template <class Type, class Val_t, class Ind_t, class Context_t, bool cdf_app> |
| C10_EXPORT void generate_trace_lru( |
| std::vector<Ind_t>& uni_ref, |
| std::vector<Ind_t>& cum_val, |
| std::vector<Val_t>& cum_dis, |
| std::vector<Ind_t>& cum_map, |
| Context_t* context, |
| Ind_t cache_line_size, |
| Ind_t n, |
| Type min, |
| Type max, |
| Type* syn_ref) { |
  /* Description:
     Generate a synthetic trace from a list of unique accesses uni_ref and the
     cumulative distribution of distances (cum_val, cum_dis) between them.
     Optionally, the cum_map approximation can be used to avoid searching.
  */
| Ind_t i, j, k, sd, line_ref, mem_ref, mem_ref_within_line; |
| Ind_t max_sd = cum_val.back(); |
| Ind_t l = uni_ref.size(); |
| |
| for (i = 0, j = 0; j < n; j++) { |
| // generate stack distance |
| sd = generate_stack_distance<Type, Val_t, Ind_t, Context_t, cdf_app>( |
| cum_val, cum_dis, cum_map, max_sd, i, context); |
| // fixed access within cache line |
| mem_ref_within_line = 0; |
| // random access within cache line |
| // Val_t r; |
| // math::RandUniform<Val_t, Context_t>(1, 0, 1, &r, context); |
| // mem_ref_within_line = floor(r*cache_line_size); |
| |
| // generate memory reference |
| if (sd == 0) { |
| k = 0; /// new reference /// |
| i++; |
| } else { |
| k = l - sd; /// existing reference /// |
| } |
| line_ref = uni_ref[k]; // pop k-th element |
| uni_ref.erase(uni_ref.begin() + k); |
| uni_ref.push_back(line_ref); // append it back |
| mem_ref = line_ref * cache_line_size + mem_ref_within_line; |
| /* |
| //debug prints |
| if ((mem_ref < min) || (mem_ref > max)) { |
| //printf("mem_ref[%d]=%d (%ld) \n",j,mem_ref,syn_ref[j]); |
| std::cout << "syn_ref[" << j << "]=" << (Type)mem_ref << " "; |
| std::cout << "(" << mem_ref << ") "; |
| std::cout << "[" << min << "," << max << "]" << std::endl; |
| int scanf_temp; |
| scanf("%d",&scanf_temp); |
| } |
| */ |
| |
| // patch mem_ref to be within range |
    // WARNING: this should not be needed if the instantiation type and
    // distribution choice are correct. It remedies a symptom of earlier
    // mistakes.
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| if (mem_ref < min) { |
| // NOLINTNEXTLINE(bugprone-signed-char-misuse) |
| mem_ref = min; |
| // std::cout << "clamping (min) mem_ref=" << mem_ref << std::endl; |
| } |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| if (mem_ref > max) { |
| // NOLINTNEXTLINE(bugprone-signed-char-misuse) |
| mem_ref = max; // mem_ref % max; |
| // std::cout << "clamping (max) mem_ref=" << mem_ref << std::endl; |
| } |
| |
| // save generated memory reference |
| syn_ref[j] = (Type)mem_ref; |
| } |
| } |
| |
// Generate n values from a synthetic data distribution, defined by unique
// accesses and the stack distances between them.
// WARNING: this can be created for all tables or per table; in the latter
// case we need to know the table id to sample from the right distribution.
| #define CAFFE2_RAND_SYNTHETIC_DATA(T) \ |
| template <> \ |
| C10_EXPORT void RandSyntheticData<T, CPUContext>( \ |
| const size_t n, const T a, const T b, T* r, CPUContext* context) { \ |
| /* unique memory references */ \ |
| std::vector<int> mem_ref = {1, 2, 3, 4, 5, 6}; \ |
| /* cumulative distribution of distances */ \ |
| std::vector<int> cum_val = {0, 1, 3, 4, 5}; \ |
| std::vector<double> cum_dis = {0.55, 0.64, 0.82, 0.91, 1.0}; \ |
| /* inverse map of cumulative distribution (for O(1) lookup) */ \ |
| /* std::vector<int> cum_map = {0, 0, 0, 0, 0, 1, 2, 2, 3, 4}; */ \ |
| int k = 10; /* 100; */ \ |
| std::vector<int> cum_map(k, 0); \ |
| for (int j = 0; j < cum_dis.size();) { \ |
| int sz = (int)round(cum_dis[j] * k); \ |
| for (int i = 0; i < sz; i++) { \ |
| cum_map[j + i] = j; \ |
| } \ |
| j += sz; \ |
| } \ |
| \ |
| /* code to generate the synthetic data from the above values */ \ |
| const int cache_line = 1; /* 64; */ \ |
| generate_trace_lru<T, double, int, CPUContext, false>( \ |
| mem_ref, cum_val, cum_dis, cum_map, context, cache_line, n, a, b, r); \ |
| } |
| |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| CAFFE2_RAND_SYNTHETIC_DATA(float); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| CAFFE2_RAND_SYNTHETIC_DATA(double); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| CAFFE2_RAND_SYNTHETIC_DATA(int8_t); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| CAFFE2_RAND_SYNTHETIC_DATA(int16_t); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| CAFFE2_RAND_SYNTHETIC_DATA(int32_t); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| CAFFE2_RAND_SYNTHETIC_DATA(int64_t); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| CAFFE2_RAND_SYNTHETIC_DATA(uint8_t); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| CAFFE2_RAND_SYNTHETIC_DATA(uint16_t); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| CAFFE2_RAND_SYNTHETIC_DATA(uint32_t); |
| // NOLINTNEXTLINE(clang-diagnostic-sign-compare) |
| CAFFE2_RAND_SYNTHETIC_DATA(uint64_t); |
| #undef CAFFE2_RAND_SYNTHETIC_DATA |
| |
| #define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \ |
| template <> \ |
| C10_EXPORT void RandUniformUnique<T, CPUContext>( \ |
| const size_t n, \ |
| const T a, \ |
| const T b, \ |
| T* r, \ |
| const size_t m, \ |
| const T* avoid, \ |
| CPUContext* context) { \ |
| CAFFE_ENFORCE_LE( \ |
| n, b - a - m + 1, "Cannot satisfy the unique requirement"); \ |
| std::unordered_set<T> avoid_set(n); \ |
| if (m) { \ |
| avoid_set.insert(avoid, avoid + m); \ |
| CAFFE_ENFORCE_EQ( \ |
          m, avoid_set.size(), "Avoided values should be unique");       \
| } \ |
| at::uniform_int_from_to_distribution<T> distribution( \ |
| incrementIfNotMax(b - a), a); \ |
| T v = 0; \ |
| for (size_t i = 0; i < n; ++i) { \ |
| do { \ |
| v = distribution(context->RandGenerator()); \ |
| } while (avoid_set.count(v)); \ |
| r[i] = v; \ |
| avoid_set.insert(v); \ |
| } \ |
| } |
| |
| CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int32_t); |
| CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int64_t); |
| #undef CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE |
| |
| template <> |
| C10_EXPORT void RandGaussian<float, CPUContext>( |
| const size_t n, |
| const float mean, |
| const float std, |
| float* r, |
| CPUContext* context) { |
| at::normal_distribution<float> distribution(mean, std); |
| for (size_t i = 0; i < n; ++i) { |
| r[i] = distribution(context->RandGenerator()); |
| } |
| } |
| |
| #define CAFFE2_SPECIALIZED_SUM(T) \ |
| template <> \ |
| C10_EXPORT void Sum<T, CPUContext>( \ |
| const int N, \ |
| const T* x, \ |
| T* y, \ |
| CPUContext* /* unused */, \ |
| Tensor* /* unused */) { \ |
| *y = ConstEigenVectorMap<T>(x, N).sum(); \ |
| } |
| |
| CAFFE2_SPECIALIZED_SUM(float); |
| CAFFE2_SPECIALIZED_SUM(int32_t); |
| CAFFE2_SPECIALIZED_SUM(int64_t); |
| |
| #undef CAFFE2_SPECIALIZED_SUM |
| |
| template <> |
| C10_EXPORT void SumSqr<float, CPUContext>( |
| const int N, |
| const float* x, |
| float* y, |
| CPUContext* /*context*/ /* unused */, |
| Tensor* /*scratch_ptr*/ /* unused */) { |
| *y = ConstEigenVectorMap<float>(x, N).squaredNorm(); |
| } |
| |
| template <> |
| C10_EXPORT void SumSqr<double, CPUContext>( |
| const int N, |
| const double* x, |
| double* y, |
| CPUContext* /*context*/ /* unused */, |
| Tensor* /*scratch_ptr*/ /* unused */) { |
| *y = ConstEigenVectorMap<double>(x, N).squaredNorm(); |
| } |
| |
| template <> |
| C10_EXPORT void Select<float, CPUContext>( |
| const int N, |
| const int D, |
| const float* x, |
| const int* idx, |
| float* y, |
| CPUContext* /*context*/) { |
| for (int i = 0; i < N; ++i) { |
| DCHECK_LT(idx[i], D); |
| y[i] = x[i * D + idx[i]]; |
| } |
| } |
| |
| template <> |
| C10_EXPORT void CopyMatrix<CPUContext>( |
| const size_t itemsize, |
| const int M, |
| const int N, |
| const void* A, |
| const int lda, |
| void* B, |
| const int ldb, |
| CPUContext* /*context*/, |
| TypeMeta::Copy copy) { |
| if (A == nullptr || B == nullptr) { |
| return; |
| } |
| if (lda == N && ldb == N) { |
| // can coalesce to a single memcpy of size M * N |
| if (copy) { |
| copy(static_cast<const char*>(A), static_cast<char*>(B), N * M); |
| } else { |
| memcpy( |
| static_cast<char*>(B), static_cast<const char*>(A), itemsize * N * M); |
| } |
| return; |
| } |
| |
| for (int i = 0; i < M; ++i) { |
| if (copy) { |
| copy( |
| static_cast<const char*>(A) + lda * i * itemsize, |
| static_cast<char*>(B) + ldb * i * itemsize, |
| N); |
| } else { |
| memcpy( |
| static_cast<char*>(B) + ldb * i * itemsize, |
| static_cast<const char*>(A) + lda * i * itemsize, |
| itemsize * N); |
| } |
| } |
| } |
| |
| #ifdef CAFFE2_USE_MKL |
| |
| #define DELEGATE_COPY_MATRIX_FUNCTION(T, Func) \ |
| template <> \ |
| C10_EXPORT void CopyMatrix<T, CPUContext>( \ |
| const int M, \ |
| const int N, \ |
| const T* A, \ |
| const int lda, \ |
| T* B, \ |
| const int ldb, \ |
| CPUContext* /* context */) { \ |
| Func('R', 'N', M, N, T(1), A, lda, B, ldb); \ |
| } \ |
| template <> \ |
| C10_EXPORT void CopyMatrix<T, CPUContext>( \ |
| const int M, \ |
| const int N, \ |
| const T* A, \ |
| const int A_outer_stride, \ |
| const int A_inner_stride, \ |
| T* B, \ |
| const int B_outer_stride, \ |
| const int B_inner_stride, \ |
| CPUContext* /* context */) { \ |
| Func##2( \ |
| 'R', \ |
| 'N', \ |
| M, \ |
| N, \ |
| T(1), \ |
| A, \ |
| A_outer_stride, \ |
| A_inner_stride, \ |
| B, \ |
| B_outer_stride, \ |
| B_inner_stride); \ |
| } |
| DELEGATE_COPY_MATRIX_FUNCTION(float, mkl_somatcopy) |
| DELEGATE_COPY_MATRIX_FUNCTION(double, mkl_domatcopy) |
| #undef DELEGATE_COPY_MATRIX_FUNCTION |
| |
| #endif // CAFFE2_USE_MKL |
| |
| #define CAFFE2_SPECIALIZED_COPY_MATRIX(T) \ |
| template <> \ |
| C10_EXPORT void CopyMatrix<T, CPUContext>( \ |
| const int M, \ |
| const int N, \ |
| const T* A, \ |
| const int lda, \ |
| T* B, \ |
| const int ldb, \ |
| CPUContext* /* context */) { \ |
| if (M == 0 || N == 0) { \ |
| return; \ |
| } \ |
| if (lda == N) { \ |
| if (ldb == N) { \ |
| std::memcpy(B, A, sizeof(T) * M * N); \ |
| } else { \ |
| EigenOuterStridedMatrixMap<T>(B, N, M, EigenOuterStride(ldb)) = \ |
| ConstEigenMatrixMap<T>(A, N, M); \ |
| } \ |
| } else { \ |
| if (ldb == N) { \ |
| EigenMatrixMap<T>(B, N, M) = ConstEigenOuterStridedMatrixMap<T>( \ |
| A, N, M, EigenOuterStride(lda)); \ |
| } else { \ |
| EigenOuterStridedMatrixMap<T>(B, N, M, EigenOuterStride(ldb)) = \ |
| ConstEigenOuterStridedMatrixMap<T>( \ |
| A, N, M, EigenOuterStride(lda)); \ |
| } \ |
| } \ |
| } \ |
| template <> \ |
| C10_EXPORT void CopyMatrix<T, CPUContext>( \ |
| const int M, \ |
| const int N, \ |
| const T* A, \ |
| const int A_outer_stride, \ |
| const int A_inner_stride, \ |
| T* B, \ |
| const int B_outer_stride, \ |
| const int B_inner_stride, \ |
| CPUContext* context) { \ |
| if (A_inner_stride == 1 && B_inner_stride == 1) { \ |
| CopyMatrix<T, CPUContext>( \ |
| M, N, A, A_outer_stride, B, B_outer_stride, context); \ |
| return; \ |
| } \ |
| EigenStridedMatrixMap<T>( \ |
| B, N, M, EigenStride(B_outer_stride, B_inner_stride)) = \ |
| ConstEigenStridedMatrixMap<T>( \ |
| A, N, M, EigenStride(A_outer_stride, A_inner_stride)); \ |
| } |
| |
| #ifndef CAFFE2_USE_MKL |
| CAFFE2_SPECIALIZED_COPY_MATRIX(float) |
| CAFFE2_SPECIALIZED_COPY_MATRIX(double) |
| #endif // CAFFE2_USE_MKL |
| |
| CAFFE2_SPECIALIZED_COPY_MATRIX(int) |
| CAFFE2_SPECIALIZED_COPY_MATRIX(int64_t) |
| CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint8_t) |
| CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint16_t) |
| |
#undef CAFFE2_SPECIALIZED_COPY_MATRIX
| |
| namespace { |
| |
| template <typename T> |
| void Im2ColZeroPaddingAndNoDilationNCHW( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int stride_h, |
| const int stride_w, |
| const T* img_data, |
| T* col_data, |
| CPUContext* context) { |
| const int output_h = (H - kernel_h) / stride_h + 1; |
| const int output_w = (W - kernel_w) / stride_w + 1; |
| const int output_size = output_h * output_w; |
| for (int c = 0; c < C; ++c) { |
| for (int kh = 0; kh < kernel_h; ++kh) { |
| for (int kw = 0; kw < kernel_w; ++kw) { |
| const T* src = img_data + kh * W + kw; |
| if (stride_w == 1) { |
| CopyMatrix<T, CPUContext>( |
| output_h, |
| output_w, |
| src, |
| stride_h * W, |
| col_data, |
| output_w, |
| context); |
| } else { |
| CopyMatrix<T, CPUContext>( |
| output_h, |
| output_w, |
| src, |
| stride_h * W, |
| stride_w, |
| col_data, |
| output_w, |
| 1, |
| context); |
| } |
| col_data += output_size; |
| } |
| } |
| img_data += H * W; |
| } |
| } |
| |
| template <typename T> |
| void Col2ImZeroPaddingAndNoDilationNCHW( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int stride_h, |
| const int stride_w, |
| const T* col_data, |
| T* img_data, |
| CPUContext* context) { |
| Set<T, CPUContext>(C * H * W, T(0), img_data, context); |
| const int output_h = (H - kernel_h) / stride_h + 1; |
| const int output_w = (W - kernel_w) / stride_w + 1; |
| const int output_size = output_h * output_w; |
| for (int c = 0; c < C; ++c) { |
| for (int kh = 0; kh < kernel_h; ++kh) { |
| for (int kw = 0; kw < kernel_w; ++kw) { |
| T* dst = img_data + kh * W + kw; |
| if (stride_w == 1) { |
| EigenOuterStridedArrayMap<T>( |
| dst, output_w, output_h, EigenOuterStride(stride_h * W)) += |
| ConstEigenArrayMap<T>(col_data, output_w, output_h); |
| } else { |
| EigenStridedArrayMap<T>( |
| dst, output_w, output_h, EigenStride(stride_h * W, stride_w)) += |
| ConstEigenArrayMap<T>(col_data, output_w, output_h); |
| } |
| col_data += output_size; |
| } |
| } |
| img_data += H * W; |
| } |
| } |
| |
| template <typename T> |
| void Im2ColZeroPaddingAndNoDilationNHWC( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int stride_h, |
| const int stride_w, |
| const T* img_data, |
| T* col_data, |
| CPUContext* context) { |
| const int output_h = (H - kernel_h) / stride_h + 1; |
| const int output_w = (W - kernel_w) / stride_w + 1; |
| const int kernel_size = kernel_h * kernel_w; |
| for (int yh = 0; yh < output_h; ++yh) { |
| for (int yw = 0; yw < output_w; ++yw) { |
| const T* src = img_data + (yh * stride_h * W + yw * stride_w) * C; |
| CopyMatrix<T, CPUContext>( |
| kernel_h, kernel_w * C, src, W * C, col_data, kernel_w * C, context); |
| col_data += kernel_size * C; |
| } |
| } |
| } |
| |
| template <typename T> |
| void Col2ImZeroPaddingAndNoDilationNHWC( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int stride_h, |
| const int stride_w, |
| const T* col_data, |
| T* img_data, |
| CPUContext* context) { |
| Set<T, CPUContext>(H * W * C, T(0), img_data, context); |
| const int output_h = (H - kernel_h) / stride_h + 1; |
| const int output_w = (W - kernel_w) / stride_w + 1; |
| const int kernel_size = kernel_h * kernel_w; |
| for (int yh = 0; yh < output_h; ++yh) { |
| for (int yw = 0; yw < output_w; ++yw) { |
| T* dst = img_data + (yh * stride_h * W + yw * stride_w) * C; |
| EigenOuterStridedArrayMap<T>( |
| dst, kernel_w * C, kernel_h, EigenOuterStride(W * C)) += |
| ConstEigenArrayMap<T>(col_data, kernel_w * C, kernel_h); |
| col_data += kernel_size * C; |
| } |
| } |
| } |
| |
| template <typename T, bool kCol2Im> |
| void Im2ColNdNCHWImpl( |
| const int N, |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* X_data, |
| float* Y_data) { |
| if (kCol2Im) { |
| std::memset(Y_data, 0, img_size * sizeof(float)); |
| } |
| const int outer_size = col_shape[0]; |
| const int inner_size = col_size / outer_size; |
| const int kernel_size = std::accumulate( |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| kernel_shape, kernel_shape + N, 1, std::multiplies<int>()); |
| std::vector<FixedDivisor<int>> kernel_shape_div(N); |
| for (int i = 0; i < N; ++i) { |
| kernel_shape_div[i] = FixedDivisor<int>(kernel_shape[i]); |
| } |
| std::vector<int> d_offset(N, 0); |
| std::vector<int> d_iter(N, 0); |
| for (int i = 0; i < outer_size; ++i) { |
| // Loop over spatial axes in reverse order to compute a per-axis offset. |
| int offset = i; |
| for (int d_i = N - 1; d_i >= 0; --d_i) { |
| kernel_shape_div[d_i].DivMod(offset, &offset, &d_offset[d_i]); |
| } |
| for (int j = 0; j < inner_size; ++j) { |
| // Loop over spatial axes in forward order to compute the indices in the |
| // image and column, and whether the index lies in the padding. |
| const int col_index = i * inner_size + j; |
| int img_index = i / kernel_size; |
| bool is_padding = false; |
| for (int d_i = 0; d_i < N; ++d_i) { |
| const int d_img = d_iter[d_i] * stride[d_i] - pad[d_i] + |
| d_offset[d_i] * dilation[d_i]; |
| is_padding |= !utils::IsAGeZeroAndALtB(d_img, img_shape[d_i + 1]); |
| img_index = img_index * img_shape[d_i + 1] + d_img; |
| } |
| if (!kCol2Im) { |
| Y_data[col_index] = is_padding ? 0 : X_data[img_index]; |
| } else if (!is_padding) { |
| Y_data[img_index] += X_data[col_index]; |
| } |
| utils::IncreaseIndexInDims(N, col_shape + 1, d_iter.data()); |
| } |
| } |
| } |
| |
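| // 3D (video) Im2Col for NCHW. pad_p / pad_a pad the clip temporally (before |
| // and after), pad_t / pad_b pad the height, and pad_l / pad_r pad the width. |
| // Three paths follow: a memcpy path for zero padding and no dilation, a |
| // path for symmetric padding, and a general baseline. |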
| template <typename T> |
| void Im2Col3dNCHWImpl( |
| const int channels, |
| const int clip_len, |
| const int height, |
| const int width, |
| const int kernel_t, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_t, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_p, |
| const int pad_t, |
| const int pad_l, |
| const int pad_a, |
| const int pad_b, |
| const int pad_r, |
| const int stride_t, |
| const int stride_h, |
| const int stride_w, |
| const T* img_data, |
| T* col_data) { |
| const int output_t = |
| (clip_len + pad_p + pad_a - (dilation_t * (kernel_t - 1) + 1)) / |
| stride_t + |
| 1; |
| const int output_h = |
| (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h + |
| 1; |
| const int output_w = |
| (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + |
| 1; |
| const int kernel_size = kernel_t * kernel_h * kernel_w; |
| const int kernel_hw_size = kernel_h * kernel_w; |
| const int output_size = output_t * output_h * output_w; |
| const int channel_size = clip_len * height * width; |
| const int output_hw_size = output_h * output_w; |
| const int channel_hw_size = height * width; |
| |
| // Fast path for zero padding and no dilation |
| // From Torch, THNN_(unfolded_copy) |
| if (dilation_t == 1 && dilation_h == 1 && dilation_w == 1 && pad_a == 0 && |
| pad_p == 0 && pad_l == 0 && pad_r == 0 && pad_t == 0 && pad_b == 0) { |
| for (auto k = 0; k < channels * kernel_size; k++) { |
| const auto nip = k / kernel_size; |
| const auto rest = k % kernel_size; |
| const auto kt = rest / kernel_hw_size; |
| const auto rest_hw = rest % kernel_hw_size; |
| const auto kh = rest_hw / kernel_w; |
| const auto kw = rest_hw % kernel_w; |
| auto* dst = col_data + nip * (kernel_size * output_size) + |
| kt * (kernel_hw_size * output_size) + kh * (kernel_w * output_size) + |
| kw * output_size; |
| const auto* src = img_data + nip * channel_size; |
| for (auto t = 0; t < output_t; t++) { |
| const auto it = t * stride_t + kt; |
| for (auto y = 0; y < output_h; y++) { |
| const auto iy = y * stride_h + kh; |
| const auto ix = kw; |
| if (stride_w == 1) { |
| memcpy( |
| dst + (t * output_hw_size + y * output_w), |
| src + (it * channel_hw_size + iy * width + ix), |
| sizeof(T) * output_w); |
| } else { |
| for (auto x = 0; x < output_w; x++) { |
| memcpy( |
| dst + (t * output_hw_size + y * output_w + x), |
| src + (it * channel_hw_size + iy * width + ix + x * stride_w), |
| sizeof(T)); |
| } |
| } |
| } |
| } |
| } |
| return; |
| } |
| // Fast path for equal padding |
| if (pad_a == pad_p && pad_l == pad_r && pad_t == pad_b) { |
| const int pad_f = pad_a; |
| const int pad_h = pad_t; |
| const int pad_w = pad_l; |
| for (int channel = channels; channel--; img_data += channel_size) { |
| for (int kernel_frame = 0; kernel_frame < kernel_t; kernel_frame++) { |
| for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { |
| for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { |
| int input_frame = -pad_f + kernel_frame * dilation_t; |
| for (int output_frames = output_t; output_frames; output_frames--) { |
| if (!utils::IsAGeZeroAndALtB(input_frame, clip_len)) { |
| for (int output_rows = output_h; output_rows; output_rows--) { |
| for (int output_cols = output_w; output_cols; output_cols--) { |
| *(col_data++) = 0; |
| } |
| } |
| } else { |
| int input_row = -pad_h + kernel_row * dilation_h; |
| for (int output_rows = output_h; output_rows; output_rows--) { |
| if (!utils::IsAGeZeroAndALtB(input_row, height)) { |
| for (int output_cols = output_w; output_cols; |
| output_cols--) { |
| *(col_data++) = 0; |
| } |
| } else { |
| int input_col = -pad_w + kernel_col * dilation_w; |
| for (int output_col = output_w; output_col; output_col--) { |
| if (utils::IsAGeZeroAndALtB(input_col, width)) { |
| *(col_data++) = img_data |
| [(input_frame * height + input_row) * width + |
| input_col]; |
| } else { |
| *(col_data++) = 0; |
| } |
| input_col += stride_w; |
| } |
| } |
| input_row += stride_h; |
| } |
| } |
| input_frame += stride_t; |
| } |
| } |
| } |
| } |
| } |
| return; |
| } |
| |
| // Baseline |
| const int dkernel_t = dilation_t * (kernel_t - 1) + 1; |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| |
| int clip_col = (clip_len + pad_p + pad_a - dkernel_t) / stride_t + 1; |
| int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| |
| int channels_col = channels * kernel_t * kernel_h * kernel_w; |
| for (int c = 0; c < channels_col; ++c) { |
| int w_offset = c % kernel_w; |
| int h_offset = (c / kernel_w) % kernel_h; |
| int t_offset = (c / kernel_w / kernel_h) % kernel_t; |
| int c_im = c / kernel_h / kernel_w / kernel_t; |
| for (int t = 0; t < clip_col; ++t) { |
| for (int h = 0; h < height_col; ++h) { |
| for (int w = 0; w < width_col; ++w) { |
| int t_pad = t * stride_t - pad_p + t_offset * dilation_t; |
| int h_pad = h * stride_h - pad_t + h_offset * dilation_h; |
| int w_pad = w * stride_w - pad_l + w_offset * dilation_w; |
| if (t_pad >= 0 && t_pad < clip_len && h_pad >= 0 && h_pad < height && |
| w_pad >= 0 && w_pad < width) { |
| col_data[((c * clip_col + t) * height_col + h) * width_col + w] = |
| img_data |
| [((c_im * clip_len + t_pad) * height + h_pad) * width + |
| w_pad]; |
| } else { |
| col_data[((c * clip_col + t) * height_col + h) * width_col + w] = 0; |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| } // namespace |
| |
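| // Public N-dimensional Im2Col for NCHW: the 3D (video) case is routed to |
| // Im2Col3dNCHWImpl, all other ranks go through the generic Im2ColNdNCHWImpl. |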
| template <> |
| C10_EXPORT void Im2ColNd<float, CPUContext, StorageOrder::NCHW>( |
| const int N, |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* img_data, |
| float* col_data, |
| CPUContext* /* context */, |
| const int /* groups */) { |
| // In NCHW, the number of groups doesn't affect Im2Col. |
| if (N == 3) { |
| const int channels = |
| col_shape[0] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2]; |
| Im2Col3dNCHWImpl<float>( |
| channels, |
| img_shape[1], |
| img_shape[2], |
| img_shape[3], |
| kernel_shape[0], |
| kernel_shape[1], |
| kernel_shape[2], |
| dilation[0], |
| dilation[1], |
| dilation[2], |
| pad[0], |
| pad[1], |
| pad[2], |
| pad[3], |
| pad[4], |
| pad[5], |
| stride[0], |
| stride[1], |
| stride[2], |
| img_data, |
| col_data); |
| } else { |
| Im2ColNdNCHWImpl<float, false>( |
| N, |
| img_size, |
| col_size, |
| img_shape, |
| col_shape, |
| kernel_shape, |
| stride, |
| dilation, |
| pad, |
| img_data, |
| col_data); |
| } |
| } |
| |
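| // Public N-dimensional Col2Im for NCHW, implemented by running |
| // Im2ColNdNCHWImpl with kCol2Im == true so that column values are |
| // accumulated back into the image. |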
| template <> |
| C10_EXPORT void Col2ImNd<float, CPUContext, StorageOrder::NCHW>( |
| const int N, |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* col_data, |
| float* img_data, |
| CPUContext* /* context */, |
| const int /* groups */) { |
| // In NCHW, the number of groups doesn't affect Col2Im. |
| Im2ColNdNCHWImpl<float, true>( |
| N, |
| img_size, |
| col_size, |
| img_shape, |
| col_shape, |
| kernel_shape, |
| stride, |
| dilation, |
| pad, |
| col_data, |
| img_data); |
| } |
| |
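| // 2D Im2Col for NCHW. The column buffer is a row-major |
| // (C * kernel_h * kernel_w) x (output_h * output_w) matrix, so the |
| // convolution reduces to a GEMM against the filter matrix. As an |
| // illustrative example, C = 3, a 3x3 kernel, H = W = 5, stride 1 and no |
| // padding produce a 27 x 9 column matrix. |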
| template <> |
| C10_EXPORT void Im2Col<float, CPUContext, StorageOrder::NCHW>( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* img_data, |
| float* col_data, |
| CPUContext* context, |
| const int /* groups */) { |
| // In NCHW, the number of groups doesn't affect Im2Col. |
| |
| // Fast path for zero padding and no dilation |
| if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 && |
| dilation_w == 1) { |
| Im2ColZeroPaddingAndNoDilationNCHW<float>( |
| C, |
| H, |
| W, |
| kernel_h, |
| kernel_w, |
| stride_h, |
| stride_w, |
| img_data, |
| col_data, |
| context); |
| return; |
| } |
| |
| // Baseline |
| const int output_h = |
| (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; |
| const int output_w = |
| (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; |
| const int output_size = output_h * output_w; |
| for (int c = 0; c < C; ++c) { |
| for (int kh = 0; kh < kernel_h; ++kh) { |
| for (int kw = 0; kw < kernel_w; ++kw) { |
| for (int h = 0; h < output_h; ++h) { |
| const int h_pad = h * stride_h - pad_t + kh * dilation_h; |
| if (!utils::IsAGeZeroAndALtB(h_pad, H)) { |
| std::memset(col_data + h * output_w, 0, output_w * sizeof(float)); |
| continue; |
| } |
| for (int w = 0; w < output_w; ++w) { |
| const int w_pad = w * stride_w - pad_l + kw * dilation_w; |
| col_data[h * output_w + w] = utils::IsAGeZeroAndALtB(w_pad, W) |
| ? img_data[(c * H + h_pad) * W + w_pad] |
| : 0; |
| } |
| } |
| col_data += output_size; |
| } |
| } |
| } |
| } |
| |
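| // 2D Im2Col for NHWC. Each output position (h, w) emits a contiguous |
| // kernel_h * kernel_w * C slice, giving a row-major |
| // (output_h * output_w) x (kernel_h * kernel_w * C) column matrix; with |
| // groups > 1 the slice is reordered to G R S C/G as described below. |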
| template <> |
| C10_EXPORT void Im2Col<float, CPUContext, StorageOrder::NHWC>( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* img_data, |
| float* col_data, |
| CPUContext* context, |
| const int groups) { |
| // Fast path for zero padding and no dilation |
| if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 && |
| dilation_w == 1 && groups == 1) { |
| Im2ColZeroPaddingAndNoDilationNHWC<float>( |
| C, |
| H, |
| W, |
| kernel_h, |
| kernel_w, |
| stride_h, |
| stride_w, |
| img_data, |
| col_data, |
| context); |
| return; |
| } |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| int h_pad = -pad_t; |
| if (groups == 1) { |
| for (int h = 0; h < output_h; ++h) { |
| int w_pad = -pad_l; |
| for (int w = 0; w < output_w; ++w) { |
| for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) { |
| if (!utils::IsAGeZeroAndALtB(ih, H)) { |
| std::memset(col_data, 0, sizeof(float) * kernel_w * C); |
| col_data += kernel_w * C; |
| continue; |
| } |
| for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) { |
| if (utils::IsAGeZeroAndALtB(iw, W)) { |
| std::memcpy( |
| col_data, img_data + (ih * W + iw) * C, sizeof(float) * C); |
| } else { |
| std::memset(col_data, 0, sizeof(float) * C); |
| } |
| col_data += C; |
| } // iw |
| } // ih |
| w_pad += stride_w; |
| } // w |
| h_pad += stride_h; |
| } // h |
| } else { |
| /** |
| * img_data in N H W G C/G layout |
| col_data in N H W G R S C/G layout |
| * Note that groups are pulled out to an outer dimension so that we can use |
| * GEMMs efficiently. |
| */ |
| const int C_per_G = C / groups; |
| for (int h = 0; h < output_h; ++h) { |
| int w_pad = -pad_l; |
| for (int w = 0; w < output_w; ++w) { |
| int r = 0; |
| for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) { |
| int s = 0; |
| for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) { |
| if (utils::IsAGeZeroAndALtB(ih, H) && |
| utils::IsAGeZeroAndALtB(iw, W)) { |
| for (int g = 0; g < groups; ++g) { |
| std::memcpy( |
| col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G, |
| img_data + (ih * W + iw) * C + g * C_per_G, |
| sizeof(float) * C_per_G); |
| } |
| } else { |
| for (int g = 0; g < groups; ++g) { |
| std::memset( |
| col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G, |
| 0, |
| sizeof(float) * C_per_G); |
| } |
| } |
| } // iw |
| } // ih |
| col_data += kernel_h * kernel_w * C; |
| w_pad += stride_w; |
| } // w |
| h_pad += stride_h; |
| } // h |
| } |
| } |
| |
| /** |
| The layout of the result is N T H W G Q R S C/G, where Q, R and S index |
| kernel_t, kernel_h and kernel_w respectively. |
| * Note that groups are pulled out to an outer dimension so that we can use |
| * GEMMs efficiently. |
| */ |
| template <typename TData> |
| C10_EXPORT void Im2Col3dNHWCImpl( |
| const int C, |
| const int T, |
| const int H, |
| const int W, |
| const int kernel_t, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_t, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_p, // previous frame |
| const int pad_t, // top |
| const int pad_l, // left |
| const int pad_n, // next frame |
| const int pad_b, // bottom |
| const int pad_r, // right |
| const int stride_t, |
| const int stride_h, |
| const int stride_w, |
| const TData* img_data, |
| TData* col_data, |
| const int groups) { |
| const int dkernel_t = dilation_t * (kernel_t - 1) + 1; |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_t = (T + pad_p + pad_n - dkernel_t) / stride_t + 1; |
| const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| const int C_per_G = C / groups; |
| int t_pad = -pad_p; |
| for (int t = 0; t < output_t; ++t) { |
| int h_pad = -pad_t; |
| for (int h = 0; h < output_h; ++h) { |
| int w_pad = -pad_l; |
| for (int w = 0; w < output_w; ++w) { |
| int q = 0; |
| for (int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) { |
| int r = 0; |
| for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) { |
| int s = 0; |
| for (int iw = w_pad; iw < w_pad + dkernel_w; |
| iw += dilation_w, ++s) { |
| if (utils::IsAGeZeroAndALtB(it, T) && |
| utils::IsAGeZeroAndALtB(ih, H) && |
| utils::IsAGeZeroAndALtB(iw, W)) { |
| for (int g = 0; g < groups; ++g) { |
| std::memcpy( |
| col_data + |
| (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) * |
| C_per_G, |
| img_data + ((it * H + ih) * W + iw) * C + g * C_per_G, |
| sizeof(TData) * C_per_G); |
| } |
| } else { |
| for (int g = 0; g < groups; ++g) { |
| std::memset( |
| col_data + |
| (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) * |
| C_per_G, |
| 0, |
| sizeof(TData) * C_per_G); |
| } |
| } |
| } // iw |
| } // ih |
| } // it |
| col_data += kernel_t * kernel_h * kernel_w * C; |
| w_pad += stride_w; |
| } // w |
| h_pad += stride_h; |
| } // h |
| t_pad += stride_t; |
| } // t |
| } |
| |
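| // Public N-dimensional Im2Col for NHWC: only the 3D (video) case is |
| // implemented; other ranks abort via CAFFE_NOT_IMPLEMENTED. |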
| template <> |
| C10_EXPORT void Im2ColNd<float, CPUContext, StorageOrder::NHWC>( |
| const int N, |
| const int /*img_size*/, |
| const int /*col_size*/, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* img_data, |
| float* col_data, |
| CPUContext* /* context */, |
| const int groups) { |
| if (N == 3) { |
| const int channels = |
| col_shape[3] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2]; |
| Im2Col3dNHWCImpl<float>( |
| channels, |
| img_shape[0], |
| img_shape[1], |
| img_shape[2], |
| kernel_shape[0], |
| kernel_shape[1], |
| kernel_shape[2], |
| dilation[0], |
| dilation[1], |
| dilation[2], |
| pad[0], |
| pad[1], |
| pad[2], |
| pad[3], |
| pad[4], |
| pad[5], |
| stride[0], |
| stride[1], |
| stride[2], |
| img_data, |
| col_data, |
| groups); |
| } else { |
| CAFFE_NOT_IMPLEMENTED; |
| } |
| } |
| |
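| // 2D Col2Im for NCHW, the adjoint of the Im2Col above: the image is zeroed |
| // and every column entry is accumulated back into the image location it was |
| // gathered from, so overlapping windows sum. |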
| template <> |
| C10_EXPORT void Col2Im<float, CPUContext, StorageOrder::NCHW>( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* col_data, |
| float* img_data, |
| CPUContext* context, |
| const int /* groups */) { |
| // In NCHW, the number of groups doesn't affect Col2Im. |
| |
| // Fast path for zero padding and no dilation |
| if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 && |
| dilation_w == 1) { |
| Col2ImZeroPaddingAndNoDilationNCHW<float>( |
| C, |
| H, |
| W, |
| kernel_h, |
| kernel_w, |
| stride_h, |
| stride_w, |
| col_data, |
| img_data, |
| context); |
| return; |
| } |
| |
| // Fallback |
| Set<float, CPUContext>(C * H * W, 0.0f, img_data, context); |
| const int output_h = |
| (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; |
| const int output_w = |
| (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; |
| const int output_size = output_h * output_w; |
| for (int c = 0; c < C; ++c) { |
| for (int kh = 0; kh < kernel_h; ++kh) { |
| for (int kw = 0; kw < kernel_w; ++kw) { |
| for (int h = 0; h < output_h; ++h) { |
| const int h_pad = h * stride_h - pad_t + kh * dilation_h; |
| if (!utils::IsAGeZeroAndALtB(h_pad, H)) { |
| continue; |
| } |
| for (int w = 0; w < output_w; ++w) { |
| const int w_pad = w * stride_w - pad_l + kw * dilation_w; |
| if (utils::IsAGeZeroAndALtB(w_pad, W)) { |
| img_data[(c * H + h_pad) * W + w_pad] += |
| col_data[h * output_w + w]; |
| } |
| } |
| } |
| col_data += output_size; |
| } |
| } |
| } |
| } |
| |
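| // 2D Col2Im for NHWC, the adjoint of the NHWC Im2Col above. Accumulation is |
| // done C (or C/G) elements at a time with Add, mirroring the memcpy blocks |
| // of the forward pass. |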
| template <> |
| C10_EXPORT void Col2Im<float, CPUContext, StorageOrder::NHWC>( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* col_data, |
| float* img_data, |
| CPUContext* context, |
| const int groups) { |
| // Fast path for zero padding and no dilation |
| if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 && |
| dilation_w == 1 && groups == 1) { |
| Col2ImZeroPaddingAndNoDilationNHWC<float>( |
| C, |
| H, |
| W, |
| kernel_h, |
| kernel_w, |
| stride_h, |
| stride_w, |
| col_data, |
| img_data, |
| context); |
| return; |
| } |
| |
| Set<float, CPUContext>(H * W * C, 0, img_data, context); |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| |
| int h_pad = -pad_t; |
| if (groups == 1) { |
| for (int h = 0; h < output_h; ++h) { |
| int w_pad = -pad_l; |
| for (int w = 0; w < output_w; ++w) { |
| for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) { |
| if (!utils::IsAGeZeroAndALtB(ih, H)) { |
| col_data += kernel_w * C; |
| continue; |
| } |
| for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) { |
| if (utils::IsAGeZeroAndALtB(iw, W)) { |
| float* img_data_patch = img_data + (ih * W + iw) * C; |
| Add<float, CPUContext>( |
| C, img_data_patch, col_data, img_data_patch, context); |
| } |
| col_data += C; |
| } // iw |
| } // ih |
| w_pad += stride_w; |
| } // w |
| h_pad += stride_h; |
| } // h |
| } else { |
| const int C_per_G = C / groups; |
| for (int h = 0; h < output_h; ++h) { |
| int w_pad = -pad_l; |
| for (int w = 0; w < output_w; ++w) { |
| int r = 0; |
| for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) { |
| int s = 0; |
| for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) { |
| if (utils::IsAGeZeroAndALtB(ih, H) && |
| utils::IsAGeZeroAndALtB(iw, W)) { |
| float* img_data_patch = img_data + (ih * W + iw) * C; |
| for (int g = 0; g < groups; ++g) { |
| Add<float, CPUContext>( |
| C_per_G, |
| img_data_patch + g * C_per_G, |
| col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G, |
| img_data_patch + g * C_per_G, |
| context); |
| } |
| } |
| } // iw |
| } // ih |
| col_data += kernel_h * kernel_w * C; |
| w_pad += stride_w; |
| } // w |
| h_pad += stride_h; |
| } // h |
| } |
| } |
| |
| /** |
| The column buffer is expected in N T H W G Q R S C/G layout, matching |
| Im2Col3dNHWCImpl above, and is accumulated back into the T H W C image. |
| * Note that groups are pulled out to an outer dimension so that we can use |
| * GEMMs efficiently. |
| */ |
| template <typename TData> |
| C10_EXPORT void Col2Im3dNHWCImpl( |
| const int C, |
| const int T, |
| const int H, |
| const int W, |
| const int kernel_t, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_t, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_p, // previous frame |
| const int pad_t, // top |
| const int pad_l, // left |
| const int pad_n, // next frame |
| const int pad_b, // bottom |
| const int pad_r, // right |
| const int stride_t, |
| const int stride_h, |
| const int stride_w, |
| const TData* col_data, |
| TData* img_data, |
| CPUContext* context, |
| const int groups) { |
| Set<TData, CPUContext>(T * H * W * C, TData(0), img_data, context); |
| const int dkernel_t = dilation_t * (kernel_t - 1) + 1; |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_t = (T + pad_p + pad_n - dkernel_t) / stride_t + 1; |
| const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| const int C_per_G = C / groups; |
| |
| int t_pad = -pad_p; |
| for (int t = 0; t < output_t; ++t) { |
| int h_pad = -pad_t; |
| for (int h = 0; h < output_h; ++h) { |
| int w_pad = -pad_l; |
| for (int w = 0; w < output_w; ++w) { |
| int q = 0; |
| for (int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) { |
| int r = 0; |
| for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) { |
| int s = 0; |
| for (int iw = w_pad; iw < w_pad + dkernel_w; |
| iw += dilation_w, ++s) { |
| if (utils::IsAGeZeroAndALtB(it, T) && |
| utils::IsAGeZeroAndALtB(ih, H) && |
| utils::IsAGeZeroAndALtB(iw, W)) { |
| float* img_data_patch = img_data + ((it * H + ih) * W + iw) * C; |
| for (int g = 0; g < groups; ++g) { |
| Add<float, CPUContext>( |
| C_per_G, |
| img_data_patch + g * C_per_G, |
| col_data + |
| (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) * |
| C_per_G, |
| img_data_patch + g * C_per_G, |
| context); |
| } |
| } |
| } // iw |
| } // ih |
| } // it |
| col_data += kernel_t * kernel_h * kernel_w * C; |
| w_pad += stride_w; |
| } // w |
| h_pad += stride_h; |
| } // h |
| t_pad += stride_t; |
| } // t |
| } |
| |
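| // Public N-dimensional Col2Im for NHWC: as with Im2ColNd, only the 3D |
| // (video) case is implemented. |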
| template <> |
| C10_EXPORT void Col2ImNd<float, CPUContext, StorageOrder::NHWC>( |
| const int N, |
| const int /*img_size*/, |
| const int /*col_size*/, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* col_data, |
| float* img_data, |
| CPUContext* context, |
| const int groups) { |
| if (N == 3) { |
| const int channels = |
| col_shape[3] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2]; |
| Col2Im3dNHWCImpl<float>( |
| channels, |
| img_shape[0], |
| img_shape[1], |
| img_shape[2], |
| kernel_shape[0], |
| kernel_shape[1], |
| kernel_shape[2], |
| dilation[0], |
| dilation[1], |
| dilation[2], |
| pad[0], |
| pad[1], |
| pad[2], |
| pad[3], |
| pad[4], |
| pad[5], |
| stride[0], |
| stride[1], |
| stride[2], |
| col_data, |
| img_data, |
| context, |
| groups); |
| } else { |
| CAFFE_NOT_IMPLEMENTED; |
| } |
| } |
| |
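| // Adds bias[c] to every element of the c-th H x W plane of a CHW image. On |
| // ARM NEON the inner loop uses aligned vector loads/stores unrolled by 8, |
| // with scalar prologue and epilogue loops covering the unaligned head and |
| // the tail. |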
| template <> |
| C10_EXPORT void BiasCHW<float, CPUContext>( |
| const float* bias, |
| const float* /*bias_multiplier*/, |
| const int bias_channels, |
| const int image_size, |
| float* image, |
| CPUContext* /*context*/) { |
| // Sum the per-channel bias into every image plane |
| for (int c = 0; c < bias_channels; ++c) { |
| float b = bias[c]; |
| |
| #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
| float32x4_t vBias = vdupq_n_f32(b); |
| |
| // We give alignment hints for additional speed, so handle the |
| // non-vectorizable prologue separately |
| constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float); |
| |
| // If the image is smaller than one vector, handle it entirely in the |
| // scalar prologue/epilogue loops below. |
| int prologue = kVecSizeInFloat - |
| // remainder in floats |
| (((uintptr_t)image) % (sizeof(float32x4_t))) / sizeof(float); |
| prologue = std::min(prologue, image_size); |
| |
| int i = 0; |
| // Prologue loop |
| for (; i < prologue; ++i) { |
| image[i] += b; |
| } |
| |
| // The loop is manually unrolled by 8 |
| constexpr int kUnroll = 8; |
| constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat; |
| |
| int remainder = image_size - prologue; |
| int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop; |
| |
| // Vectorizable body |
| for (; i < vectorizable; i += kFloatsPerLoop) { |
| // Manually unrolled |
| float32x4_t v0 = vld1q_f32_aligned(image + i + 0); |
| float32x4_t v1 = vld1q_f32_aligned(image + i + 4); |
| float32x4_t v2 = vld1q_f32_aligned(image + i + 8); |
| float32x4_t v3 = vld1q_f32_aligned(image + i + 12); |
| float32x4_t v4 = vld1q_f32_aligned(image + i + 16); |
| float32x4_t v5 = vld1q_f32_aligned(image + i + 20); |
| float32x4_t v6 = vld1q_f32_aligned(image + i + 24); |
| float32x4_t v7 = vld1q_f32_aligned(image + i + 28); |
| |
| v0 = vaddq_f32(v0, vBias); |
| v1 = vaddq_f32(v1, vBias); |
| v2 = vaddq_f32(v2, vBias); |
| v3 = vaddq_f32(v3, vBias); |
| v4 = vaddq_f32(v4, vBias); |
| v5 = vaddq_f32(v5, vBias); |
| v6 = vaddq_f32(v6, vBias); |
| v7 = vaddq_f32(v7, vBias); |
| |
| vst1q_f32_aligned(image + i + 0, v0); |
| vst1q_f32_aligned(image + i + 4, v1); |
| vst1q_f32_aligned(image + i + 8, v2); |
| vst1q_f32_aligned(image + i + 12, v3); |
| vst1q_f32_aligned(image + i + 16, v4); |
| vst1q_f32_aligned(image + i + 20, v5); |
| vst1q_f32_aligned(image + i + 24, v6); |
| vst1q_f32_aligned(image + i + 28, v7); |
| } |
| |
| // Non-vectorizable epilogue |
| for (; i < image_size; ++i) { |
| image[i] += b; |
| } |
| #else |
| // Non-NEON CPU implementation |
| for (int i = 0; i < image_size; ++i) { |
| image[i] += b; |
| } |
| #endif // defined(__ARM_NEON__) || defined(__ARM_NEON) |
| |
| image += image_size; |
| } |
| } |
| |
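| // CopyVector forwards to memcpy for these trivially copyable element types; |
| // the src != dst check skips the copy when source and destination already |
| // alias. |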
| #define CAFFE2_SPECIALIZED_COPYVECTOR(T) \ |
| template <> \ |
| C10_EXPORT void CopyVector<T, CPUContext>( \ |
| const int N, const T* src, T* dst, CPUContext* /*context*/) { \ |
| if (src != dst && N > 0) { \ |
| memcpy(dst, src, sizeof(T) * N); \ |
| } \ |
| } |
| CAFFE2_SPECIALIZED_COPYVECTOR(float) |
| CAFFE2_SPECIALIZED_COPYVECTOR(int32_t) |
| #undef CAFFE2_SPECIALIZED_COPYVECTOR |
| |
| } // namespace math |
| } // namespace caffe2 |