| #include <immintrin.h> |
| #ifdef CAFFE2_PERF_USE_MKL |
| #include <c10/util/irange.h> |
| #include <caffe2/perfkernels/common.h> |
| #include <folly/SingletonThreadLocal.h> |
| |
| #include "vectorizer.h" |
| |
// Enable the compiler-vectorized version only if numerical consistency is not
// required between dev and opt builds - disabled for now.
| #ifndef FAST_VECTORIZED_KERNEL |
| #define CPU_CAPABILITY_AVX2 |
| #include <ATen/cpu/vec/vec.h> |
| |
| namespace at::vec { |
| |
// Implements a vectorized version of the std::max() operation, which does
// NOT propagate a NaN in the second argument (a NaN in the first argument
// is propagated, matching std::max).
| template <typename scalar_t> |
| Vectorized<scalar_t> max(const Vectorized<scalar_t>& a, const Vectorized<scalar_t>& b); |
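
// Note: _mm256_max_ps/_mm256_max_pd return their second operand when either
// operand is NaN, so the specializations below swap the arguments to
// reproduce std::max(a, b), which returns its first argument on NaN input.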
| |
| template <> |
| Vectorized<double> max(const Vectorized<double>& a, const Vectorized<double>& b) { |
  // std::max(NaN, non-NaN) -> NaN
| return _mm256_max_pd(b, a); |
| } |
| |
| template <> |
| Vectorized<float> max(const Vectorized<float>& a, const Vectorized<float>& b) { |
  // std::max(NaN, non-NaN) -> NaN
| return _mm256_max_ps(b, a); |
| } |
| |
// Implements a fast reciprocal based on the Newton-Raphson method:
// 1. take the hardware RCP approximation as the initial estimate
// 2. refine with RCP = RCP * (2 - X * RCP)
| template <typename scalar_t> |
| Vectorized<scalar_t> fast_recieprocal(const Vectorized<scalar_t>& b); |
| template <typename scalar_t> |
| scalar_t fast_recieprocal(scalar_t b); |
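
// Accuracy note: _mm256_rcp_ps/_mm_rcp_ss provide ~12-bit estimates (Intel
// documents a maximum relative error of 1.5 * 2^-12); each Newton-Raphson
// step roughly squares the relative error, so two steps reach approximately
// full single precision.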
| |
| template<> |
| Vectorized<float> fast_recieprocal(const Vectorized<float>& b) { |
| auto minus2 = _mm256_set1_ps(-2.f); |
| auto rcp = _mm256_rcp_ps(b); |
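  // _mm256_fnmsub_ps(x, y, z) computes -(x * y) - z; with z = -2 this
  // evaluates to 2 - x * y, the Newton-Raphson correction factor.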
| rcp = _mm256_mul_ps(rcp, _mm256_fnmsub_ps(rcp, b, minus2)); |
| rcp = _mm256_mul_ps(rcp, _mm256_fnmsub_ps(rcp, b, minus2)); |
| return rcp; |
| } |
| |
| template <> |
| float fast_recieprocal(float b) { |
| auto minus2 = _mm_set_ss(-2.f); |
| auto b_reg = _mm_set_ss(b); |
| auto rcp = _mm_rcp_ss(b_reg); |
| rcp = _mm_mul_ss(rcp, _mm_fnmsub_ss(rcp, b_reg, minus2)); |
| rcp = _mm_mul_ss(rcp, _mm_fnmsub_ss(rcp, b_reg, minus2)); |
| return _mm_cvtss_f32(rcp); |
| } |
| |
| template<> |
| Vectorized<double> fast_recieprocal(const Vectorized<double>& b) { |
| return b.reciprocal(); |
| } |
| |
| template <> |
| double fast_recieprocal(double b) { |
| return 1./b; |
| } |
| |
| } |
| #endif |
| |
#include <algorithm>
#include <cstdint>
#include <cmath>
#include <vector>
| |
| #include <mkl.h> |
| |
| namespace caffe2::details { |
| |
| // MKL VML function templates. |
| template <typename T> |
| void PackV(const int N, const T* a, const int* ia, T* y); |
| template <typename T> |
| void UnpackV(const int N, const T* a, T* y, const int* iy); |
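
// PackV gathers: y[i] = a[ia[i]] for i in [0, N). UnpackV scatters:
// y[iy[i]] = a[i]. Both delegate to MKL's v?PackV/v?UnpackV routines below.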
| |
| #define DELEGATE_PACKV_FUNCTION(T, OriginalFunc) \ |
| template <> \ |
| void PackV<T>(const int N, const T* a, const int* ia, T* y) { \ |
| OriginalFunc(N, a, ia, y); \ |
| } |
| DELEGATE_PACKV_FUNCTION(float, vsPackV) |
| DELEGATE_PACKV_FUNCTION(double, vdPackV) |
| #undef DELEGATE_PACKV_FUNCTION |
| |
| #define DELEGATE_UNPACKV_FUNCTION(T, OriginalFunc) \ |
| template <> \ |
| void UnpackV<T>(const int N, const T* a, T* y, const int* iy) { \ |
| OriginalFunc(N, a, y, iy); \ |
| } |
| DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV) |
| DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV) |
| #undef DELEGATE_UNPACKV_FUNCTION |
| |
| #ifndef FAST_VECTORIZED_KERNEL |
| template <typename T> |
| void box_cox_zero_lambda( |
| size_t D, |
| const T* const self_data, |
| const T* const lambda2_data, |
| T k_eps, |
| T* const output_data) { |
  int64_t j = 0;
| using Vec = at::vec::Vectorized<T>; |
| constexpr int64_t VLEN = Vec::size(); |
| auto k_eps_vec = Vec(k_eps); |
  for (; j + VLEN <= D; j += VLEN) {
| auto data = Vec::loadu(self_data + j); |
| auto lambda2 = Vec::loadu(lambda2_data + j); |
| auto sum = data + lambda2; |
| auto max = at::vec::max(sum, k_eps_vec); |
| auto res = max.log(); |
| res.store(output_data + j); |
| } |
  for (; j < D; ++j) {
| auto sum = self_data[j] + lambda2_data[j]; |
| auto max = std::max(sum, k_eps); |
| output_data[j] = std::log(max); |
| } |
| } |
| |
| template <typename T> |
| void box_cox_nonzero_lambda( |
| int64_t D, |
| const T* data_ptr, |
| const T* lambda1_ptr, |
| const T* lambda2_ptr, |
| T k_eps, |
| T* out) { |
| |
  int64_t j = 0;
| using Vec = at::vec::Vectorized<T>; |
| constexpr int64_t VLEN = Vec::size(); |
| auto k_eps_vec = Vec(k_eps); |
  for (; j + VLEN <= D; j += VLEN) {
| auto data = Vec::loadu(data_ptr + j); |
| auto lambda2 = Vec::loadu(lambda2_ptr + j); |
| auto sum = data + lambda2; |
| auto max = at::vec::max(sum, k_eps_vec); |
| auto lambda1 = Vec::loadu(lambda1_ptr + j); |
| auto lambda_over_1 = at::vec::fast_recieprocal(lambda1); |
| auto pow = max.pow(lambda1); |
| auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1); |
| res.store(out + j); |
| } |
  for (; j < D; ++j) {
| auto sum = data_ptr[j] + lambda2_ptr[j]; |
| auto max = std::max(sum, k_eps); |
| auto lambda_over_1 = at::vec::fast_recieprocal(lambda1_ptr[j]); |
| auto pow = std::pow(max, lambda1_ptr[j]); |
| out[j] = pow * lambda_over_1 - lambda_over_1; |
| } |
| } |
| #else |
| template <typename T> |
| void box_cox_zero_lambda( |
| size_t D, |
| const T* const self_data, |
| const T* const lambda2_data, |
| T k_eps, |
| T* const output_data) { |
  VECTOR_LOOP for (std::size_t j = 0; j < D; ++j) {
| auto sum = self_data[j] + lambda2_data[j]; |
| auto max = std::max(sum, k_eps); |
| output_data[j] = std::log(max); |
| } |
| } |
| |
| template <typename T> |
| void box_cox_nonzero_lambda( |
| int64_t D, |
| const T* data_ptr, |
| const T* lambda1_ptr, |
| const T* lambda2_ptr, |
| T k_eps, |
| T* out) { |
| |
  VECTOR_LOOP for (int64_t j = 0; j < D; ++j) {
| FAST_MATH |
| auto sum = data_ptr[j] + lambda2_ptr[j]; |
| auto max = std::max(sum, k_eps); |
    auto lambda1 = lambda1_ptr[j];
    auto lambda_over_1 = T{1} / lambda1;
    if constexpr (std::is_same<T, float>::value) {
      // Under fast-math the division may be lowered to a reciprocal
      // estimate; two Newton-Raphson steps restore single precision.
      lambda_over_1 = lambda_over_1 * (T{2} - lambda_over_1 * lambda1);
      lambda_over_1 = lambda_over_1 * (T{2} - lambda_over_1 * lambda1);
    }
    auto pow = std::pow(max, lambda1);
| out[j] = pow * lambda_over_1 - lambda_over_1; |
| } |
| } |
#endif // FAST_VECTORIZED_KERNEL
| |
| template <typename T> |
| void box_cox_mixed_lambda( |
| const T* const self_data, |
| const std::vector<int>& nonzeros, |
| const std::vector<int>& zeros, |
| const T* const lambda1, |
| const T* const lambda2, |
| const T* const lambda2_z_, |
| T k_eps, |
| T* const buffer, |
| T* const output_data) { |
| PackV(nonzeros.size(), self_data, nonzeros.data(), buffer); |
| box_cox_nonzero_lambda<T>( |
| nonzeros.size(), buffer, lambda1, lambda2, k_eps, buffer); |
| UnpackV(nonzeros.size(), buffer, output_data, nonzeros.data()); |
| |
| PackV(zeros.size(), self_data, zeros.data(), buffer); |
| box_cox_zero_lambda<T>( |
| zeros.size(), buffer, lambda2_z_, k_eps, buffer); |
| UnpackV(zeros.size(), buffer, output_data, zeros.data()); |
| } |
| |
| template <typename T> |
| void TileArrayIntoVector( |
| const T* const a, |
| const size_t D, |
| const int K, |
| std::vector<T>& b) { |
| b.resize(K * D); |
| for (const auto k : c10::irange(K)) { |
| std::copy(a, a + D, b.begin() + k * D); |
| } |
| } |
| |
| void TileIndicesInPlace(std::vector<int>& v, const std::size_t D, const std::size_t K) { |
| auto n = v.size(); |
| v.resize(K * n); |
| for (const auto k : c10::irange(1, K)) { |
| for (const auto j : c10::irange(n)) { |
| v[k * n + j] = v[j] + k * D; |
| } |
| } |
| } |
| |
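// Computes the Box-Cox transform of each input element x with per-column
// parameters lambda1 and lambda2:
//   ((x + lambda2)^lambda1 - 1) / lambda1   if lambda1 != 0
//   ln(x + lambda2)                         if lambda1 == 0
// where x + lambda2 is clamped from below by k_eps to keep pow/log defined.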
| template <typename T> |
| void compute_batch_box_cox__avx2_fma( |
| std::size_t N, |
| std::size_t D, |
| std::size_t block_size, |
| const T* self_data, |
| const T* __restrict lambda1_data, |
| const T* __restrict lambda2_data, |
| T* output_data) { |
| constexpr T k_eps = static_cast<T>(1e-6); |
| |
| FOLLY_DECLARE_REUSED(zeros, std::vector<int>); |
| FOLLY_DECLARE_REUSED(nonzeros, std::vector<int>); |
| // Don't bother calling reserve; calls after the first will get a |
| // correctly-sized allocation anyway. |
| for (const auto j : c10::irange(D)) { |
| if (lambda1_data[j] == 0) { |
| zeros.push_back(j); |
| } else { |
| nonzeros.push_back(j); |
| } |
| } |
| |
| // Process K rows at a time for effective vectorization with small rows. |
| const auto K = std::min(N, (block_size + D - 1) / D); |
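  // E.g. with D = 10 and block_size = 128, K = min(N, 13): roughly
  // block_size elements are transformed per inner-kernel call.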
| |
| FOLLY_DECLARE_REUSED(lambda1_, std::vector<T>); |
| FOLLY_DECLARE_REUSED(lambda2_, std::vector<T>); |
| FOLLY_DECLARE_REUSED(lambda2_z_, std::vector<T>); |
| |
| if (nonzeros.size() == D) { |
| // ((x + lambda2)^lambda1 - 1)/lambda1, if lambda1 != 0 |
| size_t i = 0; |
| if (K > 1) { |
| TileArrayIntoVector(lambda1_data, D, K, lambda1_); |
| TileArrayIntoVector(lambda2_data, D, K, lambda2_); |
| DCHECK_EQ(K * D, lambda1_.size()); |
| DCHECK_EQ(K * D, lambda2_.size()); |
| for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) { |
| box_cox_nonzero_lambda<T>( |
| K * D, |
| self_data, |
| lambda1_.data(), |
| lambda2_.data(), |
| k_eps, |
| output_data); |
| } |
| } |
| for (; i < N; i++, self_data += D, output_data += D) { |
| box_cox_nonzero_lambda<T>( |
| D, self_data, lambda1_data, lambda2_data, k_eps, output_data); |
| } |
| } else if (zeros.size() == D) { |
| // ln(x + lambda2), if lambda1 == 0 |
| size_t i = 0; |
| if (K > 1) { |
| TileArrayIntoVector(lambda2_data, D, K, lambda2_z_); |
| DCHECK_EQ(K * D, lambda2_z_.size()); |
| for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) { |
| box_cox_zero_lambda<T>( |
| K * D, self_data, lambda2_z_.data(), k_eps, output_data); |
| } |
| } |
| for (; i < N; i++, self_data += D, output_data += D) { |
| box_cox_zero_lambda<T>( |
| D, self_data, lambda2_data, k_eps, output_data); |
| } |
| } else { |
| // mix zeros and nonzeros |
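    // The index lists are first tiled without offsets (D = 0) so that the
    // packed lambda vectors hold K identical copies; they are re-tiled with
    // row offsets below, just before the K-row loop, so PackV/UnpackV can
    // gather/scatter across all K rows at once.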
| const size_t n = nonzeros.size(); |
| if (K > 1) { |
| TileIndicesInPlace(nonzeros, 0, K); |
| TileIndicesInPlace(zeros, 0, K); |
| } |
| |
| FOLLY_DECLARE_REUSED(buffer, std::vector<T>); |
| |
| buffer.resize(std::max(nonzeros.size(), zeros.size())); |
| lambda1_.resize(nonzeros.size()); |
| lambda2_.resize(nonzeros.size()); |
| lambda2_z_.resize(zeros.size()); |
| PackV(nonzeros.size(), lambda1_data, nonzeros.data(), lambda1_.data()); |
| PackV(nonzeros.size(), lambda2_data, nonzeros.data(), lambda2_.data()); |
| PackV(zeros.size(), lambda2_data, zeros.data(), lambda2_z_.data()); |
| |
| size_t i = 0; |
| if (K > 1) { |
| // Truncate to original size, and re-tile with offsets this time. |
| nonzeros.resize(n); |
| DCHECK_GT(D, n); |
| zeros.resize(D - n); |
| TileIndicesInPlace(nonzeros, D, K); |
| TileIndicesInPlace(zeros, D, K); |
| DCHECK_EQ(nonzeros.size(), lambda1_.size()); |
| DCHECK_EQ(nonzeros.size(), lambda2_.size()); |
| DCHECK_EQ(zeros.size(), lambda2_z_.size()); |
| |
| for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) { |
| box_cox_mixed_lambda<T>( |
| self_data, |
| nonzeros, |
| zeros, |
| lambda1_.data(), |
| lambda2_.data(), |
| lambda2_z_.data(), |
| k_eps, |
| buffer.data(), |
| output_data); |
| } |
| // Truncate to original size. |
| nonzeros.resize(n); |
| zeros.resize(D - n); |
| } |
| for (; i < N; i++, self_data += D, output_data += D) { |
| box_cox_mixed_lambda<T>( |
| self_data, |
| nonzeros, |
| zeros, |
| lambda1_.data(), |
| lambda2_.data(), |
| lambda2_z_.data(), |
| k_eps, |
| buffer.data(), |
| output_data); |
| } |
| } |
| }; |
| |
| |
| template |
| void compute_batch_box_cox__avx2_fma<float>( |
| std::size_t N, |
| std::size_t D, |
| std::size_t block_size, |
| const float* self_data, |
| const float* __restrict lambda1_data, |
| const float* __restrict lambda2_data, |
| float* output_data); |
| |
| template |
| void compute_batch_box_cox__avx2_fma<double>( |
| std::size_t N, |
| std::size_t D, |
| std::size_t block_size, |
| const double* self_data, |
| const double* __restrict lambda1_data, |
| const double* __restrict lambda2_data, |
| double* output_data); |
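
// Illustrative usage (hypothetical sizes and parameter values, not part of
// this file):
//   std::vector<float> x(N * D), out(N * D), l1(D, 0.5f), l2(D, 1.0f);
//   caffe2::details::compute_batch_box_cox__avx2_fma<float>(
//       N, D, /*block_size=*/16384, x.data(), l1.data(), l2.data(),
//       out.data());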
| |
} // namespace caffe2::details
#endif // CAFFE2_PERF_USE_MKL