#include <immintrin.h>
#ifdef CAFFE2_PERF_USE_MKL
#include <c10/util/irange.h>
#include <caffe2/perfkernels/common.h>
#include <folly/SingletonThreadLocal.h>
#include "vectorizer.h"
// Enable the compiler-vectorized version only when numerical consistency
// between dev and opt builds is not required; disabled for now.
#ifndef FAST_VECTORIZED_KERNEL
#define CPU_CAPABILITY_AVX2
#include <ATen/cpu/vec/vec.h>
namespace at::vec {
// Implements a vectorized version of the std::max() operation that does
// NOT propagate a NaN in the second argument.
template <typename scalar_t>
Vectorized<scalar_t> max(const Vectorized<scalar_t>& a, const Vectorized<scalar_t>& b);
template <>
Vectorized<double> max(const Vectorized<double>& a, const Vectorized<double>& b) {
// Operands are swapped because vmaxpd returns its second operand when
// either operand is NaN: a NaN in `a` propagates, a NaN in `b` does not.
return _mm256_max_pd(b, a);
}
template <>
Vectorized<float> max(const Vectorized<float>& a, const Vectorized<float>& b) {
// As above: vmaxps propagates a NaN in `a` but not in `b`.
return _mm256_max_ps(b, a);
}
// Implements the reciprocal using the Newton-Raphson method:
// 1. Start from the hardware RCP approximation.
// 2. Refine with RCP = RCP * (2 - X * RCP).
template <typename scalar_t>
Vectorized<scalar_t> fast_reciprocal(const Vectorized<scalar_t>& b);
template <typename scalar_t>
scalar_t fast_reciprocal(scalar_t b);
template<>
Vectorized<float> fast_reciprocal(const Vectorized<float>& b) {
auto minus2 = _mm256_set1_ps(-2.f);
auto rcp = _mm256_rcp_ps(b);
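// fnmsub(rcp, b, -2) == -(rcp * b) - (-2) == 2 - b * rcp, so each of the
// two steps below computes rcp * (2 - b * rcp), one Newton-Raphson
// refinement of the approximate reciprocal.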
rcp = _mm256_mul_ps(rcp, _mm256_fnmsub_ps(rcp, b, minus2));
rcp = _mm256_mul_ps(rcp, _mm256_fnmsub_ps(rcp, b, minus2));
return rcp;
}
template <>
float fast_reciprocal(float b) {
auto minus2 = _mm_set_ss(-2.f);
auto b_reg = _mm_set_ss(b);
auto rcp = _mm_rcp_ss(b_reg);
rcp = _mm_mul_ss(rcp, _mm_fnmsub_ss(rcp, b_reg, minus2));
rcp = _mm_mul_ss(rcp, _mm_fnmsub_ss(rcp, b_reg, minus2));
return _mm_cvtss_f32(rcp);
}
template<>
Vectorized<double> fast_reciprocal(const Vectorized<double>& b) {
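// AVX2 has no packed reciprocal approximation for double, so fall back to
// the exact reciprocal.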
return b.reciprocal();
}
template <>
double fast_reciprocal(double b) {
return 1./b;
}
}
#endif // !FAST_VECTORIZED_KERNEL
#include <cstdint>
#include <cmath>
#include <vector>
#include <mkl.h>
namespace caffe2::details {
// MKL VML function templates.
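// PackV gathers indexed elements: y[i] = a[ia[i]] for i in [0, N).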
template <typename T>
void PackV(const int N, const T* a, const int* ia, T* y);
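// UnpackV scatters them back: y[iy[i]] = a[i] for i in [0, N).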
template <typename T>
void UnpackV(const int N, const T* a, T* y, const int* iy);
#define DELEGATE_PACKV_FUNCTION(T, OriginalFunc) \
template <> \
void PackV<T>(const int N, const T* a, const int* ia, T* y) { \
OriginalFunc(N, a, ia, y); \
}
DELEGATE_PACKV_FUNCTION(float, vsPackV)
DELEGATE_PACKV_FUNCTION(double, vdPackV)
#undef DELEGATE_PACKV_FUNCTION
#define DELEGATE_UNPACKV_FUNCTION(T, OriginalFunc) \
template <> \
void UnpackV<T>(const int N, const T* a, T* y, const int* iy) { \
OriginalFunc(N, a, y, iy); \
}
DELEGATE_UNPACKV_FUNCTION(float, vsUnpackV)
DELEGATE_UNPACKV_FUNCTION(double, vdUnpackV)
#undef DELEGATE_UNPACKV_FUNCTION
#ifndef FAST_VECTORIZED_KERNEL
template <typename T>
void box_cox_zero_lambda(
size_t D,
const T* const self_data,
const T* const lambda2_data,
T k_eps,
T* const output_data) {
std::size_t j = 0;
using Vec = at::vec::Vectorized<T>;
constexpr int64_t VLEN = Vec::size();
auto k_eps_vec = Vec(k_eps);
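// Main vectorized loop; the scalar loop below handles the tail elements.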
for (; j + VLEN < D; j += VLEN) {
auto data = Vec::loadu(self_data + j);
auto lambda2 = Vec::loadu(lambda2_data + j);
auto sum = data + lambda2;
auto max = at::vec::max(sum, k_eps_vec);
auto res = max.log();
res.store(output_data + j);
}
for (; j < D; ++j) {
auto sum = self_data[j] + lambda2_data[j];
auto max = std::max(sum, k_eps);
output_data[j] = std::log(max);
}
}
template <typename T>
void box_cox_nonzero_lambda(
int64_t D,
const T* data_ptr,
const T* lambda1_ptr,
const T* lambda2_ptr,
T k_eps,
T* out) {
int64_t j = 0;
using Vec = at::vec::Vectorized<T>;
constexpr int64_t VLEN = Vec::size();
auto k_eps_vec = Vec(k_eps);
for (; j + VLEN < D; j += VLEN) {
auto data = Vec::loadu(data_ptr + j);
auto lambda2 = Vec::loadu(lambda2_ptr + j);
auto sum = data + lambda2;
auto max = at::vec::max(sum, k_eps_vec);
auto lambda1 = Vec::loadu(lambda1_ptr + j);
auto one_over_lambda = at::vec::fast_reciprocal(lambda1);
auto pow = max.pow(lambda1);
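// fmsub(a, b, c) == a * b - c, so res == (max^lambda1 - 1) / lambda1.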
auto res = at::vec::fmsub(pow, one_over_lambda, one_over_lambda);
res.store(out + j);
}
for (; j < D; ++j) {
auto sum = data_ptr[j] + lambda2_ptr[j];
auto max = std::max(sum, k_eps);
auto one_over_lambda = at::vec::fast_reciprocal(lambda1_ptr[j]);
auto pow = std::pow(max, lambda1_ptr[j]);
out[j] = pow * one_over_lambda - one_over_lambda;
}
}
#else
template <typename T>
void box_cox_zero_lambda(
size_t D,
const T* const self_data,
const T* const lambda2_data,
T k_eps,
T* const output_data) {
VECTOR_LOOP for (std::size_t j = 0; j < D; ++j) {
auto sum = self_data[j] + lambda2_data[j];
auto max = std::max(sum, k_eps);
output_data[j] = std::log(max);
}
}
template <typename T>
void box_cox_nonzero_lambda(
int64_t D,
const T* data_ptr,
const T* lambda1_ptr,
const T* lambda2_ptr,
T k_eps,
T* out) {
VECTOR_LOOP for (int64_t j = 0; j < D; ++j) {
FAST_MATH
auto sum = data_ptr[j] + lambda2_ptr[j];
auto max = std::max(sum, k_eps);
auto lambda1 = lambda1_ptr[j];
auto one_over_lambda = T{1} / lambda1;
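// Under FAST_MATH the compiler may lower the division to a reciprocal
// approximation; for float, the two Newton-Raphson steps below restore
// precision, mirroring the hand-vectorized path above.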
if constexpr (std::is_same<T, float>::value) {
one_over_lambda = one_over_lambda * (T{2} - one_over_lambda * lambda1);
one_over_lambda = one_over_lambda * (T{2} - one_over_lambda * lambda1);
}
auto pow = std::pow(max, lambda1);
out[j] = pow * one_over_lambda - one_over_lambda;
}
}
#endif // FAST_VECTORIZED_KERNEL
template <typename T>
void box_cox_mixed_lambda(
const T* const self_data,
const std::vector<int>& nonzeros,
const std::vector<int>& zeros,
const T* const lambda1,
const T* const lambda2,
const T* const lambda2_z_,
T k_eps,
T* const buffer,
T* const output_data) {
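// Gather the nonzero-lambda elements into a contiguous buffer, transform
// them there, and scatter the results back; then repeat for the
// zero-lambda elements.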
PackV(nonzeros.size(), self_data, nonzeros.data(), buffer);
box_cox_nonzero_lambda<T>(
nonzeros.size(), buffer, lambda1, lambda2, k_eps, buffer);
UnpackV(nonzeros.size(), buffer, output_data, nonzeros.data());
PackV(zeros.size(), self_data, zeros.data(), buffer);
box_cox_zero_lambda<T>(
zeros.size(), buffer, lambda2_z_, k_eps, buffer);
UnpackV(zeros.size(), buffer, output_data, zeros.data());
}
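// Replicates the D-element array `a` K times into `b` (b.size() == K * D).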
template <typename T>
void TileArrayIntoVector(
const T* const a,
const size_t D,
const int K,
std::vector<T>& b) {
b.resize(K * D);
for (const auto k : c10::irange(K)) {
std::copy(a, a + D, b.begin() + k * D);
}
}
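// Repeats the index vector K times, offsetting copy k by k * D
// (D == 0 repeats the indices without offsets).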
void TileIndicesInPlace(std::vector<int>& v, const std::size_t D, const std::size_t K) {
auto n = v.size();
v.resize(K * n);
for (const auto k : c10::irange(1, K)) {
for (const auto j : c10::irange(n)) {
v[k * n + j] = v[j] + k * D;
}
}
}
template <typename T>
void compute_batch_box_cox__avx2_fma(
std::size_t N,
std::size_t D,
std::size_t block_size,
const T* self_data,
const T* __restrict lambda1_data,
const T* __restrict lambda2_data,
T* output_data) {
constexpr T k_eps = static_cast<T>(1e-6);
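// Sums are clamped to at least k_eps before log/pow so the transform
// stays finite.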
FOLLY_DECLARE_REUSED(zeros, std::vector<int>);
FOLLY_DECLARE_REUSED(nonzeros, std::vector<int>);
// Don't bother calling reserve; calls after the first will get a
// correctly-sized allocation anyway.
for (const auto j : c10::irange(D)) {
if (lambda1_data[j] == 0) {
zeros.push_back(j);
} else {
nonzeros.push_back(j);
}
}
// Process K rows at a time for effective vectorization with small rows.
const auto K = std::min(N, (block_size + D - 1) / D);
FOLLY_DECLARE_REUSED(lambda1_, std::vector<T>);
FOLLY_DECLARE_REUSED(lambda2_, std::vector<T>);
FOLLY_DECLARE_REUSED(lambda2_z_, std::vector<T>);
if (nonzeros.size() == D) {
// ((x + lambda2)^lambda1 - 1)/lambda1, if lambda1 != 0
size_t i = 0;
if (K > 1) {
TileArrayIntoVector(lambda1_data, D, K, lambda1_);
TileArrayIntoVector(lambda2_data, D, K, lambda2_);
DCHECK_EQ(K * D, lambda1_.size());
DCHECK_EQ(K * D, lambda2_.size());
for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) {
box_cox_nonzero_lambda<T>(
K * D,
self_data,
lambda1_.data(),
lambda2_.data(),
k_eps,
output_data);
}
}
for (; i < N; i++, self_data += D, output_data += D) {
box_cox_nonzero_lambda<T>(
D, self_data, lambda1_data, lambda2_data, k_eps, output_data);
}
} else if (zeros.size() == D) {
// ln(x + lambda2), if lambda1 == 0
size_t i = 0;
if (K > 1) {
TileArrayIntoVector(lambda2_data, D, K, lambda2_z_);
DCHECK_EQ(K * D, lambda2_z_.size());
for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) {
box_cox_zero_lambda<T>(
K * D, self_data, lambda2_z_.data(), k_eps, output_data);
}
}
for (; i < N; i++, self_data += D, output_data += D) {
box_cox_zero_lambda<T>(
D, self_data, lambda2_data, k_eps, output_data);
}
} else {
// mix zeros and nonzeros
const size_t n = nonzeros.size();
if (K > 1) {
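// First tile the indices without offsets (D == 0) so that the PackV calls
// below replicate the per-column lambda values K times.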
TileIndicesInPlace(nonzeros, 0, K);
TileIndicesInPlace(zeros, 0, K);
}
FOLLY_DECLARE_REUSED(buffer, std::vector<T>);
buffer.resize(std::max(nonzeros.size(), zeros.size()));
lambda1_.resize(nonzeros.size());
lambda2_.resize(nonzeros.size());
lambda2_z_.resize(zeros.size());
PackV(nonzeros.size(), lambda1_data, nonzeros.data(), lambda1_.data());
PackV(nonzeros.size(), lambda2_data, nonzeros.data(), lambda2_.data());
PackV(zeros.size(), lambda2_data, zeros.data(), lambda2_z_.data());
size_t i = 0;
if (K > 1) {
// Truncate to original size, and re-tile with offsets this time.
nonzeros.resize(n);
DCHECK_GT(D, n);
zeros.resize(D - n);
TileIndicesInPlace(nonzeros, D, K);
TileIndicesInPlace(zeros, D, K);
DCHECK_EQ(nonzeros.size(), lambda1_.size());
DCHECK_EQ(nonzeros.size(), lambda2_.size());
DCHECK_EQ(zeros.size(), lambda2_z_.size());
for (; i < N - K + 1; i += K, self_data += K * D, output_data += K * D) {
box_cox_mixed_lambda<T>(
self_data,
nonzeros,
zeros,
lambda1_.data(),
lambda2_.data(),
lambda2_z_.data(),
k_eps,
buffer.data(),
output_data);
}
// Truncate to original size.
nonzeros.resize(n);
zeros.resize(D - n);
}
for (; i < N; i++, self_data += D, output_data += D) {
box_cox_mixed_lambda<T>(
self_data,
nonzeros,
zeros,
lambda1_.data(),
lambda2_.data(),
lambda2_z_.data(),
k_eps,
buffer.data(),
output_data);
}
}
}
template
void compute_batch_box_cox__avx2_fma<float>(
std::size_t N,
std::size_t D,
std::size_t block_size,
const float* self_data,
const float* __restrict lambda1_data,
const float* __restrict lambda2_data,
float* output_data);
template
void compute_batch_box_cox__avx2_fma<double>(
std::size_t N,
std::size_t D,
std::size_t block_size,
const double* self_data,
const double* __restrict lambda1_data,
const double* __restrict lambda2_data,
double* output_data);
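// Usage sketch (hypothetical caller; the names below are illustrative, not
// part of this file). Transforms an N x D row-major batch with per-column
// lambda1/lambda2 parameters:
//   std::vector<float> out(N * D);
//   caffe2::details::compute_batch_box_cox__avx2_fma<float>(
//       N, D, /*block_size=*/16384, input, lambda1, lambda2, out.data());
// block_size is a soft target for the number of elements processed per
// tiled iteration; it determines how many rows (K) are fused together.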
} // namespace caffe2::details
#endif // CAFFE2_PERF_USE_MKL