// Ternary and higher-order pointwise operations
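// Defining TORCH_ASSERT_NO_OPERATORS before the ATen includes asserts that this
// translation unit never pulls in the full operator headers; the kernels below
// only need TensorIterator and the vectorization helpers, which keeps
// per-kernel compile times down.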
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/Dispatch.h>
#include <ATen/native/PointwiseOps.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/native/cpu/Loops.h>
#include <c10/core/Scalar.h>
#include <ATen/cpu/vec/functional.h>
namespace at::native {
namespace {
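// addcmul: out = self + value * tensor1 * tensor2, elementwise.
// For reduced floating point types (BFloat16/Half) the arithmetic is done in
// float: convert_to_float splits each 16-bit lane vector into two
// Vectorized<float> halves, the multiply-add is computed at float precision,
// and convert_from_float packs the result back down. A rough scalar sketch of
// what each element sees on that path:
//   out[i] = scalar_t(float(self[i]) + value * float(t1[i]) * float(t2[i]));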
static void addcmul_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) {
  ScalarType dtype = iter.common_dtype();
  if (at::isReducedFloatingType(dtype)) {
    AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "addcmul_cpu_out", [&]() {
      float float_val = value.to<float>();
      auto float_vec = Vectorized<float>(float_val);
      cpu_kernel_vec(
          iter,
          [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t {
            return float(self_val) + float_val * float(t1_val) * float(t2_val);
          },
          [=](Vectorized<scalar_t> self_vec,
              Vectorized<scalar_t> t1_vec,
              Vectorized<scalar_t> t2_vec) -> Vectorized<scalar_t> {
            auto [self_vec0, self_vec1] = convert_to_float<scalar_t>(self_vec);
            auto [t1_vec0, t1_vec1] = convert_to_float<scalar_t>(t1_vec);
            auto [t2_vec0, t2_vec1] = convert_to_float<scalar_t>(t2_vec);
            self_vec0 = self_vec0 + float_vec * t1_vec0 * t2_vec0;
            self_vec1 = self_vec1 + float_vec * t1_vec1 * t2_vec1;
            return convert_from_float<scalar_t>(self_vec0, self_vec1);
          });
    });
  } else {
    AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(at::ScalarType::ComplexHalf,
        dtype, "addcmul_cpu_out", [&] {
      scalar_t scalar_val = value.to<scalar_t>();
      auto scalar_vec = Vectorized<scalar_t>(scalar_val);
      cpu_kernel_vec(
          iter,
          [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t {
            return self_val + scalar_val * t1_val * t2_val;
          },
          [=](Vectorized<scalar_t> self_vec,
              Vectorized<scalar_t> t1_vec,
              Vectorized<scalar_t> t2_vec) {
            return self_vec + scalar_vec * t1_vec * t2_vec;
          });
    });
  }
}
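// addcdiv: out = self + value * tensor1 / tensor2, elementwise. Same structure
// as addcmul above, with division replacing the second multiply. The dispatch
// covers all types and complex, but integral inputs are expected to be
// rejected at the operator level (integer division via addcdiv was removed in
// recent PyTorch versions), so in practice this kernel sees floating and
// complex dtypes.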
static void addcdiv_cpu_kernel(TensorIteratorBase& iter, const Scalar& value) {
  ScalarType dtype = iter.common_dtype();
  if (at::isReducedFloatingType(dtype)) {
    AT_DISPATCH_REDUCED_FLOATING_TYPES(dtype, "addcdiv_cpu_out", [&]() {
      float float_val = value.to<float>();
      auto float_vec = Vectorized<float>(float_val);
      cpu_kernel_vec(
          iter,
          [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t {
            return float(self_val) + float_val * float(t1_val) / float(t2_val);
          },
          [=](Vectorized<scalar_t> self_vec,
              Vectorized<scalar_t> t1_vec,
              Vectorized<scalar_t> t2_vec) -> Vectorized<scalar_t> {
            auto [self_vec0, self_vec1] = convert_to_float<scalar_t>(self_vec);
            auto [t1_vec0, t1_vec1] = convert_to_float<scalar_t>(t1_vec);
            auto [t2_vec0, t2_vec1] = convert_to_float<scalar_t>(t2_vec);
            self_vec0 = self_vec0 + float_vec * t1_vec0 / t2_vec0;
            self_vec1 = self_vec1 + float_vec * t1_vec1 / t2_vec1;
            return convert_from_float<scalar_t>(self_vec0, self_vec1);
          });
    });
  } else {
    AT_DISPATCH_ALL_TYPES_AND_COMPLEX(dtype, "addcdiv_cpu_out", [&] {
      scalar_t scalar_val = value.to<scalar_t>();
      auto scalar_vec = Vectorized<scalar_t>(scalar_val);
      cpu_kernel_vec(
          iter,
          [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t {
            return self_val + scalar_val * t1_val / t2_val;
          },
          [=](Vectorized<scalar_t> self_vec,
              Vectorized<scalar_t> t1_vec,
              Vectorized<scalar_t> t2_vec) {
            return self_vec + scalar_vec * t1_vec / t2_vec;
          });
    });
  }
}
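// Backward of SmoothL1Loss. With x = input - target, the loss is
//   0.5 * x^2 / beta   if |x| < beta
//   |x| - 0.5 * beta   otherwise
// so the gradient w.r.t. input is clamp(x / beta, -1, 1), scaled by
// grad_output and by `norm`, which carries the reduction factor (e.g. 1/n for
// 'mean' reduction). The BFloat16 path below does the arithmetic in float,
// following the same two-halves pattern as addcmul above.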
static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double beta) {
  ScalarType dtype = iter.dtype(0);
  if (dtype == kBFloat16) {
    auto norm_val = norm.to<float>();
    float beta_val(beta);
    auto norm_val_vec = Vectorized<float>(norm_val);
    auto beta_val_vec = Vectorized<float>(beta_val);
    const auto neg_1_vec = Vectorized<float>(-1);
    const auto zero_vec = Vectorized<float>(0);
    const auto pos_1_vec = Vectorized<float>(1);
    cpu_kernel_vec(iter,
        [=](BFloat16 input, BFloat16 target, BFloat16 grad_output) -> BFloat16 {
          const auto x = float(input) - float(target);
          if (x <= -beta) {
            return -norm_val * float(grad_output);
          } else if (x >= beta) {
            return norm_val * float(grad_output);
          } else {
            return norm_val * x * float(grad_output) / beta;
          }
        },
        [norm_val_vec, beta_val_vec, neg_1_vec, zero_vec, pos_1_vec](
            Vectorized<BFloat16> input, Vectorized<BFloat16> target, Vectorized<BFloat16> grad_output) -> Vectorized<BFloat16> {
          // using two blendv calls to simulate the 3 cases
          // 1        if x >= beta
          // -1       if x <= -beta
          // x / beta if |x| < beta
          auto [input0, input1] = convert_bfloat16_float(input);
          auto [target0, target1] = convert_bfloat16_float(target);
          auto [grad_output0, grad_output1] = convert_bfloat16_float(grad_output);
          auto x = input0 - target0;
          auto pos_or_neg_1_vec = Vectorized<float>::blendv(
              neg_1_vec, pos_1_vec, x > zero_vec);
          auto x_abs = x.abs();
          auto output = Vectorized<float>::blendv(
              x / beta_val_vec, pos_or_neg_1_vec, x_abs >= beta_val_vec);
          input0 = norm_val_vec * output * grad_output0;
          x = input1 - target1;
          pos_or_neg_1_vec = Vectorized<float>::blendv(
              neg_1_vec, pos_1_vec, x > zero_vec);
          x_abs = x.abs();
          output = Vectorized<float>::blendv(
              x / beta_val_vec, pos_or_neg_1_vec, x_abs >= beta_val_vec);
          input1 = norm_val_vec * output * grad_output1;
          return convert_float_bfloat16(input0, input1);
        }
    );
  } else {
    AT_DISPATCH_ALL_TYPES(dtype, "smooth_l1_backward_cpu_out", [&] {
      auto norm_val = norm.to<scalar_t>();
      scalar_t beta_val(beta);
      auto norm_val_vec = Vectorized<scalar_t>(norm_val);
      auto beta_val_vec = Vectorized<scalar_t>(beta_val);
      const auto neg_1_vec = Vectorized<scalar_t>(-1);
      const auto zero_vec = Vectorized<scalar_t>(0);
      const auto pos_1_vec = Vectorized<scalar_t>(1);
      cpu_kernel_vec(iter,
          [=](scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t {
            const auto x = input - target;
            if (x <= -beta) {
              return -norm_val * grad_output;
            } else if (x >= beta) {
              return norm_val * grad_output;
            } else {
              return norm_val * x * grad_output / beta;
            }
          },
          [norm_val_vec, beta_val_vec, neg_1_vec, zero_vec, pos_1_vec](
              Vectorized<scalar_t> input, Vectorized<scalar_t> target, Vectorized<scalar_t> grad_output) -> Vectorized<scalar_t> {
            // using two blendv calls to simulate the 3 cases
            // 1        if x >= beta
            // -1       if x <= -beta
            // x / beta if |x| < beta
            const auto x = input - target;
            const auto pos_or_neg_1_vec = Vectorized<scalar_t>::blendv(
                neg_1_vec, pos_1_vec, x > zero_vec);
            const auto x_abs = x.abs();
            const auto output = Vectorized<scalar_t>::blendv(
                x / beta_val_vec, pos_or_neg_1_vec, x_abs >= beta_val_vec);
            return norm_val_vec * output * grad_output;
          }
      );
    });
  }
}
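// Backward of HuberLoss. With x = input - target, the loss is
//   0.5 * x^2                     if |x| < delta
//   delta * (|x| - 0.5 * delta)   otherwise
// so the gradient w.r.t. input is clamp(x, -delta, delta): x in the quadratic
// region, delta * sign(x) in the linear region, scaled by `norm` (the
// reduction factor) and grad_output.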
static void huber_backward_cpu_kernel(TensorIterator& iter, const Scalar& norm, double delta) {
  ScalarType dtype = iter.dtype(0);
  AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, dtype, "huber_backward_cpu_out", [&] {
    auto norm_val = norm.to<scalar_t>();
    scalar_t delta_val(delta);
    auto norm_val_vec = Vectorized<scalar_t>(norm_val);
    auto delta_val_vec = Vectorized<scalar_t>(delta_val);
    const auto neg_1_vec = Vectorized<scalar_t>(-1);
    const auto zero_vec = Vectorized<scalar_t>(0);
    const auto pos_1_vec = Vectorized<scalar_t>(1);
    cpu_kernel_vec(iter,
        [=](scalar_t input, scalar_t target, scalar_t grad_output) -> scalar_t {
          const auto x = input - target;
          if (x <= -delta) {
            return -norm_val * grad_output * delta;
          } else if (x >= delta) {
            return norm_val * grad_output * delta;
          } else {
            return norm_val * x * grad_output;
          }
        },
        [norm_val_vec, delta_val_vec, neg_1_vec, zero_vec, pos_1_vec](
            Vectorized<scalar_t> input, Vectorized<scalar_t> target, Vectorized<scalar_t> grad_output) -> Vectorized<scalar_t> {
          // using two blendv calls to simulate the 3 cases
          // delta  if x >= delta
          // -delta if x <= -delta
          // x      if |x| < delta
          const auto x = input - target;
          const auto pos_or_neg_1_vec = Vectorized<scalar_t>::blendv(
              neg_1_vec, pos_1_vec, x > zero_vec);
          const auto x_abs = x.abs();
          const auto output = Vectorized<scalar_t>::blendv(
              x, pos_or_neg_1_vec * delta_val_vec, x_abs >= delta_val_vec);
          return norm_val_vec * output * grad_output;
        }
    );
  });
}
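// Backward of MSELoss: d/d(input) of (input - target)^2 is 2 * (input - target),
// so the kernel computes value * (self - t1) * t2, where t1 is the target,
// t2 is grad_output, and `value` folds in the constant 2 together with the
// reduction factor (e.g. 2/n for 'mean' reduction).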
static void mse_backward_cpu_kernel(TensorIterator& iter, const Scalar& value) {
  ScalarType dtype = iter.dtype(0);
  AT_DISPATCH_ALL_TYPES(dtype, "mse_backward_cpu_out", [&] {
    scalar_t scalar_val = value.to<scalar_t>();
    auto scalar_vec = Vectorized<scalar_t>(scalar_val);
    cpu_kernel_vec(
        iter,
        [=](scalar_t self_val, scalar_t t1_val, scalar_t t2_val) -> scalar_t {
          return scalar_val * (self_val - t1_val) * t2_val;
        },
        [=](Vectorized<scalar_t> self_vec,
            Vectorized<scalar_t> t1_vec,
            Vectorized<scalar_t> t2_vec) {
          return scalar_vec * (self_vec - t1_vec) * t2_vec;
        });
  });
}
} // anonymous namespace
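// Wire the kernels above into the dispatch stubs declared in
// ATen/native/PointwiseOps.h; device-generic code reaches them through these
// stubs.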
REGISTER_DISPATCH(addcmul_stub, &addcmul_cpu_kernel);
REGISTER_DISPATCH(addcdiv_stub, &addcdiv_cpu_kernel);
REGISTER_DISPATCH(smooth_l1_backward_stub, &smooth_l1_backward_cpu_kernel);
REGISTER_DISPATCH(huber_backward_stub, &huber_backward_cpu_kernel);
REGISTER_DISPATCH(mse_backward_stub, &mse_backward_cpu_kernel);
} // namespace at::native