// test/cpp/lazy/test_lazy_ops.cpp - platform/external/pytorch - Git at Google

 #include <c10/core/Device.h>
 #include <c10/core/DeviceType.h>
 #include <gtest/gtest.h>
 #include <test/cpp/lazy/test_lazy_ops_util.h>
 #include <torch/csrc/lazy/core/debug_util.h>
 #include <torch/csrc/lazy/core/helpers.h>
 #include <torch/csrc/lazy/core/ir_builder.h>
 #include <torch/csrc/lazy/core/lazy_graph_executor.h>
 #include <torch/csrc/lazy/core/metrics.h>
 #include <torch/csrc/lazy/core/permutation_util.h>
 #include <torch/csrc/lazy/ts_backend/dynamic_ir.h>
 #include <torch/csrc/lazy/ts_backend/ts_backend_impl.h>
 #include <torch/torch.h>
 #include <iostream>

 namespace torch {
 namespace lazy {

 // Lazy Tensor is disabled in FBCODE until addressing non-virtual methods (e.g.
 // sizes) in TensorImpl
 #ifndef FBCODE_CAFFE2

 namespace {
 // Registers the TorchScript backend; the lazy device does not work until this
 // has been called. Always returns true so the result can seed a static flag.
 // FIXME: This registers the backend for the whole test binary. We should
 // probably do it and undo it in the test fixture below.
 static inline bool init_backend() {
   torch::lazy::InitTorchScriptBackend();
   return true;
 }

 // Namespace-scope flag whose initializer performs the backend registration as
 // a side effect.
 static const bool backend_initialized = init_backend();

 } // namespace

 // Base gtest fixture for the lazy tensor tests in this file. SetUp() and
 // TearDown() are defined out of line after the class; every other member is
 // an empty placeholder that performs no work.
 class LazyTsTest : public ::testing::Test {
  protected:
   // Per-test initialization; defined out of line.
   void SetUp() override;

   // Per-test cleanup; defined out of line.
   void TearDown() override;

   // Empty placeholder.
   static void CommonSetup() {}

   // Empty placeholder: neither argument is inspected.
   void ExpectCounterNotChanged(
       const std::string& counter_regex,
       const std::unordered_set<std::string>* ignore_set) {}

   // Empty placeholder: neither argument is inspected.
   void ExpectCounterChanged(
       const std::string& counter_regex,
       const std::unordered_set<std::string>* ignore_set) {}

   // Empty placeholder.
   void ResetCounters() {}

  private:
   // Empty placeholder.
   void MakeEndSnapshot() {}
 };

 // Intermediate fixture layered on LazyTsTest; it only adds an empty
 // suite-level setup hook.
 class LazyOpsTestBase : public LazyTsTest {
  protected:
   // Empty suite-level setup hook.
   static void SetUpTestCase() {}
 };

 // Seeds the eager generator and the lazy graph executor with the same fixed
 // value before each test.
 void LazyTsTest::SetUp() {
   // Reference the flag so it is not reported as an unused variable.
   static_cast<void>(backend_initialized);

   constexpr int kSeed = 42;
   at::manual_seed(kSeed);
   torch::lazy::LazyGraphExecutor::Get()->SetRngSeed(
       torch::lazy::BackendDevice(), kSeed);
 }

 // Per-test cleanup hook; currently does nothing.
 void LazyTsTest::TearDown() {}

 namespace {
 using torch::lazy::DebugUtil;

 class LazyOpsTest : public LazyOpsTestBase {};

 static inline bool IsCuda() {
   return torch::lazy::getBackend()->EagerFallbackDeviceType() == at::kCUDA;
 }

 static inline at::DeviceType DefaultDevice() {
   return torch::lazy::getBackend()->EagerFallbackDeviceType();
 }

 } // namespace

 // Compares torch::scalar_tensor built on the eager fallback device with the
 // same call targeting the device handed to the ForEachDevice callback.
 TEST_F(LazyOpsTest, TestScalarTensor) {
   torch::Tensor scalar_tensor = torch::scalar_tensor(
       1., torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   ForEachDevice([&](const torch::Device& device) {
     // Use the callback's device rather than a hard-coded torch::kLazy so the
     // parameter is not silently ignored.
     torch::Tensor lazy_scalar_tensor = torch::scalar_tensor(
         1., torch::TensorOptions(torch::kFloat).device(device));
     AllClose(scalar_tensor, lazy_scalar_tensor);
   });
 }

 // A clone of a lazy tensor must keep matching the reference data even after
 // the tensor it was cloned from is modified in place.
 TEST_F(LazyOpsTest, TestClone) {
   ForEachDevice([&](const torch::Device& device) {
     const auto float_opts =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor reference = torch::rand({2, 2}, float_opts);
     torch::Tensor lazy_source = CopyToDevice(reference, device);
     torch::Tensor lazy_clone = lazy_source.clone();
     AllClose(reference, lazy_clone);
     // Mutate only the source; the clone is expected to be unaffected.
     lazy_source.add_(1.0);
     AllClose(reference, lazy_clone);
   });
 }

 // Copying a tensor to the lazy device must preserve its values.
 TEST_F(LazyOpsTest, TestTo) {
   ForEachDevice([&](const torch::Device& device) {
     const auto float_opts =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor reference = torch::rand({2, 2}, float_opts);
     torch::Tensor lazy_copy = CopyToDevice(reference, device);
     AllClose(reference, lazy_copy);
   });
 }

 // torch::is_floating_point must give the same answer for a tensor and its
 // lazy copy.
 TEST_F(LazyOpsTest, TestIsFloatingPoint) {
   ForEachDevice([&](const torch::Device& device) {
     const auto float_opts =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor reference = torch::rand({2, 2}, float_opts);
     torch::Tensor lazy_copy = CopyToDevice(reference, device);
     const bool eager_answer = torch::is_floating_point(reference);
     const bool lazy_answer = torch::is_floating_point(lazy_copy);
     EXPECT_EQ(eager_answer, lazy_answer);
   });
 }

 // torch::is_signed must give the same answer for a tensor and its lazy copy.
 TEST_F(LazyOpsTest, TestIsSigned) {
   ForEachDevice([&](const torch::Device& device) {
     const auto float_opts =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor reference = torch::rand({2, 2}, float_opts);
     torch::Tensor lazy_copy = CopyToDevice(reference, device);
     const bool eager_answer = torch::is_signed(reference);
     const bool lazy_answer = torch::is_signed(lazy_copy);
     EXPECT_EQ(eager_answer, lazy_answer);
   });
 }

 // torch::_cast_Byte on a lazy tensor must equal the eager result exactly.
 TEST_F(LazyOpsTest, TestCastByte) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   // Scale the random samples by 100 before casting.
   torch::Tensor source = torch::rand({2, 2}, float_opts) * 100.0;
   torch::Tensor expected = torch::_cast_Byte(source);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor lazy_result = torch::_cast_Byte(lazy_source);
     AllEqual(expected, lazy_result);
   });
 }

 // torch::_cast_Char on a lazy tensor must equal the eager result exactly.
 TEST_F(LazyOpsTest, TestCastChar) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   // Scale the random samples by 100 before casting.
   torch::Tensor source = torch::rand({2, 2}, float_opts) * 100.0;
   torch::Tensor expected = torch::_cast_Char(source);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor lazy_result = torch::_cast_Char(lazy_source);
     AllEqual(expected, lazy_result);
   });
 }

 // torch::_cast_Short on a lazy tensor must equal the eager result exactly.
 TEST_F(LazyOpsTest, TestCastShort) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   // Scale the random samples by 100 before casting.
   torch::Tensor source = torch::rand({2, 2}, float_opts) * 100.0;
   torch::Tensor expected = torch::_cast_Short(source);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor lazy_result = torch::_cast_Short(lazy_source);
     AllEqual(expected, lazy_result);
   });
 }

 // torch::_cast_Int on a lazy tensor must equal the eager result exactly.
 TEST_F(LazyOpsTest, TestCastInt) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   // Scale the random samples by 100 before casting.
   torch::Tensor source = torch::rand({2, 2}, float_opts) * 100.0;
   torch::Tensor expected = torch::_cast_Int(source);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor lazy_result = torch::_cast_Int(lazy_source);
     AllEqual(expected, lazy_result);
   });
 }

 // torch::_cast_Long on a lazy tensor must equal the eager result exactly.
 TEST_F(LazyOpsTest, TestCastLong) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   // Scale the random samples by 100 before casting.
   torch::Tensor source = torch::rand({2, 2}, float_opts) * 100.0;
   torch::Tensor expected = torch::_cast_Long(source);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor lazy_result = torch::_cast_Long(lazy_source);
     AllEqual(expected, lazy_result);
   });
 }

 // torch::_cast_Float on a lazy tensor must equal the eager result exactly.
 TEST_F(LazyOpsTest, TestCastFloat) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   // Scale the random samples by 100 before casting.
   torch::Tensor source = torch::rand({2, 2}, float_opts) * 100.0;
   torch::Tensor expected = torch::_cast_Float(source);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor lazy_result = torch::_cast_Float(lazy_source);
     AllEqual(expected, lazy_result);
   });
 }

 // Adding two Byte tensors on the lazy device must produce a Byte tensor.
 TEST_F(LazyOpsTest, TestRetainType) {
   const auto lazy_byte_opts =
       torch::TensorOptions(torch::kByte).device(torch::kLazy);
   torch::Tensor lazy_zeros = torch::zeros({2, 2}, lazy_byte_opts);
   torch::Tensor lazy_ones = torch::ones({2, 2}, lazy_byte_opts);
   torch::Tensor lazy_sum = lazy_zeros + lazy_ones;
   EXPECT_EQ(lazy_sum.scalar_type(), torch::ScalarType::Byte);
 }

 // Dividing a Float matmul result by a Double scalar tensor and applying
 // softmax must still yield a Float tensor on the lazy device.
 TEST_F(LazyOpsTest, TestLogicalTypeWithInterop) {
   const auto lazy_float_opts =
       torch::TensorOptions(torch::kFloat).device(torch::kLazy);
   torch::Tensor query = torch::rand({2, 12, 20, 64}, lazy_float_opts);
   torch::Tensor key = torch::rand({2, 12, 64, 20}, lazy_float_opts);
   torch::Tensor raw_scores = torch::matmul(query, key);
   torch::Tensor divisor = torch::scalar_tensor(
       8, torch::TensorOptions(torch::kDouble).device(torch::kLazy));
   torch::Tensor scores = raw_scores / divisor;
   torch::Tensor p_attn = torch::softmax(scores, /*dim=*/-1);
   EXPECT_EQ(p_attn.scalar_type(), torch::ScalarType::Float);
 }

 // Out-of-place torch::add on two Float tensors: lazy result vs eager result.
 TEST_F(LazyOpsTest, TestAdd) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({2, 2}, float_opts);
   torch::Tensor rhs = torch::rand({2, 2}, float_opts);
   torch::Tensor expected = torch::add(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor lazy_result = torch::add(lazy_lhs, lazy_rhs);
     AllClose(expected, lazy_result);
   });
 }

 // Out-of-place torch::add on two Half tensors: lazy result vs eager result.
 TEST_F(LazyOpsTest, TestAddHalf) {
   const auto half_opts =
       torch::TensorOptions(torch::kHalf).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({2, 2}, half_opts);
   torch::Tensor rhs = torch::rand({2, 2}, half_opts);
   torch::Tensor expected = torch::add(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor lazy_result = torch::add(lazy_lhs, lazy_rhs);
     AllClose(expected, lazy_result);
   });
 }

 // torch::add with a Float left operand and a Half right operand: lazy result
 // vs eager result.
 TEST_F(LazyOpsTest, TestAddMixedPrecision) {
   torch::Tensor float_lhs = torch::rand(
       {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor half_rhs = torch::rand(
       {2, 2}, torch::TensorOptions(torch::kHalf).device(DefaultDevice()));
   torch::Tensor expected = torch::add(float_lhs, half_rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(float_lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(half_rhs, device);
     torch::Tensor lazy_result = torch::add(lazy_lhs, lazy_rhs);
     AllClose(expected, lazy_result);
   });
 }

 // In-place add_ with a tensor operand: both the mutated receiver and the
 // returned tensor must match between eager and lazy.
 TEST_F(LazyOpsTest, TestAddInPlace) {
   ForEachDevice([&](const torch::Device& device) {
     const auto float_opts =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor target = torch::rand({2, 2}, float_opts);
     torch::Tensor lazy_target = CopyToDevice(target, device);
     torch::Tensor operand = torch::rand({2, 2}, float_opts);
     torch::Tensor lazy_operand = CopyToDevice(operand, device);
     torch::Tensor returned = target.add_(operand);
     torch::Tensor lazy_returned = lazy_target.add_(lazy_operand);
     AllClose(target, lazy_target);
     AllClose(returned, lazy_returned);
   });
 }

 // Out-of-place torch::add with a scalar operand: lazy result vs eager result.
 TEST_F(LazyOpsTest, TestAddScalar) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, float_opts);
   torch::Scalar increment(1);
   torch::Tensor expected = torch::add(input, increment);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_result = torch::add(lazy_input, increment);
     AllClose(expected, lazy_result);
   });
 }

 // In-place add_ with a scalar operand: both the mutated receiver and the
 // returned tensor must match between eager and lazy.
 TEST_F(LazyOpsTest, TestAddScalarInPlace) {
   torch::Scalar increment(1);
   ForEachDevice([&](const torch::Device& device) {
     const auto float_opts =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor target = torch::rand({2, 2}, float_opts);
     torch::Tensor lazy_target = CopyToDevice(target, device);
     torch::Tensor returned = target.add_(increment);
     torch::Tensor lazy_returned = lazy_target.add_(increment);
     AllClose(target, lazy_target);
     AllClose(returned, lazy_returned);
   });
 }

 // torch::add of a {0, 2} tensor and a {1, 2} tensor: lazy result vs eager
 // result.
 TEST_F(LazyOpsTest, TestAddZeroSizeDim) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor empty_lhs = torch::rand({0, 2}, float_opts);
   torch::Tensor row_rhs = torch::rand({1, 2}, float_opts);
   torch::Tensor expected = torch::add(empty_lhs, row_rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(empty_lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(row_rhs, device);
     torch::Tensor lazy_result = torch::add(lazy_lhs, lazy_rhs);
     AllClose(expected, lazy_result);
   });
 }

 // Out-of-place torch::sub on two Float tensors: lazy result vs eager result.
 TEST_F(LazyOpsTest, TestSub) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({2, 2}, float_opts);
   torch::Tensor rhs = torch::rand({2, 2}, float_opts);
   torch::Tensor expected = torch::sub(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor lazy_result = torch::sub(lazy_lhs, lazy_rhs);
     AllClose(expected, lazy_result);
   });
 }

 // In-place sub_ with a tensor operand: both the mutated receiver and the
 // returned tensor must match between eager and lazy.
 TEST_F(LazyOpsTest, TestSubInPlace) {
   ForEachDevice([&](const torch::Device& device) {
     const auto float_opts =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor target = torch::rand({2, 2}, float_opts);
     torch::Tensor lazy_target = CopyToDevice(target, device);
     torch::Tensor operand = torch::rand({2, 2}, float_opts);
     torch::Tensor lazy_operand = CopyToDevice(operand, device);
     torch::Tensor returned = target.sub_(operand);
     torch::Tensor lazy_returned = lazy_target.sub_(lazy_operand);
     AllClose(target, lazy_target);
     AllClose(returned, lazy_returned);
   });
 }

 // Out-of-place torch::sub with a scalar operand: lazy result vs eager result.
 TEST_F(LazyOpsTest, TestSubScalar) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, float_opts);
   torch::Scalar decrement(1);
   torch::Tensor expected = torch::sub(input, decrement);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_result = torch::sub(lazy_input, decrement);
     AllClose(expected, lazy_result);
   });
 }

 // In-place sub_ with a scalar operand: both the mutated receiver and the
 // returned tensor must match between eager and lazy.
 TEST_F(LazyOpsTest, TestSubScalarInPlace) {
   torch::Scalar decrement(1);
   ForEachDevice([&](const torch::Device& device) {
     const auto float_opts =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor target = torch::rand({2, 2}, float_opts);
     torch::Tensor lazy_target = CopyToDevice(target, device);
     torch::Tensor returned = target.sub_(decrement);
     torch::Tensor lazy_returned = lazy_target.sub_(decrement);
     AllClose(target, lazy_target);
     AllClose(returned, lazy_returned);
   });
 }

 // Out-of-place torch::mul on two Float tensors: lazy result vs eager result.
 TEST_F(LazyOpsTest, TestMul) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({2, 2}, float_opts);
   torch::Tensor rhs = torch::rand({2, 2}, float_opts);
   torch::Tensor expected = torch::mul(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor lazy_result = torch::mul(lazy_lhs, lazy_rhs);
     AllClose(expected, lazy_result);
   });
 }

 // In-place mul_ with a tensor operand: both the mutated receiver and the
 // returned tensor must match between eager and lazy.
 TEST_F(LazyOpsTest, TestMulInPlace) {
   ForEachDevice([&](const torch::Device& device) {
     const auto float_opts =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor target = torch::rand({2, 2}, float_opts);
     torch::Tensor lazy_target = CopyToDevice(target, device);
     torch::Tensor operand = torch::rand({2, 2}, float_opts);
     torch::Tensor lazy_operand = CopyToDevice(operand, device);
     torch::Tensor returned = target.mul_(operand);
     torch::Tensor lazy_returned = lazy_target.mul_(lazy_operand);
     AllClose(target, lazy_target);
     AllClose(returned, lazy_returned);
   });
 }

 // Out-of-place torch::mul with a scalar operand: lazy result vs eager result.
 TEST_F(LazyOpsTest, TestMulScalar) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, float_opts);
   torch::Scalar factor(3);
   torch::Tensor expected = torch::mul(input, factor);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_result = torch::mul(lazy_input, factor);
     AllClose(expected, lazy_result);
   });
 }

 // In-place mul_ with a scalar operand: both the mutated receiver and the
 // returned tensor must match between eager and lazy.
 TEST_F(LazyOpsTest, TestMulScalarInPlace) {
   torch::Scalar factor(3);
   ForEachDevice([&](const torch::Device& device) {
     const auto float_opts =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor target = torch::rand({2, 2}, float_opts);
     torch::Tensor lazy_target = CopyToDevice(target, device);
     torch::Tensor returned = target.mul_(factor);
     torch::Tensor lazy_returned = lazy_target.mul_(factor);
     AllClose(target, lazy_target);
     AllClose(returned, lazy_returned);
   });
 }

 // torch::div over every pairing of the listed dtypes: lazy result vs eager
 // result. Integral divisors are drawn starting at 1, which keeps zero out of
 // the denominator for those types.
 TEST_F(LazyOpsTest, TestDiv) {
   // Builds a {3, 4} operand: torch::rand for floating types, otherwise
   // torch::randint over [low, 100).
   const auto make_operand = [](torch::ScalarType dtype, int64_t low) {
     return isFloatingType(dtype)
         ? torch::rand({3, 4}, torch::TensorOptions(dtype))
         : torch::randint(low, 100, {3, 4}, torch::TensorOptions(dtype));
   };

   for (torch::ScalarType dividend_type :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     torch::Tensor dividend = make_operand(dividend_type, 0);
     for (torch::ScalarType divisor_type :
          {torch::kFloat,
           torch::kByte,
           torch::kChar,
           torch::kShort,
           torch::kInt,
           torch::kLong}) {
       torch::Tensor divisor = make_operand(divisor_type, 1);
       torch::Tensor expected = torch::div(dividend, divisor);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
         torch::Tensor lazy_divisor = CopyToDevice(divisor, device);
         torch::Tensor lazy_result = torch::div(lazy_dividend, lazy_divisor);
         AllClose(expected, lazy_result);
       });
     }
   }
 }

 // torch::div with "trunc", "floor" and no rounding mode, over every pairing
 // of the listed dtypes: lazy result vs eager result.
 TEST_F(LazyOpsTest, TestDivWithRoundingMode) {
   c10::optional<c10::string_view> rounding_modes[] = {
       "trunc", "floor", c10::nullopt};
   for (const auto& mode : rounding_modes) {
     for (torch::ScalarType dividend_type :
          {torch::kFloat,
           torch::kByte,
           torch::kChar,
           torch::kShort,
           torch::kInt,
           torch::kLong}) {
       // Byte gets a non-negative lower bound; the other types start at -100.
       int lower_bound = (dividend_type == torch::kByte) ? 0 : -100;
       const auto dividend_opts = torch::TensorOptions(dividend_type);
       torch::Tensor dividend = isFloatingType(dividend_type)
           ? torch::rand({3, 4}, dividend_opts)
           : torch::randint(lower_bound, 50, {3, 4}, dividend_opts);
       for (torch::ScalarType divisor_type :
            {torch::kFloat,
             torch::kByte,
             torch::kChar,
             torch::kShort,
             torch::kInt,
             torch::kLong}) {
         const auto divisor_opts = torch::TensorOptions(divisor_type);
         torch::Tensor divisor = isFloatingType(divisor_type)
             ? torch::rand({3, 4}, divisor_opts)
             : torch::randint(51, 100, {3, 4}, divisor_opts);
         torch::Tensor expected = torch::div(dividend, divisor, mode);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
           torch::Tensor lazy_divisor = CopyToDevice(divisor, device);
           torch::Tensor lazy_result =
               torch::div(lazy_dividend, lazy_divisor, mode);
           AllClose(expected, lazy_result);
         });
       }
     }
   }
 }

 // In-place div_ with a tensor operand. The lazy copy of `a` is taken before
 // the eager div_ mutates `a`, so both sides start from the same values. Both
 // the mutated receiver and the returned tensor are compared, matching the
 // other in-place arithmetic tests in this file.
 TEST_F(LazyOpsTest, TestDivInPlace) {
   for (torch::ScalarType scalar_type1 : {torch::kFloat}) {
     torch::Tensor a = isFloatingType(scalar_type1)
         ? torch::rand({3, 4}, torch::TensorOptions(scalar_type1))
         : torch::randint(0, 100, {3, 4}, torch::TensorOptions(scalar_type1));
     for (torch::ScalarType scalar_type2 : {torch::kFloat}) {
       torch::Tensor b = isFloatingType(scalar_type2)
           ? torch::rand({3, 4}, torch::TensorOptions(scalar_type2))
           : torch::randint(1, 100, {3, 4}, torch::TensorOptions(scalar_type2));
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_a = CopyToDevice(a, device);
         torch::Tensor c = a.div_(b);
         torch::Tensor lazy_b = CopyToDevice(b, device);
         torch::Tensor lazy_c = lazy_a.div_(lazy_b);
         AllClose(a, lazy_a);
         AllClose(c, lazy_c);
       });
     }
   }
 }

 // In-place div_ with "trunc", "floor" and no rounding mode: the tensor
 // returned by the lazy call must match the eager one.
 TEST_F(LazyOpsTest, TestDivInPlaceWithRoundingMode) {
   c10::optional<c10::string_view> rounding_modes[] = {
       "trunc", "floor", c10::nullopt};
   for (const auto& mode : rounding_modes) {
     for (torch::ScalarType dividend_type : {torch::kFloat}) {
       const auto dividend_opts = torch::TensorOptions(dividend_type);
       torch::Tensor dividend = isFloatingType(dividend_type)
           ? torch::rand({3, 4}, dividend_opts)
           : torch::randint(-100, 100, {3, 4}, dividend_opts);
       for (torch::ScalarType divisor_type : {torch::kFloat}) {
         const auto divisor_opts = torch::TensorOptions(divisor_type);
         torch::Tensor divisor = isFloatingType(divisor_type)
             ? torch::rand({3, 4}, divisor_opts)
             : torch::randint(1, 100, {3, 4}, divisor_opts);
         ForEachDevice([&](const torch::Device& device) {
           // Copy the dividend before the eager div_ mutates it.
           torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
           torch::Tensor returned = dividend.div_(divisor, mode);
           torch::Tensor lazy_divisor = CopyToDevice(divisor, device);
           torch::Tensor lazy_returned =
               lazy_dividend.div_(lazy_divisor, mode);
           AllClose(returned, lazy_returned);
         });
       }
     }
   }
 }

 // torch::div of each listed dtype by a floating-point and by an integral
 // scalar: lazy result vs eager result.
 TEST_F(LazyOpsTest, TestDivScalar) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto opts = torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor dividend = isFloatingType(dtype)
         ? torch::rand({3, 4}, opts)
         : torch::randint(1, 100, {3, 4}, opts);
     for (bool use_float_divisor : {true, false}) {
       torch::Scalar divisor =
           use_float_divisor ? torch::Scalar(3.0) : torch::Scalar(3);
       torch::Tensor expected = torch::div(dividend, divisor);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
         torch::Tensor lazy_result = torch::div(lazy_dividend, divisor);
         AllClose(expected, lazy_result);
       });
     }
   }
 }

 // In-place div_ by a floating-point and by an integral scalar: the tensor
 // returned by the lazy call must match the eager one.
 TEST_F(LazyOpsTest, TestDivScalarInPlace) {
   for (torch::ScalarType dtype : {torch::kFloat}) {
     const auto opts = torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor dividend = isFloatingType(dtype)
         ? torch::rand({3, 4}, opts)
         : torch::randint(1, 100, {3, 4}, opts);
     for (bool use_float_divisor : {true, false}) {
       torch::Scalar divisor =
           use_float_divisor ? torch::Scalar(3.0) : torch::Scalar(3);
       ForEachDevice([&](const torch::Device& device) {
         // Copy the dividend before the eager div_ mutates it.
         torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
         torch::Tensor returned = dividend.div_(divisor);
         torch::Tensor lazy_returned = lazy_dividend.div_(divisor);
         AllClose(returned, lazy_returned);
       });
     }
   }
 }

 // torch::div_out into a preallocated output for Float and Double: lazy
 // output vs eager output.
 TEST_F(LazyOpsTest, TestDivOut) {
   for (torch::ScalarType dtype : {torch::kFloat, torch::kDouble}) {
     const auto opts = torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor dividend = torch::rand({3, 4}, opts);
     torch::Tensor divisor = torch::rand({3, 4}, opts);
     torch::Tensor expected = torch::empty({3, 4}, opts);
     torch::div_out(expected, dividend, divisor);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
       torch::Tensor lazy_divisor = CopyToDevice(divisor, device);
       // Allocate the lazy output with the lazy divisor's options.
       torch::Tensor lazy_out = torch::empty({3, 4}, lazy_divisor.options());
       torch::div_out(lazy_out, lazy_dividend, lazy_divisor);
       AllClose(expected, lazy_out);
     });
   }
 }

 // torch::rsub with scalar `other` and `alpha`: lazy result vs eager result.
 TEST_F(LazyOpsTest, TestRsubScalar) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, float_opts);
   torch::Scalar other(1.5);
   torch::Scalar alpha(2.5);
   torch::Tensor expected = torch::rsub(input, other, alpha);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_result = torch::rsub(lazy_input, other, alpha);
     AllClose(expected, lazy_result);
   });
 }

 // Element-wise torch::ne on two independent random tensors: lazy result must
 // equal the eager result exactly.
 TEST_F(LazyOpsTest, TestNe) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({2, 3}, float_opts);
   torch::Tensor rhs = torch::rand({2, 3}, float_opts);
   torch::Tensor expected = torch::ne(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor lazy_result = torch::ne(lazy_lhs, lazy_rhs);
     AllEqual(expected, lazy_result);
   });
 }

 // In-place ne_: the lazy tensor starts from a pristine copy of the input and
 // must end up matching the eagerly mutated tensor.
 TEST_F(LazyOpsTest, TestNeInplace) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::rand({2, 3}, float_opts);
   torch::Tensor pristine = eager.clone();
   torch::Tensor other = eager.clone();
   // Make the first row of `other` differ from the input.
   other[0] += 1;
   eager.ne_(other);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_target = CopyToDevice(pristine, device);
     torch::Tensor lazy_other = CopyToDevice(other, device);
     lazy_target.ne_(lazy_other);
     AllClose(eager, lazy_target);
   });
 }

 // Element-wise torch::eq of a tensor against its own clone: lazy result must
 // equal the eager result exactly.
 TEST_F(LazyOpsTest, TestEq) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({2, 3}, float_opts);
   torch::Tensor rhs = lhs.clone();
   torch::Tensor expected = torch::eq(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor lazy_result = torch::eq(lazy_lhs, lazy_rhs);
     AllEqual(expected, lazy_result);
   });
 }

 // In-place eq_: the lazy tensor starts from a pristine copy of the input and
 // must end up matching the eagerly mutated tensor.
 TEST_F(LazyOpsTest, TestEqInplace) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::rand({2, 3}, float_opts);
   torch::Tensor other = eager.clone();
   // Make the first row of `other` differ from the input.
   other[0] += 1;
   torch::Tensor pristine = eager.clone();
   eager.eq_(other);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_target = CopyToDevice(pristine, device);
     torch::Tensor lazy_other = CopyToDevice(other, device);
     lazy_target.eq_(lazy_other);
     AllClose(lazy_target, eager);
   });
 }

 // Element-wise torch::ge of a tensor against its own clone: lazy result must
 // equal the eager result exactly.
 TEST_F(LazyOpsTest, TestGe) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({2, 3}, float_opts);
   torch::Tensor rhs = lhs.clone();
   torch::Tensor expected = torch::ge(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor lazy_result = torch::ge(lazy_lhs, lazy_rhs);
     AllEqual(expected, lazy_result);
   });
 }

 // In-place ge_: the lazy tensor starts from a pristine copy of the input and
 // must end up matching the eagerly mutated tensor.
 TEST_F(LazyOpsTest, TestGeInplace) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::rand({2, 3}, float_opts);
   torch::Tensor other = eager.clone();
   // Row 0 of `other` ends up above the input and row 1 below it.
   other[0] += 1;
   other[1] -= 1;
   torch::Tensor pristine = eager.clone();
   eager.ge_(other);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_target = CopyToDevice(pristine, device);
     torch::Tensor lazy_other = CopyToDevice(other, device);
     lazy_target.ge_(lazy_other);
     AllClose(lazy_target, eager);
   });
 }

 // Element-wise torch::le of a tensor against its own clone: lazy result must
 // equal the eager result exactly.
 TEST_F(LazyOpsTest, TestLe) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({2, 3}, float_opts);
   torch::Tensor rhs = lhs.clone();
   torch::Tensor expected = torch::le(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor lazy_result = torch::le(lazy_lhs, lazy_rhs);
     AllEqual(expected, lazy_result);
   });
 }

 // In-place le_: the lazy tensor starts from a pristine copy of the input and
 // must end up matching the eagerly mutated tensor.
 TEST_F(LazyOpsTest, TestLeInplace) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::rand({2, 3}, float_opts);
   torch::Tensor other = eager.clone();
   // Row 0 of `other` ends up above the input and row 1 below it.
   other[0] += 1;
   other[1] -= 1;
   torch::Tensor pristine = eager.clone();
   eager.le_(other);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_target = CopyToDevice(pristine, device);
     torch::Tensor lazy_other = CopyToDevice(other, device);
     lazy_target.le_(lazy_other);
     AllClose(lazy_target, eager);
   });
 }

 // Element-wise torch::gt where the left operand is the input plus one: lazy
 // result must equal the eager result exactly.
 TEST_F(LazyOpsTest, TestGt) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor smaller = torch::rand({2, 3}, float_opts);
   torch::Tensor larger =
       torch::add(smaller.clone(), torch::ones_like(smaller));
   torch::Tensor expected = torch::gt(larger, smaller);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_smaller = CopyToDevice(smaller, device);
     torch::Tensor lazy_larger = CopyToDevice(larger, device);
     torch::Tensor lazy_result = torch::gt(lazy_larger, lazy_smaller);
     AllEqual(expected, lazy_result);
   });
 }

 // In-place gt_: the lazy tensor starts from a pristine copy of the input and
 // must end up matching the eagerly mutated tensor.
 TEST_F(LazyOpsTest, TestGtInplace) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::rand({2, 3}, float_opts);
   torch::Tensor other = eager.clone();
   // Row 0 of `other` ends up above the input and row 1 below it.
   other[0] += 1;
   other[1] -= 1;
   torch::Tensor pristine = eager.clone();
   eager.gt_(other);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_target = CopyToDevice(pristine, device);
     torch::Tensor lazy_other = CopyToDevice(other, device);
     lazy_target.gt_(lazy_other);
     AllClose(lazy_target, eager);
   });
 }

 // Element-wise torch::lt where the right operand is the input plus one: lazy
 // result must equal the eager result exactly.
 TEST_F(LazyOpsTest, TestLt) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor smaller = torch::rand({2, 3}, float_opts);
   torch::Tensor larger =
       torch::add(smaller.clone(), torch::ones_like(smaller));
   torch::Tensor expected = torch::lt(smaller, larger);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_smaller = CopyToDevice(smaller, device);
     torch::Tensor lazy_larger = CopyToDevice(larger, device);
     torch::Tensor lazy_result = torch::lt(lazy_smaller, lazy_larger);
     AllEqual(expected, lazy_result);
   });
 }

 // In-place lt_: the lazy tensor starts from a pristine copy of the input and
 // must end up matching the eagerly mutated tensor.
 TEST_F(LazyOpsTest, TestLtInplace) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::rand({2, 3}, float_opts);
   torch::Tensor other = eager.clone();
   // Row 0 of `other` ends up above the input and row 1 below it.
   other[0] += 1;
   other[1] -= 1;
   torch::Tensor pristine = eager.clone();
   eager.lt_(other);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_target = CopyToDevice(pristine, device);
     torch::Tensor lazy_other = CopyToDevice(other, device);
     lazy_target.lt_(lazy_other);
     AllClose(lazy_target, eager);
   });
 }

 // torch::ne of an all-ones tensor against the scalar 0: lazy result must
 // equal the eager result exactly.
 TEST_F(LazyOpsTest, TestNeScalar) {
   torch::Tensor ones = torch::ones({2, 3});
   torch::Scalar threshold(0.0f);
   torch::Tensor expected = torch::ne(ones, threshold);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_ones = CopyToDevice(ones, device);
     torch::Tensor lazy_result = torch::ne(lazy_ones, threshold);
     AllEqual(expected, lazy_result);
   });
 }

 // torch::eq of an all-ones tensor against the scalar 1: lazy result must
 // equal the eager result exactly.
 TEST_F(LazyOpsTest, TestEqScalar) {
   torch::Tensor ones = torch::ones({2, 3});
   torch::Scalar threshold(1.0f);
   torch::Tensor expected = torch::eq(ones, threshold);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_ones = CopyToDevice(ones, device);
     torch::Tensor lazy_result = torch::eq(lazy_ones, threshold);
     AllEqual(expected, lazy_result);
   });
 }

 // torch::ge of an all-ones tensor against the scalar 1: lazy result must
 // equal the eager result exactly.
 TEST_F(LazyOpsTest, TestGeScalar) {
   torch::Tensor ones = torch::ones({2, 3});
   torch::Scalar threshold(1.0f);
   torch::Tensor expected = torch::ge(ones, threshold);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_ones = CopyToDevice(ones, device);
     torch::Tensor lazy_result = torch::ge(lazy_ones, threshold);
     AllEqual(expected, lazy_result);
   });
 }

 // In-place ge_ against the scalar 0 on torch::arange(-1, 1.5, 0.5): the lazy
 // tensor must end up matching the eagerly mutated one.
 TEST_F(LazyOpsTest, TestGeScalarInplace) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::arange(-1., 1.5, 0.5, float_opts);
   torch::Scalar threshold(0.0f);
   // Keep the unmodified values for the lazy side.
   torch::Tensor pristine = eager.clone();
   eager.ge_(threshold);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_target = CopyToDevice(pristine, device);
     lazy_target.ge_(threshold);
     AllClose(lazy_target, eager);
   });
 }

 // torch::le of an all-ones tensor against the scalar 1: lazy result must
 // equal the eager result exactly.
 TEST_F(LazyOpsTest, TestLeScalar) {
   torch::Tensor ones = torch::ones({2, 3});
   torch::Scalar threshold(1.0f);
   torch::Tensor expected = torch::le(ones, threshold);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_ones = CopyToDevice(ones, device);
     torch::Tensor lazy_result = torch::le(lazy_ones, threshold);
     AllEqual(expected, lazy_result);
   });
 }

 // In-place le_ against the scalar 0 on torch::arange(-1, 1.5, 0.5): the lazy
 // tensor must end up matching the eagerly mutated one.
 TEST_F(LazyOpsTest, TestLeScalarInplace) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::arange(-1., 1.5, 0.5, float_opts);
   torch::Scalar threshold(0.0f);
   // Keep the unmodified values for the lazy side.
   torch::Tensor pristine = eager.clone();
   eager.le_(threshold);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_target = CopyToDevice(pristine, device);
     lazy_target.le_(threshold);
     AllClose(lazy_target, eager);
   });
 }

 // torch::gt of an all-ones tensor against the scalar 0.5: lazy result must
 // equal the eager result exactly.
 TEST_F(LazyOpsTest, TestGtScalar) {
   torch::Tensor ones = torch::ones({2, 3});
   torch::Scalar threshold(0.5f);
   torch::Tensor expected = torch::gt(ones, threshold);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_ones = CopyToDevice(ones, device);
     torch::Tensor lazy_result = torch::gt(lazy_ones, threshold);
     AllEqual(expected, lazy_result);
   });
 }

 // In-place gt_ against the scalar 0 on torch::arange(-1, 1.5, 0.5): the lazy
 // tensor must end up matching the eagerly mutated one.
 TEST_F(LazyOpsTest, TestGtScalarInplace) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::arange(-1., 1.5, 0.5, float_opts);
   torch::Scalar threshold(0.0f);
   // Keep the unmodified values for the lazy side.
   torch::Tensor pristine = eager.clone();
   eager.gt_(threshold);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_target = CopyToDevice(pristine, device);
     lazy_target.gt_(threshold);
     AllClose(lazy_target, eager);
   });
 }

 // Out-of-place `lt` against a scalar; results must be exactly equal on every
 // device visited by ForEachDevice.
 TEST_F(LazyOpsTest, TestLtScalar) {
   const torch::Tensor reference_input = torch::ones({2, 3});
   const torch::Scalar bound(1.5f);
   const torch::Tensor expected = torch::lt(reference_input, bound);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(reference_input, device);
     const torch::Tensor lazy_out = torch::lt(lazy_in, bound);
     AllEqual(expected, lazy_out);
   });
 }

 // In-place `lt_` against a scalar: the tensor built on DefaultDevice() is the
 // expectation, and every device visited by ForEachDevice must match it.
 TEST_F(LazyOpsTest, TestLtScalarInplace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor expected = torch::arange(-1., 1.5, 0.5, options);
   const torch::Scalar threshold(0.0f);
   // Snapshot the untouched values before `expected` is overwritten in place.
   const torch::Tensor pristine = expected.clone();
   expected.lt_(threshold);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_tensor = CopyToDevice(pristine, device);
     lazy_tensor.lt_(threshold);
     AllClose(lazy_tensor, expected);
   });
 }

 // Adds a scalar one to a random tensor of each integral dtype and requires
 // the result on every device visited by ForEachDevice to equal the eager
 // result exactly.
 TEST_F(LazyOpsTest, TestIntegerAdd) {
   std::vector<torch::ScalarType> types(
       {torch::kByte, torch::kChar, torch::kShort, torch::kInt, torch::kLong});

   ForEachDevice([&](const torch::Device& device) {
     for (auto type : types) {
       // randint's upper bound is exclusive, so values lie in [0, 62] and
       // adding one cannot overflow even the 8-bit dtypes.
       torch::Tensor b =
           torch::randint(0, 63, {2, 2}, torch::TensorOptions(type));
       torch::Scalar one =
           isIntegralType(type, false) ? torch::Scalar(1) : torch::Scalar(1.0);
       torch::Tensor c = torch::add(b, one);

       torch::Tensor lazy_b = CopyToDevice(b, device);
       torch::Tensor lazy_c = torch::add(lazy_b, one);

       AllEqual(c, lazy_c);
     }
   });
 }

 // Runs torch::svd on random matrices of every shape in {4, 7} x {4, 7} and
 // compares all three outputs between the eager result and each device
 // visited by ForEachDevice.
 TEST_F(LazyOpsTest, TestSVD) {
   static const int dims[] = {4, 7};
   const double rtol = 1e-3;
   const double atol = 1e-4;
   for (auto rows : dims) {
     for (auto cols : dims) {
       torch::Tensor matrix = torch::rand(
           {rows, cols},
           torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
       auto expected = torch::svd(matrix, /*some=*/true, /*compute_uv=*/true);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_matrix = CopyToDevice(matrix, device);
         auto actual =
             torch::svd(lazy_matrix, /*some=*/true, /*compute_uv=*/true);
         // The U and V matrices might have different sign for column vectors,
         // so they can only be compared by absolute value.
         AllClose(
             std::get<0>(expected).abs(),
             std::get<0>(actual).abs(),
             /*rtol=*/rtol,
             /*atol=*/atol);
         // The second output is checked for shape first, then for values.
         torch::Tensor singular = std::get<1>(expected);
         torch::Tensor lazy_singular = std::get<1>(actual);
         ASSERT_EQ(singular.sizes(), lazy_singular.sizes());
         AllClose(singular, lazy_singular, /*rtol=*/rtol, /*atol=*/atol);
         AllClose(
             std::get<2>(expected).abs(),
             std::get<2>(actual).abs(),
             /*rtol=*/rtol,
             /*atol=*/atol);
       });
     }
   }
 }

 // Runs torch::qr on random matrices of every shape in {4, 7} x {4, 7}. Both
 // outputs are passed through abs() before comparison, so only magnitudes are
 // checked.
 TEST_F(LazyOpsTest, TestQR) {
   static const int dims[] = {4, 7};
   const double rtol = 1e-3;
   const double atol = 1e-4;
   for (auto rows : dims) {
     for (auto cols : dims) {
       torch::Tensor matrix = torch::rand(
           {rows, cols},
           torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
       auto expected = torch::qr(matrix);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_matrix = CopyToDevice(matrix, device);
         auto actual = torch::qr(lazy_matrix);
         AllClose(
             std::get<0>(expected).abs(),
             std::get<0>(actual).abs(),
             /*rtol=*/rtol,
             /*atol=*/atol);
         AllClose(
             std::get<1>(expected).abs(),
             std::get<1>(actual).abs(),
             /*rtol=*/rtol,
             /*atol=*/atol);
       });
     }
   }
 }

 // Runs torch::cholesky (upper and lower) on batches of three m x m matrices
 // built as A * A^T + I, for m in {4, 7}.
 TEST_F(LazyOpsTest, TestCholesky) {
   static const int dims[] = {4, 7};
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   for (auto m : dims) {
     for (bool upper : {true, false}) {
       torch::Tensor a = torch::rand({3, m, m}, options);
       // Gram matrix plus identity is the input handed to cholesky.
       torch::Tensor gram = torch::matmul(a, torch::transpose(a, 1, 2));
       torch::Tensor pd_a = gram + torch::eye(m, options);
       auto expected = torch::cholesky(pd_a, upper);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_pd_a = CopyToDevice(pd_a, device);
         auto actual = torch::cholesky(lazy_pd_a, upper);
         AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-4);
       });
     }
   }
 }

 // Runs torch::logdet on batches of three m x m matrices built as
 // A * A^T + I, for m in {4, 7}.
 TEST_F(LazyOpsTest, TestLogDet) {
   static const int dims[] = {4, 7};
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   for (auto m : dims) {
     torch::Tensor a = torch::rand({3, m, m}, options);
     // Gram matrix plus identity is the input handed to logdet.
     torch::Tensor gram = torch::matmul(a, torch::transpose(a, 1, 2));
     torch::Tensor pd_a = gram + torch::eye(m, options);
     torch::Tensor expected = torch::logdet(pd_a);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_pd_a = CopyToDevice(pd_a, device);
       torch::Tensor actual = torch::logdet(lazy_pd_a);
       AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-4);
     });
   }
 }

 // Sweeps torch::triangular_solve over every combination of: batched or
 // unbatched coefficient matrix, batched or unbatched right-hand side, sizes
 // m, n in {4, 7}, and the upper / transpose / unitriangular flags. Both
 // returned tensors are compared against the eager result.
 TEST_F(LazyOpsTest, TestTriangularSolve) {
   static const int dims[] = {4, 7};
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const double rtol = 1e-3;
   const double atol = 1e-4;
   for (bool batched_a : {true, false}) {
     for (bool batched_b : {true, false}) {
       for (auto m : dims) {
         for (auto n : dims) {
           for (bool upper : {true, false}) {
             for (bool transpose : {true, false}) {
               for (bool unitriangular : {true, false}) {
                 torch::Tensor coeffs = torch::randn({m, m}, options);
                 torch::Tensor rhs = torch::randn({m, n}, options);
                 // Batched variants repeat the same matrix three times and
                 // materialize the expansion with clone().
                 if (batched_a) {
                   coeffs = coeffs.expand({3, m, m}).clone();
                 }
                 if (batched_b) {
                   rhs = rhs.expand({3, m, n}).clone();
                 }
                 auto expected = torch::triangular_solve(
                     rhs,
                     coeffs,
                     /*upper=*/upper,
                     /*transpose=*/transpose,
                     /*unitriangular=*/unitriangular);
                 ForEachDevice([&](const torch::Device& device) {
                   torch::Tensor lazy_coeffs = CopyToDevice(coeffs, device);
                   torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
                   auto actual = torch::triangular_solve(
                       lazy_rhs,
                       lazy_coeffs,
                       /*upper=*/upper,
                       /*transpose=*/transpose,
                       /*unitriangular=*/unitriangular);
                   AllClose(
                       std::get<0>(expected),
                       std::get<0>(actual),
                       /*rtol=*/rtol,
                       /*atol=*/atol);
                   AllClose(
                       std::get<1>(expected),
                       std::get<1>(actual),
                       /*rtol=*/rtol,
                       /*atol=*/atol);
                 });
               }
             }
           }
         }
       }
     }
   }
 }

 // Checks torch::kthvalue for k in [1, 3] along every dimension (negative
 // indices included), with and without keepdim. Values are compared with
 // AllClose and indices with AllEqual.
 TEST_F(LazyOpsTest, TestKthValue) {
   const torch::Tensor source = torch::rand(
       {4, 5, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int rank = source.dim();
   for (int k = 1; k <= 3; ++k) {
     for (int dim = -rank; dim < rank; ++dim) {
       for (bool keepdim : {false, true}) {
         auto expected = torch::kthvalue(source, k, dim, keepdim);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_source = CopyToDevice(source, device);
           auto actual = torch::kthvalue(lazy_source, k, dim, keepdim);
           AllClose(std::get<0>(expected), std::get<0>(actual));
           AllEqual(std::get<1>(expected), std::get<1>(actual));
         });
       }
     }
   }
 }

 // Checks sorted torch::topk for k in [1, 3] along every dimension (negative
 // indices included), for both the largest and smallest selections. Values
 // are compared with AllClose and indices with AllEqual.
 TEST_F(LazyOpsTest, TestTopK) {
   const torch::Tensor source = torch::rand(
       {4, 5, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int rank = source.dim();
   for (int k = 1; k <= 3; ++k) {
     for (int dim = -rank; dim < rank; ++dim) {
       for (bool largest : {false, true}) {
         auto expected =
             torch::topk(source, k, dim, largest, /*sorted=*/true);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_source = CopyToDevice(source, device);
           auto actual =
               torch::topk(lazy_source, k, dim, largest, /*sorted=*/true);
           AllClose(std::get<0>(expected), std::get<0>(actual));
           AllEqual(std::get<1>(expected), std::get<1>(actual));
         });
       }
     }
   }
 }

 // Checks torch::sort along each of the three dimensions, ascending and
 // descending. Sorted values are compared with AllClose and the index tensor
 // with AllEqual.
 //
 // torch::sort takes no `k` argument, so each (dim, descending) combination
 // is exercised exactly once.
 TEST_F(LazyOpsTest, TestSort) {
   torch::Tensor a = torch::rand(
       {4, 5, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int dim = 0; dim < 3; ++dim) {
     for (bool descending : {false, true}) {
       auto b = torch::sort(a, dim, descending);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_a = CopyToDevice(a, device);
         auto lazy_b = torch::sort(lazy_a, dim, descending);
         AllClose(std::get<0>(b), std::get<0>(lazy_b));
         AllEqual(std::get<1>(b), std::get<1>(lazy_b));
       });
     }
   }
 }

 // Descending sort of a kChar tensor that contains -128, the smallest int8_t
 // value. Both the sorted values and the indices must match exactly.
 TEST_F(LazyOpsTest, TestSortDescWithMinValue) {
   std::vector<int8_t> raw{-128, 100};
   const torch::Tensor source =
       torch::tensor(raw, torch::TensorOptions(torch::kChar));
   auto expected = torch::sort(source, /*dim=*/0, /*descending=*/true);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     auto actual = torch::sort(lazy_source, /*dim=*/0, /*descending=*/true);
     AllEqual(std::get<0>(expected), std::get<0>(actual));
     AllEqual(std::get<1>(expected), std::get<1>(actual));
   });
 }

 // Checks torch::argsort along each of the three dimensions, ascending and
 // descending; the index tensors must match exactly.
 //
 // torch::argsort takes no `k` argument, so each (dim, descending)
 // combination is exercised exactly once.
 TEST_F(LazyOpsTest, TestArgSort) {
   torch::Tensor a = torch::rand(
       {4, 5, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int dim = 0; dim < 3; ++dim) {
     for (bool descending : {false, true}) {
       torch::Tensor b = torch::argsort(a, dim, descending);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_a = CopyToDevice(a, device);
         torch::Tensor lazy_b = torch::argsort(lazy_a, dim, descending);
         AllEqual(b, lazy_b);
       });
     }
   }
 }

 // Binary torch::min of two random 2x2 float tensors.
 TEST_F(LazyOpsTest, TestMin) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor lhs = torch::rand({2, 2}, options);
   const torch::Tensor rhs = torch::rand({2, 2}, options);
   const torch::Tensor expected = torch::min(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual = torch::min(lazy_lhs, lazy_rhs);
     AllClose(expected, actual);
   });
 }

 // Binary torch::max of two random 2x2 float tensors.
 TEST_F(LazyOpsTest, TestMax) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor lhs = torch::rand({2, 2}, options);
   const torch::Tensor rhs = torch::rand({2, 2}, options);
   const torch::Tensor expected = torch::max(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual = torch::max(lazy_lhs, lazy_rhs);
     AllClose(expected, actual);
   });
 }

 // Single-argument torch::min over a random 2x2 float tensor.
 TEST_F(LazyOpsTest, TestUnaryMin) {
   const torch::Tensor source = torch::rand(
       {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const torch::Tensor expected = torch::min(source);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor actual = torch::min(lazy_source);
     AllClose(expected, actual);
   });
 }

 // Single-argument torch::max over a random 2x2 float tensor.
 TEST_F(LazyOpsTest, TestUnaryMax) {
   const torch::Tensor source = torch::rand(
       {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const torch::Tensor expected = torch::max(source);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor actual = torch::max(lazy_source);
     AllClose(expected, actual);
   });
 }

 // Full-tensor torch::all across one floating dtype and five integral dtypes.
 TEST_F(LazyOpsTest, TestAll) {
   for (torch::ScalarType scalar_type :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto options =
         torch::TensorOptions(scalar_type).device(DefaultDevice());
     // Floating dtypes draw from rand, every other dtype from randint(100).
     torch::Tensor source;
     if (isFloatingType(scalar_type)) {
       source = torch::rand({3, 4}, options);
     } else {
       source = torch::randint(100, {3, 4}, options);
     }
     torch::Tensor expected = torch::all(source);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual = torch::all(lazy_source);
       EqualValues(expected, actual);
     });
   }
 }

 // torch::all along each dimension (negative indices included) of a random
 // kByte tensor, without keepdim.
 TEST_F(LazyOpsTest, TestAllDim) {
   const torch::Tensor source = torch::randint(
       0,
       5,
       {2, 3, 4},
       torch::TensorOptions(torch::kByte).device(DefaultDevice()));
   const int rank = source.dim();
   for (int dim = -rank; dim < rank; ++dim) {
     torch::Tensor expected = torch::all(source, dim, /*keepdim=*/false);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual = torch::all(lazy_source, dim, /*keepdim=*/false);
       EqualValues(expected, actual);
     });
   }
 }

 // torch::all along each dimension (negative indices included) of a random
 // kByte tensor, with keepdim enabled.
 TEST_F(LazyOpsTest, TestAllDimKeep) {
   const torch::Tensor source = torch::randint(
       0,
       5,
       {2, 3, 4},
       torch::TensorOptions(torch::kByte).device(DefaultDevice()));
   const int rank = source.dim();
   for (int dim = -rank; dim < rank; ++dim) {
     torch::Tensor expected = torch::all(source, dim, /*keepdim=*/true);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual = torch::all(lazy_source, dim, /*keepdim=*/true);
       EqualValues(expected, actual);
     });
   }
 }

 // Checks torch::amax over every single dimension and over every pair of
 // distinct dimensions (negative indices included), with and without keepdim.
 TEST_F(LazyOpsTest, TestAmax) {
   torch::Tensor input = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   int rank = input.dim();
   for (bool keepdim : {false, true}) {
     // Single-dimension reductions.
     for (int dim = -rank; dim < rank; ++dim) {
       torch::Tensor values = torch::amax(input, {dim}, /*keepdim=*/keepdim);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_input = CopyToDevice(input, device);
         torch::Tensor lazy_values =
             torch::amax(lazy_input, {dim}, /*keepdim=*/keepdim);
         AllClose(values, lazy_values);
       });
     }
     // Two-dimension reductions.
     for (int dim1 = -rank; dim1 < rank; ++dim1) {
       for (int dim2 = -rank; dim2 < rank; ++dim2) {
         // Skip pairs that name the same axis, either literally or through
         // the negative/non-negative aliases of one axis.
         if ((dim1 == dim2) || (dim1 == rank + dim2) || (dim2 == rank + dim1))
           continue;
         torch::Tensor values =
             torch::amax(input, {dim1, dim2}, /*keepdim=*/keepdim);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_input = CopyToDevice(input, device);
           torch::Tensor lazy_values =
               torch::amax(lazy_input, {dim1, dim2}, /*keepdim=*/keepdim);
           AllClose(values, lazy_values);
         });
       }
     }
   }
   ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
   // Use the "lazy::" counter prefix, matching the "lazy::var" check in this
   // file, instead of the "xla::" prefix.
   // NOTE(review): confirm the exact counter name before the fixture's counter
   // checks are given a real implementation.
   ExpectCounterChanged("lazy::amax", GetIgnoredCounters());
 }

 // Checks torch::amin over every single dimension and over every pair of
 // distinct dimensions (negative indices included), with and without keepdim.
 TEST_F(LazyOpsTest, TestAmin) {
   torch::Tensor input = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   int rank = input.dim();
   for (bool keepdim : {false, true}) {
     // Single-dimension reductions.
     for (int dim = -rank; dim < rank; ++dim) {
       torch::Tensor values = torch::amin(input, {dim}, /*keepdim=*/keepdim);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_input = CopyToDevice(input, device);
         torch::Tensor lazy_values =
             torch::amin(lazy_input, {dim}, /*keepdim=*/keepdim);
         AllClose(values, lazy_values);
       });
     }
     // Two-dimension reductions.
     for (int dim1 = -rank; dim1 < rank; ++dim1) {
       for (int dim2 = -rank; dim2 < rank; ++dim2) {
         // Skip pairs that name the same axis, either literally or through
         // the negative/non-negative aliases of one axis.
         if ((dim1 == dim2) || (dim1 == rank + dim2) || (dim2 == rank + dim1))
           continue;
         torch::Tensor values =
             torch::amin(input, {dim1, dim2}, /*keepdim=*/keepdim);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_input = CopyToDevice(input, device);
           torch::Tensor lazy_values =
               torch::amin(lazy_input, {dim1, dim2}, /*keepdim=*/keepdim);
           AllClose(values, lazy_values);
         });
       }
     }
   }
   ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
   // Use the "lazy::" counter prefix, matching the "lazy::var" check in this
   // file, instead of the "xla::" prefix.
   // NOTE(review): confirm the exact counter name before the fixture's counter
   // checks are given a real implementation.
   ExpectCounterChanged("lazy::amin", GetIgnoredCounters());
 }

 // Full-tensor torch::any across one floating dtype and five integral dtypes.
 TEST_F(LazyOpsTest, TestAny) {
   for (torch::ScalarType scalar_type :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto options =
         torch::TensorOptions(scalar_type).device(DefaultDevice());
     // Floating dtypes draw from rand, every other dtype from randint(100).
     torch::Tensor source;
     if (isFloatingType(scalar_type)) {
       source = torch::rand({3, 4}, options);
     } else {
       source = torch::randint(100, {3, 4}, options);
     }
     torch::Tensor expected = torch::any(source);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual = torch::any(lazy_source);
       EqualValues(expected, actual);
     });
   }
 }

 // torch::any along each dimension (negative indices included) of a random
 // kByte tensor, without keepdim.
 TEST_F(LazyOpsTest, TestAnyDim) {
   const torch::Tensor source = torch::randint(
       0,
       5,
       {2, 3, 4},
       torch::TensorOptions(torch::kByte).device(DefaultDevice()));
   const int rank = source.dim();
   for (int dim = -rank; dim < rank; ++dim) {
     torch::Tensor expected = torch::any(source, dim, /*keepdim=*/false);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual = torch::any(lazy_source, dim, /*keepdim=*/false);
       EqualValues(expected, actual);
     });
   }
 }

 // torch::any along each dimension (negative indices included) of a random
 // kByte tensor, with keepdim enabled.
 TEST_F(LazyOpsTest, TestAnyDimKeep) {
   const torch::Tensor source = torch::randint(
       0,
       5,
       {2, 3, 4},
       torch::TensorOptions(torch::kByte).device(DefaultDevice()));
   const int rank = source.dim();
   for (int dim = -rank; dim < rank; ++dim) {
     torch::Tensor expected = torch::any(source, dim, /*keepdim=*/true);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual = torch::any(lazy_source, dim, /*keepdim=*/true);
       EqualValues(expected, actual);
     });
   }
 }

 // Full-tensor torch::mean; the result shape is asserted before the values
 // are compared.
 TEST_F(LazyOpsTest, TestMean) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const torch::Tensor expected = torch::mean(source);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor actual = torch::mean(lazy_source);
     ASSERT_EQ(expected.sizes(), actual.sizes());
     AllClose(expected, actual);
   });
 }

 // Full-tensor torch::mean of a float tensor with torch::kDouble passed as the
 // dtype argument.
 TEST_F(LazyOpsTest, TestMeanCast) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const torch::Tensor expected = torch::mean(source, torch::kDouble);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor actual = torch::mean(lazy_source, torch::kDouble);
     AllClose(expected, actual);
   });
 }

 // torch::mean reduced over one dimension at a time, for every valid
 // dimension index (negative indices included).
 TEST_F(LazyOpsTest, TestMeanInDim) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int rank = source.dim();
   for (int dim = -rank; dim < rank; ++dim) {
     torch::Tensor expected = torch::mean(source, {dim});
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual = torch::mean(lazy_source, {dim});
       AllClose(expected, actual);
     });
   }
 }

 // torch::mean reduced over two dimensions at once, written with
 // non-negative and with negative indices.
 TEST_F(LazyOpsTest, TestMeanInDims) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (const auto& reduce_dims :
        std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) {
     torch::Tensor expected = torch::mean(source, reduce_dims);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual = torch::mean(lazy_source, reduce_dims);
       AllClose(expected, actual);
     });
   }
 }

 // torch::mean over two dimensions with keepdim enabled and torch::kDouble
 // passed as the dtype argument.
 TEST_F(LazyOpsTest, TestMeanInDimsKeepCast) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (const auto& reduce_dims :
        std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) {
     torch::Tensor expected =
         torch::mean(source, reduce_dims, true, torch::kDouble);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual =
           torch::mean(lazy_source, reduce_dims, true, torch::kDouble);
       AllClose(expected, actual);
     });
   }
 }

 // torch::mean_out writing into a preallocated 4x4 tensor. The 4x4x4 source
 // means a reduction over any single dimension yields a 4x4 result.
 TEST_F(LazyOpsTest, TestMeanInDimOut) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor source = torch::rand({4, 4, 4}, options);
   const int rank = source.dim();
   for (int dim = -rank; dim < rank; ++dim) {
     torch::Tensor expected = torch::empty({4, 4}, options);
     torch::mean_out(expected, source, {dim});
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       // The destination takes its options from the copied source tensor.
       torch::Tensor actual = torch::empty({4, 4}, lazy_source.options());
       torch::mean_out(actual, lazy_source, {dim});
       AllClose(expected, actual);
     });
   }
 }

 // Full-tensor torch::std, with the unbiased flag both on and off.
 TEST_F(LazyOpsTest, TestStd) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (bool unbiased : {true, false}) {
     torch::Tensor expected = torch::std(source, unbiased);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual = torch::std(lazy_source, unbiased);
       AllClose(expected, actual);
     });
   }
 }

 // torch::std over one dimension at a time, sweeping unbiased x keepdim x
 // every valid dimension index (negative indices included).
 TEST_F(LazyOpsTest, TestStdInDim) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int rank = source.dim();
   for (bool unbiased : {true, false}) {
     for (bool keepdim : {true, false}) {
       for (int dim = -rank; dim < rank; ++dim) {
         torch::Tensor expected =
             torch::std(source, {dim}, unbiased, keepdim);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_source = CopyToDevice(source, device);
           torch::Tensor actual =
               torch::std(lazy_source, {dim}, unbiased, keepdim);
           AllClose(expected, actual);
         });
       }
     }
   }
 }

 // torch::std with an explicit correction argument (1, 2, or nullopt) over
 // two dimensions, with and without keepdim.
 TEST_F(LazyOpsTest, TestStdWithCorrection) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   c10::optional<c10::Scalar> corrections[] = {1, 2, c10::nullopt};
   for (const auto& correction : corrections) {
     for (bool keepdim : {true, false}) {
       for (const auto& reduce_dims :
            std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) {
         torch::Tensor expected =
             torch::std(source, reduce_dims, correction, keepdim);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_source = CopyToDevice(source, device);
           torch::Tensor actual =
               torch::std(lazy_source, reduce_dims, correction, keepdim);
           AllClose(expected, actual);
         });
       }
     }
   }
 }

 // torch::std_mean with an explicit correction argument (1, 2, or nullopt)
 // over two dimensions, with and without keepdim. Both returned tensors are
 // compared.
 TEST_F(LazyOpsTest, TestStdMeanWithCorrection) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   c10::optional<c10::Scalar> corrections[] = {1, 2, c10::nullopt};
   for (const auto& correction : corrections) {
     for (bool keepdim : {true, false}) {
       for (const auto& reduce_dims :
            std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) {
         auto expected =
             torch::std_mean(source, reduce_dims, correction, keepdim);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_source = CopyToDevice(source, device);
           auto actual =
               torch::std_mean(lazy_source, reduce_dims, correction, keepdim);
           AllClose(std::get<0>(expected), std::get<0>(actual));
           AllClose(std::get<1>(expected), std::get<1>(actual));
         });
       }
     }
   }
 }

 // Full-tensor torch::sum of a random float tensor.
 TEST_F(LazyOpsTest, TestSum) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const torch::Tensor expected = torch::sum(source);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor actual = torch::sum(lazy_source);
     AllClose(expected, actual);
   });
 }

 // Full-tensor torch::sum of a float tensor with torch::kDouble passed as the
 // dtype argument.
 TEST_F(LazyOpsTest, TestSumCast) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const torch::Tensor expected = torch::sum(source, torch::kDouble);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor actual = torch::sum(lazy_source, torch::kDouble);
     AllClose(expected, actual);
   });
 }

 // torch::sum over 256 ones stored as kByte. The true total, 256, is one
 // more than the largest value an unsigned 8-bit integer can hold.
 TEST_F(LazyOpsTest, TestSumU8) {
   const torch::Tensor source = torch::ones(
       {256}, torch::TensorOptions(torch::kByte).device(DefaultDevice()));
   const torch::Tensor expected = torch::sum(source);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor actual = torch::sum(lazy_source);
     AllEqual(expected, actual);
   });
 }

 // torch::sum reduced over one dimension at a time, for every valid
 // dimension index (negative indices included).
 TEST_F(LazyOpsTest, TestSumInDim) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int rank = source.dim();
   for (int dim = -rank; dim < rank; ++dim) {
     torch::Tensor expected = torch::sum(source, {dim});
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual = torch::sum(lazy_source, {dim});
       AllClose(expected, actual);
     });
   }
 }

 // torch::sum reduced over two dimensions at once, written with
 // non-negative and with negative indices.
 TEST_F(LazyOpsTest, TestSumInDims) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (const auto& reduce_dims :
        std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) {
     torch::Tensor expected = torch::sum(source, reduce_dims);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual = torch::sum(lazy_source, reduce_dims);
       AllClose(expected, actual);
     });
   }
 }

 // torch::sum over two dimensions with keepdim enabled.
 TEST_F(LazyOpsTest, TestSumInDimsKeep) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (const auto& reduce_dims :
        std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) {
     torch::Tensor expected =
         torch::sum(source, reduce_dims, /*keepdim=*/true);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual =
           torch::sum(lazy_source, reduce_dims, /*keepdim=*/true);
       AllClose(expected, actual);
     });
   }
 }

 // torch::sum over two dimensions with keepdim enabled and torch::kDouble
 // passed as the dtype argument.
 TEST_F(LazyOpsTest, TestSumInDimsKeepCast) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (const auto& reduce_dims :
        std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) {
     torch::Tensor expected =
         torch::sum(source, reduce_dims, /*keepdim=*/true, torch::kDouble);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual = torch::sum(
           lazy_source, reduce_dims, /*keepdim=*/true, torch::kDouble);
       AllClose(expected, actual);
     });
   }
 }

 // Full-tensor torch::var, with the unbiased flag both on and off.
 TEST_F(LazyOpsTest, TestVar) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (bool unbiased : {true, false}) {
     torch::Tensor expected = torch::var(source, unbiased);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor actual = torch::var(lazy_source, unbiased);
       AllClose(expected, actual);
     });
   }
 }

 // torch::var over two dimensions, sweeping keepdim x unbiased.
 TEST_F(LazyOpsTest, TestVarWithDim) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (const auto& reduce_dims :
        std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) {
     for (bool keep_dim : {true, false}) {
       for (bool unbiased : {true, false}) {
         torch::Tensor expected =
             torch::var(source, reduce_dims, unbiased, keep_dim);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_source = CopyToDevice(source, device);
           torch::Tensor actual =
               torch::var(lazy_source, reduce_dims, unbiased, keep_dim);
           AllClose(expected, actual);
         });
       }
     }
   }
 }

 // torch::var with an explicit correction argument (1, 2, or nullopt) over
 // two dimensions, with and without keepdim. Ends with the counter checks
 // for "aten::.*" and "lazy::var".
 TEST_F(LazyOpsTest, TestVarWithCorrection) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   c10::optional<c10::Scalar> corrections[] = {1, 2, c10::nullopt};
   for (const auto& reduce_dims :
        std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) {
     for (bool keep_dim : {true, false}) {
       for (const auto& correction : corrections) {
         torch::Tensor expected =
             torch::var(source, reduce_dims, correction, keep_dim);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_source = CopyToDevice(source, device);
           torch::Tensor actual =
               torch::var(lazy_source, reduce_dims, correction, keep_dim);
           AllClose(expected, actual);
         });
       }
     }
   }
   ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
   ExpectCounterChanged("lazy::var", GetIgnoredCounters());
 }

 // torch::var_mean with an explicit correction argument (1, 2, or nullopt)
 // over two dimensions, with and without keepdim. Both returned tensors are
 // compared.
 TEST_F(LazyOpsTest, TestVarMeanWithCorrection) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   c10::optional<c10::Scalar> corrections[] = {1, 2, c10::nullopt};
   for (const auto& reduce_dims :
        std::vector<std::vector<int64_t>>{{0, 1}, {-3, -2}}) {
     for (const auto& correction : corrections) {
       for (bool keepdim : {true, false}) {
         auto expected =
             torch::var_mean(source, reduce_dims, correction, keepdim);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_source = CopyToDevice(source, device);
           auto actual =
               torch::var_mean(lazy_source, reduce_dims, correction, keepdim);
           AllClose(std::get<0>(expected), std::get<0>(actual));
           AllClose(std::get<1>(expected), std::get<1>(actual));
         });
       }
     }
   }
 }

 // torch::max along each dimension (negative indices included), with and
 // without keepdim. Values are compared with AllClose and indices with
 // AllEqual.
 TEST_F(LazyOpsTest, TestMaxInDim) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int rank = source.dim();
   for (int dim = -rank; dim < rank; ++dim) {
     for (bool keepdim : {false, true}) {
       auto expected = torch::max(source, dim, /*keepdim=*/keepdim);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_source = CopyToDevice(source, device);
         auto actual = torch::max(lazy_source, dim, /*keepdim=*/keepdim);
         AllClose(std::get<0>(expected), std::get<0>(actual));
         AllEqual(std::get<1>(expected), std::get<1>(actual));
       });
     }
   }
 }

 // torch::min along each dimension (negative indices included), with and
 // without keepdim. Values are compared with AllClose and indices with
 // AllEqual.
 TEST_F(LazyOpsTest, TestMinInDim) {
   const torch::Tensor source = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int rank = source.dim();
   for (int dim = -rank; dim < rank; ++dim) {
     for (bool keepdim : {false, true}) {
       auto expected = torch::min(source, dim, /*keepdim=*/keepdim);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_source = CopyToDevice(source, device);
         auto actual = torch::min(lazy_source, dim, /*keepdim=*/keepdim);
         AllClose(std::get<0>(expected), std::get<0>(actual));
         AllEqual(std::get<1>(expected), std::get<1>(actual));
       });
     }
   }
 }

 // Compares torch::norm with default arguments on a tensor and on its copy
 // placed on each device visited by ForEachDevice.
 TEST_F(LazyOpsTest, TestNorm) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor expected = torch::norm(src);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_src = CopyToDevice(src, device);
     torch::Tensor actual = torch::norm(lazy_src);
     AllClose(expected, actual);
   });
 }

 // Compares the p=2 norm reduced over a single dim (given once as a positive
 // and once as a negative index) without keepdim.
 TEST_F(LazyOpsTest, TestNormInDim) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int axis : {1, -2}) {
     torch::Tensor expected = torch::norm(src, 2, {axis}, /*keepdim=*/false);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual =
           torch::norm(lazy_src, 2, {axis}, /*keepdim=*/false);
       AllClose(expected, actual);
     });
   }
 }

 // Compares the p=2 norm reduced over a pair of dims (positive and negative
 // index spellings) without keepdim.
 TEST_F(LazyOpsTest, TestNormInDims) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const std::vector<std::vector<int64_t>> dim_sets{{1, 2}, {-2, -1}};
   for (const auto& axes : dim_sets) {
     torch::Tensor expected = torch::norm(src, 2, axes, /*keepdim=*/false);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual = torch::norm(lazy_src, 2, axes, /*keepdim=*/false);
       AllClose(expected, actual);
     });
   }
 }

 // Same as the multi-dim p=2 norm comparison, but with keepdim=true.
 TEST_F(LazyOpsTest, TestNormInDimsKeep) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const std::vector<std::vector<int64_t>> dim_sets{{1, 2}, {-2, -1}};
   for (const auto& axes : dim_sets) {
     torch::Tensor expected = torch::norm(src, 2, axes, /*keepdim=*/true);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual = torch::norm(lazy_src, 2, axes, /*keepdim=*/true);
       AllClose(expected, actual);
     });
   }
 }

 // Draws at::normal from a zeros mean tensor and a ones std tensor and checks
 // that the sample mean lies in (-0.06, 0.06) and the sample std in
 // (0.94, 1.06).
 TEST_F(LazyOpsTest, TestNormalTwoTensor) {
   at::Tensor zero_means = at::zeros({10, 10, 10}, at::dtype(at::kFloat));
   at::Tensor unit_stds = at::ones({10, 10, 10}, at::dtype(at::kFloat));
   ForEachDevice([&](const torch::Device& device) {
     at::Tensor lazy_means = CopyToDevice(zero_means, device);
     at::Tensor lazy_stds = CopyToDevice(unit_stds, device);
     at::Tensor samples = at::normal(lazy_means, lazy_stds);
     // Reduce to a scalar first, then read it back as a double.
     const auto mean_scalar = samples.mean().item();
     double res_mean = mean_scalar.toDouble();
     const auto std_scalar = samples.std().item();
     double res_std = std_scalar.toDouble();
     EXPECT_GT(res_mean, -0.06);
     EXPECT_LT(res_mean, 0.06);
     EXPECT_GT(res_std, 0.94);
     EXPECT_LT(res_std, 1.06);
   });
 }

 // Draws at::normal with a scalar mean of 0 and a ones std tensor and checks
 // that the sample mean lies in (-0.06, 0.06) and the sample std in
 // (0.94, 1.06).
 TEST_F(LazyOpsTest, TestNormalDoubleMean) {
   at::Tensor unit_stds = at::ones({10, 10, 10}, at::dtype(at::kFloat));
   ForEachDevice([&](const torch::Device& device) {
     at::Tensor lazy_stds = CopyToDevice(unit_stds, device);
     at::Tensor samples = at::normal(0, lazy_stds);
     // Reduce to a scalar first, then read it back as a double.
     const auto mean_scalar = samples.mean().item();
     double res_mean = mean_scalar.toDouble();
     const auto std_scalar = samples.std().item();
     double res_std = std_scalar.toDouble();
     EXPECT_GT(res_mean, -0.06);
     EXPECT_LT(res_mean, 0.06);
     EXPECT_GT(res_std, 0.94);
     EXPECT_LT(res_std, 1.06);
   });
 }

 // Draws at::normal with a zeros mean tensor and a scalar std of 1 and checks
 // that the sample mean lies in (-0.06, 0.06) and the sample std in
 // (0.94, 1.06).
 TEST_F(LazyOpsTest, TestNormalDoubleStd) {
   at::Tensor zero_means = at::zeros({10, 10, 10}, at::dtype(at::kFloat));
   ForEachDevice([&](const torch::Device& device) {
     at::Tensor lazy_means = CopyToDevice(zero_means, device);
     at::Tensor samples = at::normal(lazy_means, 1);
     // Reduce to a scalar first, then read it back as a double.
     const auto mean_scalar = samples.mean().item();
     double res_mean = mean_scalar.toDouble();
     const auto std_scalar = samples.std().item();
     double res_std = std_scalar.toDouble();
     EXPECT_GT(res_mean, -0.06);
     EXPECT_LT(res_mean, 0.06);
     EXPECT_GT(res_std, 0.94);
     EXPECT_LT(res_std, 1.06);
   });
 }

 // Fills a copied tensor in place via normal_(mean=0, std=1) and checks that
 // the sample mean lies in (-0.06, 0.06) and the sample std in (0.94, 1.06).
 TEST_F(LazyOpsTest, TestNormalInPlace) {
   at::Tensor zeros = at::zeros({10, 10, 10}, at::dtype(at::kFloat));
   ForEachDevice([&](const torch::Device& device) {
     at::Tensor lazy_samples = CopyToDevice(zeros, device);
     lazy_samples.normal_(/*mean=*/0, /*std=*/1);
     // Reduce to a scalar first, then read it back as a double.
     const auto mean_scalar = lazy_samples.mean().item();
     double res_mean = mean_scalar.toDouble();
     const auto std_scalar = lazy_samples.std().item();
     double res_std = std_scalar.toDouble();
     EXPECT_GT(res_mean, -0.06);
     EXPECT_LT(res_mean, 0.06);
     EXPECT_GT(res_std, 0.94);
     EXPECT_LT(res_std, 1.06);
   });
 }

 // Fills a copied tensor in place via uniform_(from=0, to=1), brings it back
 // with ToCpuTensor, and checks that min/max stay within [0, 1] up to eps.
 TEST_F(LazyOpsTest, TestUniformInPlace) {
   const double eps = 1e-3;
   at::Tensor zeros = at::zeros({10, 10, 10}, at::dtype(at::kFloat));
   ForEachDevice([&](const torch::Device& device) {
     at::Tensor lazy_samples = CopyToDevice(zeros, device);
     lazy_samples.uniform_(/*from=*/0, /*to=*/1);
     at::Tensor host_samples = ToCpuTensor(lazy_samples);
     const auto min_scalar = host_samples.min().item();
     double res_min = min_scalar.toDouble();
     const auto max_scalar = host_samples.max().item();
     double res_max = max_scalar.toDouble();
     EXPECT_GT(res_min, 0.0 - eps);
     EXPECT_LT(res_max, 1.0 + eps);
   });
 }

 // For several floating-point and integer dtypes, fills a copied tensor in
 // place via random_(from=0, to=10) and checks that the sample mean is within
 // eps of 4.5, the minimum is exactly 0 and the maximum is exactly 9.
 TEST_F(LazyOpsTest, TestRandomInPlace) {
   // Allowed deviation of the sample mean from 4.5.
   const double eps = 0.2;
   for (auto scalar_type :
        {torch::kFloat,
         torch::kDouble,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     torch::Tensor zeros =
         torch::zeros({10, 10, 10}, torch::TensorOptions(scalar_type));
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_samples = CopyToDevice(zeros, device);
       lazy_samples.random_(/*from=*/0, /*to=*/10);
       // Mean is derived from the sum so it is also defined for integer dtypes.
       const double total = lazy_samples.sum().item().toDouble();
       double res_mean = total / zeros.numel();
       double res_min = lazy_samples.min().item().toDouble();
       double res_max = lazy_samples.max().item().toDouble();
       EXPECT_GT(res_mean, 4.5 - eps);
       EXPECT_LT(res_mean, 4.5 + eps);
       EXPECT_EQ(res_min, 0.0);
       EXPECT_EQ(res_max, 9.0);
     });
   }
 }

 // Same checks as the from/to variant, but calls the random_ overload that
 // only takes `to` (10): mean within eps of 4.5, min exactly 0, max exactly 9.
 TEST_F(LazyOpsTest, TestRandomInPlaceDefaultFrom) {
   // Allowed deviation of the sample mean from 4.5.
   const double eps = 0.2;
   for (auto scalar_type :
        {torch::kFloat,
         torch::kDouble,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     torch::Tensor zeros =
         torch::zeros({10, 10, 10}, torch::TensorOptions(scalar_type));
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_samples = CopyToDevice(zeros, device);
       lazy_samples.random_(/*to=*/10);
       // Mean is derived from the sum so it is also defined for integer dtypes.
       const double total = lazy_samples.sum().item().toDouble();
       double res_mean = total / zeros.numel();
       double res_min = lazy_samples.min().item().toDouble();
       double res_max = lazy_samples.max().item().toDouble();
       EXPECT_GT(res_mean, 4.5 - eps);
       EXPECT_LT(res_mean, 4.5 + eps);
       EXPECT_EQ(res_min, 0.0);
       EXPECT_EQ(res_max, 9.0);
     });
   }
 }

 // For several dtypes, fills a 10-element copy of a zeros tensor with the
 // argument-less random_() and expects every element to differ from the
 // original zeros.
 TEST_F(LazyOpsTest, TestRandomInPlaceDefault) {
   for (auto scalar_type :
        {torch::kFloat,
         torch::kDouble,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     auto input = torch::zeros({10}, torch::TensorOptions(scalar_type));
     ForEachDevice([&](const torch::Device& device) {
       auto lazy_samples = CopyToDevice(input, device);
       lazy_samples.random_();
       // Bring the result back so it can be compared with `input`.
       auto output = ToCpuTensor(lazy_samples);
       EXPECT_TRUE(torch::all(output.ne(input)).item<bool>());
     });
   }
 }

 // Compares torch::norm with a non-integer exponent (p = 3.5) on a randn
 // tensor and on its copy on each visited device.
 TEST_F(LazyOpsTest, TestNormGeneral) {
   torch::Tensor src = torch::randn(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor expected = torch::norm(src, 3.5);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_src = CopyToDevice(src, device);
     torch::Tensor actual = torch::norm(lazy_src, 3.5);
     AllClose(expected, actual);
   });
 }

 // Compares torch::norm with p = 1 on a tensor and on its copy on each visited
 // device.
 // NOTE(review): despite the test name, the call made here is torch::norm(a, 1)
 // rather than torch::nuclear_norm -- confirm this is intended.
 TEST_F(LazyOpsTest, TestNormNuclear) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor expected = torch::norm(src, 1);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_src = CopyToDevice(src, device);
     torch::Tensor actual = torch::norm(lazy_src, 1);
     AllClose(expected, actual);
   });
 }

 // Compares torch::frobenius_norm reduced over a single dim (positive and
 // negative index spellings) without keepdim.
 TEST_F(LazyOpsTest, TestFrobeniusNormInDim) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int axis : {1, -2}) {
     torch::Tensor expected =
         torch::frobenius_norm(src, {axis}, /*keepdim=*/false);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual =
           torch::frobenius_norm(lazy_src, {axis}, /*keepdim=*/false);
       AllClose(expected, actual);
     });
   }
 }

 // Compares torch::frobenius_norm reduced over a pair of dims (positive and
 // negative index spellings) without keepdim.
 TEST_F(LazyOpsTest, TestFrobeniusNormInDims) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const std::vector<std::vector<int64_t>> dim_sets{{1, 2}, {-2, -1}};
   for (const auto& axes : dim_sets) {
     torch::Tensor expected =
         torch::frobenius_norm(src, axes, /*keepdim=*/false);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual =
           torch::frobenius_norm(lazy_src, axes, /*keepdim=*/false);
       AllClose(expected, actual);
     });
   }
 }

 // Compares torch::group_norm (cudnn disabled) with weight and bias for group
 // counts 3, 6 and 1 on a 6-channel input.
 TEST_F(LazyOpsTest, TestGroupNorm) {
   const int channel_count = 6;
   torch::Tensor ref_input = torch::rand(
       {20, channel_count, 10, 10},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor ref_weight = torch::rand(
       {channel_count},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor ref_bias = torch::rand(
       {channel_count},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const double epsilon = 1e-05;
   for (int group_count : {3, 6, 1}) {
     torch::Tensor expected = torch::group_norm(
         ref_input,
         group_count,
         ref_weight,
         ref_bias,
         epsilon,
         /*cudnn_enabled=*/false);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_input = CopyToDevice(ref_input, device);
       torch::Tensor lazy_weight = CopyToDevice(ref_weight, device);
       torch::Tensor lazy_bias = CopyToDevice(ref_bias, device);
       torch::Tensor actual = torch::group_norm(
           lazy_input,
           group_count,
           lazy_weight,
           lazy_bias,
           epsilon,
           /*cudnn_enabled=*/false);
       AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
     });
   }
 }

 // Runs TestBackward (derivative level 2) on torch::group_norm for group counts
 // 3, 6 and 1, once with undefined weight/bias and once with real ones.
 TEST_F(LazyOpsTest, TestGroupNormBackward) {
   const int channel_count = 6;
   torch::Tensor ref_input = torch::rand(
       {2, channel_count, 5, 5},
       torch::TensorOptions(torch::kFloat)
           .device(DefaultDevice())
           .requires_grad(true));
   torch::Tensor ref_weight = torch::rand(
       {channel_count},
       torch::TensorOptions(torch::kFloat)
           .device(DefaultDevice())
           .requires_grad(true));
   torch::Tensor ref_bias = torch::rand(
       {channel_count},
       torch::TensorOptions(torch::kFloat)
           .device(DefaultDevice())
           .requires_grad(true));
   const double epsilon = 1e-05;
   // Default-constructed tensor passed in place of weight/bias when they are
   // to be left out.
   torch::Tensor no_tensor;
   for (bool skip_affine : {true, false}) {
     for (int group_count : {3, 6, 1}) {
       // Function under test; inputs are {input, weight, bias}.
       auto group_norm_fn =
           [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
         return torch::group_norm(
             /*input=*/args[0],
             group_count,
             args[1],
             args[2],
             /*eps=*/epsilon,
             /*cudnn_enabled=*/false);
       };
       ForEachDevice([&](const torch::Device& device) {
         TestBackward(
             {ref_input,
              skip_affine ? no_tensor : ref_weight,
              skip_affine ? no_tensor : ref_bias},
             device,
             group_norm_fn,
             /*rtol=*/1e-3,
             /*atol=*/1e-3,
             /*derivative_level=*/2);
       });
     }
   }
 }

 // Compares torch::instance_norm (use_input_stats=true, cudnn disabled) with
 // weight, bias and running statistics between the original tensors and their
 // per-device copies.
 TEST_F(LazyOpsTest, TestInstanceNorm) {
   const int batch_size = 5;
   const int channel_count = 20;
   torch::Tensor ref_input = torch::rand(
       {batch_size, channel_count, 10, 10},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor ref_weight = torch::rand(
       {channel_count},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor ref_bias = torch::rand(
       {channel_count},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   // Running statistics start at mean 0 / variance 1.
   torch::Tensor ref_running_mean = torch::zeros(
       {channel_count},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor ref_running_var = torch::ones(
       {channel_count},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const double momentum_value = 0.1;
   const double epsilon = 1e-05;
   torch::Tensor expected = torch::instance_norm(
       ref_input,
       ref_weight,
       ref_bias,
       ref_running_mean,
       ref_running_var,
       /*use_input_stats=*/true,
       momentum_value,
       epsilon,
       /*cudnn_enabled=*/false);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(ref_input, device);
     torch::Tensor lazy_weight = CopyToDevice(ref_weight, device);
     torch::Tensor lazy_bias = CopyToDevice(ref_bias, device);
     torch::Tensor lazy_running_mean = CopyToDevice(ref_running_mean, device);
     torch::Tensor lazy_running_var = CopyToDevice(ref_running_var, device);
     torch::Tensor actual = torch::instance_norm(
         lazy_input,
         lazy_weight,
         lazy_bias,
         lazy_running_mean,
         lazy_running_var,
         /*use_input_stats=*/true,
         momentum_value,
         epsilon,
         /*cudnn_enabled=*/false);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // Compares torch::layer_norm (cudnn disabled) for normalized shapes made of
 // two or three dims of size 10, with and without weight/bias.
 TEST_F(LazyOpsTest, TestLayerNorm) {
   torch::Tensor ref_input = torch::rand(
       {20, 10, 10, 10},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const double epsilon = 1e-05;
   // Default-constructed tensor passed in place of weight/bias when they are
   // to be left out.
   torch::Tensor no_tensor;
   for (bool skip_affine : {true, false}) {
     for (int64_t shape_rank : {2, 3}) {
       // `shape_rank` entries, each equal to 10.
       std::vector<int64_t> norm_shape(shape_rank, 10);
       torch::Tensor ref_weight = torch::rand(
           norm_shape,
           torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
       torch::Tensor ref_bias = torch::rand(
           norm_shape,
           torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
       torch::Tensor expected = torch::layer_norm(
           ref_input,
           norm_shape,
           skip_affine ? no_tensor : ref_weight,
           skip_affine ? no_tensor : ref_bias,
           epsilon,
           /*cudnn_enabled=*/false);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_input = CopyToDevice(ref_input, device);
         torch::Tensor lazy_weight =
             skip_affine ? no_tensor : CopyToDevice(ref_weight, device);
         torch::Tensor lazy_bias =
             skip_affine ? no_tensor : CopyToDevice(ref_bias, device);
         torch::Tensor actual = torch::layer_norm(
             lazy_input,
             norm_shape,
             lazy_weight,
             lazy_bias,
             epsilon,
             /*cudnn_enabled=*/false);
         AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
       });
     }
   }
 }

 // Runs TestBackward (derivative level 2) on torch::layer_norm for normalized
 // shapes made of two or three dims of size 3, with and without weight/bias.
 TEST_F(LazyOpsTest, TestLayerNormBackward) {
   torch::Tensor ref_input = torch::rand(
       {2, 3, 3, 3},
       torch::TensorOptions(torch::kFloat)
           .device(DefaultDevice())
           .requires_grad(true));
   const double epsilon = 1e-05;
   // Default-constructed tensor passed in place of weight/bias when they are
   // to be left out.
   torch::Tensor no_tensor;
   for (bool skip_affine : {true, false}) {
     for (int64_t shape_rank : {2, 3}) {
       // `shape_rank` entries, each equal to 3.
       std::vector<int64_t> norm_shape(shape_rank, 3);
       // Function under test; inputs are {input, weight, bias}.
       auto layer_norm_fn =
           [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
         return torch::layer_norm(
             /*input=*/args[0],
             norm_shape,
             args[1],
             args[2],
             /*eps=*/epsilon,
             /*cudnn_enabled=*/false);
       };
       torch::Tensor ref_weight = torch::rand(
           norm_shape,
           torch::TensorOptions(torch::kFloat)
               .device(DefaultDevice())
               .requires_grad(true));
       torch::Tensor ref_bias = torch::rand(
           norm_shape,
           torch::TensorOptions(torch::kFloat)
               .device(DefaultDevice())
               .requires_grad(true));
       ForEachDevice([&](const torch::Device& device) {
         TestBackward(
             {ref_input,
              skip_affine ? no_tensor : ref_weight,
              skip_affine ? no_tensor : ref_bias},
             device,
             layer_norm_fn,
             /*rtol=*/1e-3,
             /*atol=*/1e-4,
             /*derivative_level=*/2);
       });
     }
   }
 }

 // Compares torch::nuclear_norm on a 4x3 matrix and on its copy on each
 // visited device.
 TEST_F(LazyOpsTest, TestNuclearNorm) {
   torch::Tensor matrix = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor expected = torch::nuclear_norm(matrix);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_matrix = CopyToDevice(matrix, device);
     torch::Tensor actual = torch::nuclear_norm(lazy_matrix);
     AllClose(expected, actual);
   });
 }

 // Compares torch::pairwise_distance for p in {1, 2, 3, 4}, with and without
 // keepdim.
 TEST_F(LazyOpsTest, TestPairwiseDistance) {
   torch::Tensor lhs = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor rhs = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const double epsilon = 1e-6;
   for (bool keep : {false, true}) {
     for (double exponent : {1, 2, 3, 4}) {
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor expected =
             torch::pairwise_distance(lhs, rhs, exponent, epsilon, keep);
         torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
         torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
         torch::Tensor actual = torch::pairwise_distance(
             lazy_lhs, lazy_rhs, exponent, epsilon, keep);
         AllClose(expected, actual, /*rtol=*/1e-5, /*atol=*/1e-5);
       });
     }
   }
 }

 // Compares torch::cosine_similarity along every dim in [-rank, rank).
 TEST_F(LazyOpsTest, TestCosineSimilarity) {
   torch::Tensor lhs = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor rhs = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const double epsilon = 1e-8;
   const int ndim = lhs.dim();
   for (int axis = -ndim; axis < ndim; ++axis) {
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor expected =
           torch::cosine_similarity(lhs, rhs, axis, epsilon);
       torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
       torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
       torch::Tensor actual =
           torch::cosine_similarity(lazy_lhs, lazy_rhs, axis, epsilon);
       AllClose(expected, actual);
     });
   }
 }

 // Compares torch::cosine_embedding_loss for Mean/Sum reductions and margins
 // 0 and 0.2.
 TEST_F(LazyOpsTest, TestCosineEmbeddingLoss) {
   torch::Tensor lhs = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor rhs = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor labels = torch::rand(
       {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (torch::Reduction::Reduction reduce_mode :
        {torch::Reduction::Mean, torch::Reduction::Sum}) {
     for (double margin_value : {0., 0.2}) {
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor expected = torch::cosine_embedding_loss(
             lhs, rhs, labels, margin_value, reduce_mode);
         torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
         torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
         torch::Tensor lazy_labels = CopyToDevice(labels, device);
         torch::Tensor actual = torch::cosine_embedding_loss(
             lazy_lhs, lazy_rhs, lazy_labels, margin_value, reduce_mode);
         AllClose(expected, actual);
       });
     }
   }
 }

 // Compares torch::hinge_embedding_loss for Mean/Sum reductions and margins
 // 0 and 0.2.
 TEST_F(LazyOpsTest, TestHingeEmbeddingLoss) {
   torch::Tensor samples = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor labels = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (torch::Reduction::Reduction reduce_mode :
        {torch::Reduction::Mean, torch::Reduction::Sum}) {
     for (double margin_value : {0., 0.2}) {
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor expected = torch::hinge_embedding_loss(
             samples, labels, margin_value, reduce_mode);
         torch::Tensor lazy_samples = CopyToDevice(samples, device);
         torch::Tensor lazy_labels = CopyToDevice(labels, device);
         torch::Tensor actual = torch::hinge_embedding_loss(
             lazy_samples, lazy_labels, margin_value, reduce_mode);
         AllClose(expected, actual);
       });
     }
   }
 }

 // Compares torch::triplet_margin_loss across margins {0, 0.2}, p in
 // {1, 2, 3, 4}, swap on/off and Mean/Sum reductions. The positive sample is
 // abs(rand) and the negative sample is -abs(rand).
 TEST_F(LazyOpsTest, TestTripletMarginLoss) {
   torch::Tensor ref_anchor = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor ref_positive = torch::abs(torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())));
   torch::Tensor ref_negative = torch::neg(torch::abs(torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()))));
   const double epsilon = 1e-6;
   for (double margin_value : {0., 0.2}) {
     for (double exponent : {1, 2, 3, 4}) {
       for (bool do_swap : {false, true}) {
         for (torch::Reduction::Reduction reduce_mode :
              {torch::Reduction::Mean, torch::Reduction::Sum}) {
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor expected = torch::triplet_margin_loss(
                 ref_anchor,
                 ref_positive,
                 ref_negative,
                 margin_value,
                 exponent,
                 epsilon,
                 do_swap,
                 reduce_mode);
             torch::Tensor lazy_anchor = CopyToDevice(ref_anchor, device);
             torch::Tensor lazy_positive = CopyToDevice(ref_positive, device);
             torch::Tensor lazy_negative = CopyToDevice(ref_negative, device);
             torch::Tensor actual = torch::triplet_margin_loss(
                 lazy_anchor,
                 lazy_positive,
                 lazy_negative,
                 margin_value,
                 exponent,
                 epsilon,
                 do_swap,
                 reduce_mode);
             AllClose(expected, actual);
           });
         }
       }
     }
   }
 }

 // Compares torch::binary_cross_entropy for Mean/Sum/None reductions, with and
 // without a per-element weight tensor.
 TEST_F(LazyOpsTest, TestBinaryCrossEntropy) {
   const int batch_size = 10;
   const int class_count = 5;
   torch::Tensor ref_input = torch::rand(
       {batch_size, class_count},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor ref_target = torch::rand(
       {batch_size, class_count},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor ref_weight = torch::rand(
       {batch_size, class_count},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   // Default-constructed tensor passed in place of the weight when it is to
   // be left out.
   torch::Tensor no_tensor;
   for (torch::Reduction::Reduction reduce_mode :
        {torch::Reduction::Mean,
         torch::Reduction::Sum,
         torch::Reduction::None}) {
     for (bool skip_weight : {false, true}) {
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor expected = torch::binary_cross_entropy(
             ref_input,
             ref_target,
             skip_weight ? no_tensor : ref_weight,
             reduce_mode);
         torch::Tensor lazy_input = CopyToDevice(ref_input, device);
         torch::Tensor lazy_target = CopyToDevice(ref_target, device);
         torch::Tensor lazy_weight =
             skip_weight ? no_tensor : CopyToDevice(ref_weight, device);
         torch::Tensor actual = torch::binary_cross_entropy(
             lazy_input, lazy_target, lazy_weight, reduce_mode);
         AllClose(expected, actual, /*rtol=*/1e-4, /*atol=*/1e-5);
       });
     }
   }
 }

 // Compares torch::margin_ranking_loss for Mean/Sum reductions and margins
 // 0 and 0.2.
 TEST_F(LazyOpsTest, TestMarginRankingLoss) {
   torch::Tensor lhs = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor rhs = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor labels = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (torch::Reduction::Reduction reduce_mode :
        {torch::Reduction::Mean, torch::Reduction::Sum}) {
     for (double margin_value : {0., 0.2}) {
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor expected = torch::margin_ranking_loss(
             lhs, rhs, labels, margin_value, reduce_mode);
         torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
         torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
         torch::Tensor lazy_labels = CopyToDevice(labels, device);
         torch::Tensor actual = torch::margin_ranking_loss(
             lazy_lhs, lazy_rhs, lazy_labels, margin_value, reduce_mode);
         AllClose(expected, actual);
       });
     }
   }
 }

 // Compares torch::binary_cross_entropy_with_logits for Mean/Sum reductions
 // across every combination of present/absent `weight` and `pos_weight`.
 TEST_F(LazyOpsTest, TestBCEWithLogits) {
   int batch = 10;
   int classes = 5;
   torch::Tensor input = torch::rand(
       {batch, classes},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor target = torch::rand(
       {batch, classes},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor weight = torch::rand(
       {classes}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor pos_weight = torch::rand(
       {classes}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   // Default-constructed tensor passed in place of weight/pos_weight when they
   // are to be left out.
   torch::Tensor undef;
   for (torch::Reduction::Reduction reduction :
        {torch::Reduction::Mean, torch::Reduction::Sum}) {
     for (bool undef_weight : {false, true}) {
       for (bool undef_pos_weight : {false, true}) {
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor output = torch::binary_cross_entropy_with_logits(
               input,
               target,
               undef_weight ? undef : weight,
               undef_pos_weight ? undef : pos_weight,
               reduction);
           torch::Tensor lazy_input = CopyToDevice(input, device);
           torch::Tensor lazy_target = CopyToDevice(target, device);
           torch::Tensor lazy_weight =
               undef_weight ? undef : CopyToDevice(weight, device);
           torch::Tensor lazy_pos_weight =
               undef_pos_weight ? undef : CopyToDevice(pos_weight, device);
           torch::Tensor lazy_output = torch::binary_cross_entropy_with_logits(
               lazy_input, lazy_target, lazy_weight, lazy_pos_weight, reduction);
           // Without this comparison the test would only check that the op
           // runs; tolerances mirror TestBinaryCrossEntropy.
           AllClose(output, lazy_output, /*rtol=*/1e-4, /*atol=*/1e-5);
         });
       }
     }
   }
 }

 // Compares torch::kl_div for Mean/Sum reductions with log_target on and off.
 TEST_F(LazyOpsTest, TestKlDiv) {
   torch::Tensor samples = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor labels = torch::rand(
       {4, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (bool target_is_log : {true, false}) {
     for (torch::Reduction::Reduction reduce_mode :
          {torch::Reduction::Mean, torch::Reduction::Sum}) {
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor expected =
             torch::kl_div(samples, labels, reduce_mode, target_is_log);
         torch::Tensor lazy_samples = CopyToDevice(samples, device);
         torch::Tensor lazy_labels = CopyToDevice(labels, device);
         torch::Tensor actual = torch::kl_div(
             lazy_samples, lazy_labels, reduce_mode, target_is_log);
         AllClose(expected, actual);
       });
     }
   }
 }

 // Compares the full-tensor torch::prod on a tensor and on its copy on each
 // visited device.
 TEST_F(LazyOpsTest, TestProd) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor expected = torch::prod(src);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_src = CopyToDevice(src, device);
     torch::Tensor actual = torch::prod(lazy_src);
     AllClose(expected, actual);
   });
 }

 // Compares the full-tensor torch::prod of a float tensor when torch::kDouble
 // is requested as the dtype argument.
 TEST_F(LazyOpsTest, TestProdCast) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor expected = torch::prod(src, torch::kDouble);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_src = CopyToDevice(src, device);
     torch::Tensor actual = torch::prod(lazy_src, torch::kDouble);
     AllClose(expected, actual);
   });
 }

 // Compares torch::prod along every dim in [-rank, rank).
 TEST_F(LazyOpsTest, TestProdInDim) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int ndim = src.dim();
   for (int axis = -ndim; axis < ndim; ++axis) {
     torch::Tensor expected = torch::prod(src, axis);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual = torch::prod(lazy_src, axis);
       AllClose(expected, actual);
     });
   }
 }

 // Compares torch::prod along every dim in [-rank, rank) with keepdim=true and
 // torch::kDouble passed as the dtype argument.
 TEST_F(LazyOpsTest, TestProdInDimKeepCast) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int ndim = src.dim();
   for (int axis = -ndim; axis < ndim; ++axis) {
     torch::Tensor expected =
         torch::prod(src, axis, /*keepdim=*/true, torch::kDouble);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual =
           torch::prod(lazy_src, axis, /*keepdim=*/true, torch::kDouble);
       AllClose(expected, actual);
     });
   }
 }

 // Compares torch::prod along every dim in [-rank, rank) with keepdim=true.
 TEST_F(LazyOpsTest, TestProdInDimKeep) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int ndim = src.dim();
   for (int axis = -ndim; axis < ndim; ++axis) {
     torch::Tensor expected = torch::prod(src, axis, /*keepdim=*/true);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual = torch::prod(lazy_src, axis, /*keepdim=*/true);
       AllClose(expected, actual);
     });
   }
 }

 // Compares torch::cumsum of a float tensor along every dim in [-rank, rank).
 TEST_F(LazyOpsTest, TestCumSum) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int ndim = src.dim();
   for (int axis = -ndim; axis < ndim; ++axis) {
     torch::Tensor expected = torch::cumsum(src, axis);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual = torch::cumsum(lazy_src, axis);
       AllClose(expected, actual);
     });
   }
 }

 // Compares torch::cumsum of a float tensor along every dim in [-rank, rank)
 // with torch::kDouble passed as the dtype argument.
 TEST_F(LazyOpsTest, TestCumSumCast) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int ndim = src.dim();
   for (int axis = -ndim; axis < ndim; ++axis) {
     torch::Tensor expected = torch::cumsum(src, axis, torch::kDouble);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual = torch::cumsum(lazy_src, axis, torch::kDouble);
       AllClose(expected, actual);
     });
   }
 }

 // Compares torch::cumsum of a kLong tensor (torch::randint with bound 1000)
 // along every dim in [-rank, rank); results must match exactly.
 TEST_F(LazyOpsTest, TestCumSumLong) {
   torch::Tensor src = torch::randint(
       1000,
       {4, 3, 4},
       torch::TensorOptions(torch::kLong).device(DefaultDevice()));
   const int ndim = src.dim();
   for (int axis = -ndim; axis < ndim; ++axis) {
     torch::Tensor expected = torch::cumsum(src, axis);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual = torch::cumsum(lazy_src, axis);
       AllEqual(expected, actual);
     });
   }
 }

 // Compares torch::cumsum of a float tensor along every dim in [-rank, rank)
 // with torch::kLong passed as the dtype argument; results must match exactly.
 TEST_F(LazyOpsTest, TestCumSumCastLong) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int ndim = src.dim();
   for (int axis = -ndim; axis < ndim; ++axis) {
     torch::Tensor expected = torch::cumsum(src, axis, torch::kLong);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual = torch::cumsum(lazy_src, axis, torch::kLong);
       AllEqual(expected, actual);
     });
   }
 }

 // Compares torch::cumprod of a float tensor along every dim in [-rank, rank).
 TEST_F(LazyOpsTest, TestCumProd) {
   torch::Tensor src = torch::rand(
       {4, 3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const int ndim = src.dim();
   for (int axis = -ndim; axis < ndim; ++axis) {
     torch::Tensor expected = torch::cumprod(src, axis);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual = torch::cumprod(lazy_src, axis);
       AllClose(expected, actual);
     });
   }
 }

 // Compares torch::cumprod of a float tensor (rand scaled by 10) along every
 // dim in [-rank, rank) with torch::kDouble passed as the dtype argument.
 TEST_F(LazyOpsTest, TestCumProdCast) {
   torch::Tensor src = torch::mul(
       torch::rand(
           {4, 3, 4},
           torch::TensorOptions(torch::kFloat).device(DefaultDevice())),
       10);
   const int ndim = src.dim();
   for (int axis = -ndim; axis < ndim; ++axis) {
     torch::Tensor expected = torch::cumprod(src, axis, torch::kDouble);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor actual = torch::cumprod(lazy_src, axis, torch::kDouble);
       AllClose(expected, actual);
     });
   }
 }

 // Integer cumprod along every valid dimension (negative and positive indices)
 // must match exactly between the default device and each ForEachDevice device.
 // The input is a small 2x3 kLong tensor drawn with randint(7, ...).
 TEST_F(LazyOpsTest, TestCumProdLong) {
   torch::Tensor input = torch::randint(
       7, {2, 3}, torch::TensorOptions(torch::kLong).device(DefaultDevice()));
   int rank = input.dim();
   for (int dim = -rank; dim < rank; ++dim) {
     // This test previously called cumsum on both sides, so the integer
     // cumprod path was never exercised despite the test's name.
     torch::Tensor result = torch::cumprod(input, dim);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor lazy_result = torch::cumprod(lazy_input, dim);
       AllEqual(result, lazy_result);
     });
   }
 }

 // cumprod with an explicit kLong output dtype on a float input scaled by 7
 // must match exactly between the default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestCumProdCastLong) {
   torch::Tensor input =
       torch::rand(
           {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice())) *
       7;
   int rank = input.dim();
   for (int dim = -rank; dim < rank; ++dim) {
     // This test previously called cumsum on both sides, so cumprod with a
     // dtype cast was never exercised despite the test's name.
     torch::Tensor result = torch::cumprod(input, dim, torch::kLong);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor lazy_result =
           torch::cumprod(lazy_input, dim, torch::kLong);
       AllEqual(result, lazy_result);
     });
   }
 }

 // argmin with no dimension given (c10::nullopt) must return the same index on
 // the default device and on each ForEachDevice device.
 TEST_F(LazyOpsTest, TestArgMin) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 4, 4}, options);
   torch::Tensor expected =
       torch::argmin(input, c10::nullopt, /*keepdim=*/false);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output =
         torch::argmin(lazy_input, c10::nullopt, /*keepdim=*/false);
     AllEqual(expected, lazy_output);
   });
 }

 // argmin along an explicit dimension (1 and -2), without keepdim, must agree
 // between the default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestArgMinDim) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 4, 4}, options);
   for (int axis : {1, -2}) {
     torch::Tensor expected = torch::argmin(input, axis, /*keepdim=*/false);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor lazy_output =
           torch::argmin(lazy_input, axis, /*keepdim=*/false);
       AllEqual(expected, lazy_output);
     });
   }
 }

 // argmin along an explicit dimension (1 and -2) with keepdim=true must agree
 // between the default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestArgMinDimKeep) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 4, 4}, options);
   for (int axis : {1, -2}) {
     torch::Tensor expected = torch::argmin(input, axis, /*keepdim=*/true);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor lazy_output =
           torch::argmin(lazy_input, axis, /*keepdim=*/true);
       AllEqual(expected, lazy_output);
     });
   }
 }

 // With every element equal (all ones), argmin must pick the same index on the
 // default device and on each ForEachDevice device.
 TEST_F(LazyOpsTest, TestArgMinSameValue) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::ones({4, 4, 4}, options);
   torch::Tensor expected = torch::argmin(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::argmin(lazy_input);
     AllEqual(expected, lazy_output);
   });
 }

 // argmin over dimensions 1 and -2 (keepdim=false) must agree between the
 // default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestArgMinWrapper) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 4, 4}, options);
   for (int axis : {1, -2}) {
     torch::Tensor expected = torch::argmin(input, axis, /*keepdim=*/false);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor lazy_output =
           torch::argmin(lazy_input, axis, /*keepdim=*/false);
       AllEqual(expected, lazy_output);
     });
   }
 }

 // argmax with no dimension given (c10::nullopt) must return the same index on
 // the default device and on each ForEachDevice device.
 TEST_F(LazyOpsTest, TestArgMax) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 4, 4}, options);
   torch::Tensor expected =
       torch::argmax(input, c10::nullopt, /*keepdim=*/false);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output =
         torch::argmax(lazy_input, c10::nullopt, /*keepdim=*/false);
     AllEqual(expected, lazy_output);
   });
 }

 // argmax along an explicit dimension (1 and -2), without keepdim, must agree
 // between the default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestArgMaxDim) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 4, 4}, options);
   for (int axis : {1, -2}) {
     torch::Tensor expected = torch::argmax(input, axis, /*keepdim=*/false);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor lazy_output =
           torch::argmax(lazy_input, axis, /*keepdim=*/false);
       AllEqual(expected, lazy_output);
     });
   }
 }

 // argmax along an explicit dimension (1 and -2) with keepdim=true must agree
 // between the default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestArgMaxDimKeep) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 4, 4}, options);
   for (int axis : {1, -2}) {
     torch::Tensor expected = torch::argmax(input, axis, /*keepdim=*/true);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor lazy_output =
           torch::argmax(lazy_input, axis, /*keepdim=*/true);
       AllEqual(expected, lazy_output);
     });
   }
 }

 // With every element equal (all ones), argmax must pick the same index on the
 // default device and on each ForEachDevice device.
 TEST_F(LazyOpsTest, TestArgMaxSameValue) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::ones({4, 4, 4}, options);
   torch::Tensor expected =
       torch::argmax(input, c10::nullopt, /*keepdim=*/false);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output =
         torch::argmax(lazy_input, c10::nullopt, /*keepdim=*/false);
     AllEqual(expected, lazy_output);
   });
 }

 // argmax over dimensions 1 and -2 (keepdim=false) must agree between the
 // default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestArgMaxWrapper) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 4, 4}, options);
   for (int axis : {1, -2}) {
     torch::Tensor expected = torch::argmax(input, axis, /*keepdim=*/false);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor lazy_output =
           torch::argmax(lazy_input, axis, /*keepdim=*/false);
       AllEqual(expected, lazy_output);
     });
   }
 }

 // asin on each ForEachDevice device must be close to the default-device result.
 TEST_F(LazyOpsTest, TestAsin) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::asin(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::asin(lazy_input);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // asinh on each ForEachDevice device must be close to the default-device
 // result.
 TEST_F(LazyOpsTest, TestAsinh) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::asinh(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::asinh(lazy_input);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // asinh_ must mutate its operand the same way on the default device and on
 // each ForEachDevice device, and the returned tensors must agree too.
 TEST_F(LazyOpsTest, TestAsinhInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::rand({2, 2}, options);
   ForEachDevice([&](const torch::Device& device) {
     // Copy before mutating so both sides start from the same values.
     torch::Tensor lazy = CopyToDevice(eager, device);
     torch::Tensor eager_ret = torch::asinh_(eager);
     torch::Tensor lazy_ret = torch::asinh_(lazy);
     AllClose(eager, lazy, /*rtol=*/1e-3, /*atol=*/1e-5);
     AllClose(eager_ret, lazy_ret, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // sin on each ForEachDevice device must be close to the default-device result.
 TEST_F(LazyOpsTest, TestSin) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::sin(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::sin(lazy_input);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // sinh on each ForEachDevice device must be close to the default-device result.
 TEST_F(LazyOpsTest, TestSinh) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::sinh(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::sinh(lazy_input);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // acos on each ForEachDevice device must be close to the default-device result.
 TEST_F(LazyOpsTest, TestAcos) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::acos(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::acos(lazy_input);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // acosh of a random input scaled by 100 must be close between the default
 // device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestAcosh) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options) * 100;
   torch::Tensor expected = torch::acosh(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::acosh(lazy_input);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // acosh_ must mutate its operand the same way on the default device and on
 // each ForEachDevice device, and the returned tensors must agree too.
 TEST_F(LazyOpsTest, TestAcoshInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::rand({2, 2}, options) * 100;
   ForEachDevice([&](const torch::Device& device) {
     // Copy before mutating so both sides start from the same values.
     torch::Tensor lazy = CopyToDevice(eager, device);
     torch::Tensor eager_ret = torch::acosh_(eager);
     torch::Tensor lazy_ret = torch::acosh_(lazy);
     AllClose(eager, lazy, /*rtol=*/1e-3, /*atol=*/1e-5);
     AllClose(eager_ret, lazy_ret, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // cos on each ForEachDevice device must be close to the default-device result.
 TEST_F(LazyOpsTest, TestCos) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::cos(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::cos(lazy_input);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // cosh on each ForEachDevice device must be close to the default-device result.
 TEST_F(LazyOpsTest, TestCosh) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::cosh(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::cosh(lazy_input);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // atan on each ForEachDevice device must be close to the default-device result.
 TEST_F(LazyOpsTest, TestAtan) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::atan(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::atan(lazy_input);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // atanh on each ForEachDevice device must be close to the default-device
 // result.
 TEST_F(LazyOpsTest, TestAtanh) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::atanh(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::atanh(lazy_input);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // atanh_ must mutate its operand the same way on the default device and on
 // each ForEachDevice device, and the returned tensors must agree too.
 TEST_F(LazyOpsTest, TestAtanhInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::rand({2, 2}, options);
   ForEachDevice([&](const torch::Device& device) {
     // Copy before mutating so both sides start from the same values.
     torch::Tensor lazy = CopyToDevice(eager, device);
     torch::Tensor eager_ret = torch::atanh_(eager);
     torch::Tensor lazy_ret = torch::atanh_(lazy);
     AllClose(eager, lazy, /*rtol=*/1e-3, /*atol=*/1e-5);
     AllClose(eager_ret, lazy_ret, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // Binary atan2 on two randn operands must be close between the default device
 // and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestAtan2) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::randn({2, 2}, options);
   torch::Tensor rhs = torch::randn({2, 2}, options);
   torch::Tensor expected = torch::atan2(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor lazy_output = torch::atan2(lazy_lhs, lazy_rhs);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // tan on each ForEachDevice device must be close to the default-device result.
 TEST_F(LazyOpsTest, TestTan) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::tan(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::tan(lazy_input);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // tanh on each ForEachDevice device must be close to the default-device result.
 TEST_F(LazyOpsTest, TestTanh) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::tanh(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::tanh(lazy_input);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // clamp with both a lower and an upper scalar bound must agree between the
 // default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestClampMinMax) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Scalar lower(0.311);
   torch::Scalar upper(0.409);
   torch::Tensor expected = torch::clamp(input, lower, upper);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::clamp(lazy_input, lower, upper);
     AllClose(expected, lazy_output);
   });
 }

 // clamp with only a lower bound (upper is c10::nullopt) must agree between the
 // default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestClampMin) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Scalar lower(0.311);
   torch::Tensor expected = torch::clamp(input, lower, c10::nullopt);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::clamp(lazy_input, lower, c10::nullopt);
     AllClose(expected, lazy_output);
   });
 }

 // clamp with only an upper bound (lower is c10::nullopt) must agree between the
 // default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestClampMax) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Scalar upper(0.409);
   torch::Tensor expected = torch::clamp(input, c10::nullopt, upper);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::clamp(lazy_input, c10::nullopt, upper);
     AllClose(expected, lazy_output);
   });
 }

 // clamp_min with a scalar bound must agree between the default device and each
 // ForEachDevice device.
 TEST_F(LazyOpsTest, TestClampMinExplicit) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Scalar lower(0.311);
   torch::Tensor expected = torch::clamp_min(input, lower);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::clamp_min(lazy_input, lower);
     AllClose(expected, lazy_output);
   });
 }

 // clamp_max with a scalar bound must agree between the default device and each
 // ForEachDevice device.
 TEST_F(LazyOpsTest, TestClampMaxExplicit) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Scalar upper(0.409);
   torch::Tensor expected = torch::clamp_max(input, upper);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::clamp_max(lazy_input, upper);
     AllClose(expected, lazy_output);
   });
 }

 // clamp_min_ must mutate its operand the same way on the default device and on
 // each ForEachDevice device, and the returned tensors must agree too.
 TEST_F(LazyOpsTest, TestClampMinExplicitInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::rand({2, 2}, options);
   torch::Scalar lower(0.311);
   ForEachDevice([&](const torch::Device& device) {
     // Copy before mutating so both sides start from the same values.
     torch::Tensor lazy = CopyToDevice(eager, device);
     torch::Tensor eager_ret = torch::clamp_min_(eager, lower);
     torch::Tensor lazy_ret = torch::clamp_min_(lazy, lower);
     AllClose(eager, lazy);
     AllClose(eager_ret, lazy_ret);
   });
 }

 // clamp_max_ must mutate its operand the same way on the default device and on
 // each ForEachDevice device, and the returned tensors must agree too.
 TEST_F(LazyOpsTest, TestClampMaxExplicitInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::rand({2, 2}, options);
   torch::Scalar upper(0.409);
   ForEachDevice([&](const torch::Device& device) {
     // Copy before mutating so both sides start from the same values.
     torch::Tensor lazy = CopyToDevice(eager, device);
     torch::Tensor eager_ret = torch::clamp_max_(eager, upper);
     torch::Tensor lazy_ret = torch::clamp_max_(lazy, upper);
     AllClose(eager, lazy);
     AllClose(eager_ret, lazy_ret);
   });
 }

 // ceil of a randn input scaled by 100 must agree between the default device
 // and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestCeil) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::randn({2, 2}, options) * 100.0;
   torch::Tensor expected = torch::ceil(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::ceil(lazy_input);
     AllClose(expected, lazy_output);
   });
 }

 // floor of a randn input scaled by 100 must agree between the default device
 // and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestFloor) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::randn({2, 2}, options) * 100.0;
   torch::Tensor expected = torch::floor(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::floor(lazy_input);
     AllClose(expected, lazy_output);
   });
 }

 // round must agree between the default device and each ForEachDevice device,
 // both on random values and on the explicit half-way inputs -0.5 and 0.5.
 TEST_F(LazyOpsTest, TestRound) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor random_part = torch::randn({8}, options) * 100.0;
   // Special case: 0.5, -0.5. lazy::Round impl rounds to -1/1 whereas
   // lazy::RoundToEven properly implements bankers rounding.
   torch::Tensor half_cases = torch::tensor({-0.5, 0.5}, options);
   torch::Tensor input = torch::cat({random_part, half_cases}, 0);
   torch::Tensor expected = torch::round(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::round(lazy_input);
     AllClose(expected, lazy_output);
   });
 }

 // trunc of a randn input scaled by 100 must agree between the default device
 // and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestTrunc) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::randn({2, 2}, options) * 100.0;
   torch::Tensor expected = torch::trunc(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::trunc(lazy_input);
     AllClose(expected, lazy_output);
   });
 }

 // frac of a randn input scaled by 100 must agree between the default device
 // and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestFrac) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::randn({2, 2}, options) * 100.0;
   torch::Tensor expected = torch::frac(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::frac(lazy_input);
     AllClose(expected, lazy_output);
   });
 }

 // neg on each ForEachDevice device must agree with the default-device result.
 TEST_F(LazyOpsTest, TestNeg) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::neg(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::neg(lazy_input);
     AllClose(expected, lazy_output);
   });
 }

 // bitwise_not must give identical results on the reference tensor and on its
 // ForEachDevice copy, for each of the integer dtypes listed below.
 TEST_F(LazyOpsTest, TestBitwiseNot) {
   const std::vector<torch::ScalarType> int_dtypes{
       torch::kByte, torch::kChar, torch::kShort, torch::kInt, torch::kLong};

   ForEachDevice([&](const torch::Device& device) {
     for (const torch::ScalarType dtype : int_dtypes) {
       torch::Tensor input =
           torch::randint(0, 63, {2, 2}, torch::TensorOptions(dtype));
       torch::Tensor expected = torch::bitwise_not(input);
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor lazy_output = torch::bitwise_not(lazy_input);
       AllEqual(expected, lazy_output);
     }
   });
 }

 // bitwise_not_ must mutate the reference tensor and its ForEachDevice copy
 // identically, for each of the integer dtypes listed below.
 TEST_F(LazyOpsTest, TestBitwiseNotInPlace) {
   const std::vector<torch::ScalarType> int_dtypes{
       torch::kByte, torch::kChar, torch::kShort, torch::kInt, torch::kLong};

   ForEachDevice([&](const torch::Device& device) {
     for (const torch::ScalarType dtype : int_dtypes) {
       torch::Tensor eager =
           torch::randint(0, 63, {2, 2}, torch::TensorOptions(dtype));
       // Copy before mutating so both sides start from the same values.
       torch::Tensor lazy = CopyToDevice(eager, device);
       eager.bitwise_not_();
       lazy.bitwise_not_();
       AllEqual(eager, lazy);
     }
   });
 }

 // sign of a randn input scaled by 100 must agree between the default device
 // and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestSign) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::randn({2, 2}, options) * 100.0;
   torch::Tensor expected = torch::sign(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::sign(lazy_input);
     AllClose(expected, lazy_output);
   });
 }

 // sign on a kByte tensor must match exactly between the default device and
 // each ForEachDevice device.
 TEST_F(LazyOpsTest, TestSignByte) {
   const auto options =
       torch::TensorOptions(torch::kByte).device(DefaultDevice());
   torch::Tensor input = torch::randint(256, {2, 2}, options);
   torch::Tensor expected = torch::sign(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::sign(lazy_input);
     AllEqual(expected, lazy_output);
   });
 }

 // abs on each ForEachDevice device must agree with the default-device result.
 TEST_F(LazyOpsTest, TestAbs) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::randn({2, 2}, options);
   torch::Tensor expected = torch::abs(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::abs(lazy_input);
     AllClose(expected, lazy_output);
   });
 }

 // abs on a kByte tensor must match exactly between the default device and
 // each ForEachDevice device.
 TEST_F(LazyOpsTest, TestAbsByte) {
   const auto options =
       torch::TensorOptions(torch::kByte).device(DefaultDevice());
   torch::Tensor input = torch::randint(256, {2, 2}, options);
   torch::Tensor expected = torch::abs(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::abs(lazy_input);
     AllEqual(expected, lazy_output);
   });
 }

 // empty_like leaves contents uninitialized, so only the resulting shape is
 // compared between the default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestEmptyLike) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor prototype = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::empty_like(prototype);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_prototype = CopyToDevice(prototype, device);
     torch::Tensor lazy_output = torch::empty_like(lazy_prototype);
     EXPECT_EQ(expected.sizes(), lazy_output.sizes());
   });
 }

 // empty_like with explicit TensorOptions: only the resulting shape is compared.
 // NOTE(review): both calls pass options targeting DefaultDevice(), including
 // the one on the ForEachDevice copy — confirm that is intended.
 TEST_F(LazyOpsTest, TestEmptyLikeOptions) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor prototype = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::empty_like(prototype, options);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_prototype = CopyToDevice(prototype, device);
     torch::Tensor lazy_output = torch::empty_like(lazy_prototype, options);
     EXPECT_EQ(expected.sizes(), lazy_output.sizes());
   });
 }

 // torch::empty created directly on each ForEachDevice device must have the
 // same shape as a {2, 2} reference tensor; contents are not compared.
 TEST_F(LazyOpsTest, TestEmpty) {
   const auto reference_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor reference = torch::zeros({2, 2}, reference_options);
   ForEachDevice([&](const torch::Device& device) {
     const auto device_options =
         torch::TensorOptions(torch::kFloat).device(device);
     torch::Tensor lazy_tensor = torch::empty({2, 2}, device_options);
     EXPECT_EQ(reference.sizes(), lazy_tensor.sizes());
   });
 }

 // zero_ applied to an all-ones tensor must yield matching results on the
 // default device and on each ForEachDevice device.
 TEST_F(LazyOpsTest, TestZeroInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::ones({2, 2}, options);

   ForEachDevice([&](const torch::Device& device) {
     // Copy before mutating so both sides start from the same values.
     torch::Tensor lazy = CopyToDevice(eager, device);
     auto& eager_ret = torch::zero_(eager);
     auto& lazy_ret = torch::zero_(lazy);
     AllClose(eager_ret, lazy_ret);
   });
 }

 // zeros_like on each ForEachDevice device must produce the same tensor as on
 // the default device.
 TEST_F(LazyOpsTest, TestZerosLike) {
   torch::Tensor a = torch::rand(
       {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor b = torch::zeros_like(a);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_a = CopyToDevice(a, device);
     torch::Tensor lazy_b = torch::zeros_like(lazy_a);
     // The input must be unchanged by the op.
     AllClose(a, lazy_a);
     // Compare the op's outputs; previously only the inputs were compared, so
     // zeros_like itself was never verified.
     AllClose(b, lazy_b);
   });
 }

 // zeros_like with explicit TensorOptions must produce the same tensor for the
 // default-device input and for its ForEachDevice copy.
 // NOTE(review): both calls pass options targeting DefaultDevice(), including
 // the one on the ForEachDevice copy — confirm that is intended.
 TEST_F(LazyOpsTest, TestZerosLikeOptions) {
   torch::Tensor a = torch::rand(
       {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor b = torch::zeros_like(
       a, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_a = CopyToDevice(a, device);
     torch::Tensor lazy_b = torch::zeros_like(
         lazy_a, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
     // The input must be unchanged by the op.
     AllClose(a, lazy_a);
     // Compare the op's outputs; previously only the inputs were compared, so
     // zeros_like itself was never verified.
     AllClose(b, lazy_b);
   });
 }

 // torch::zeros created directly on each ForEachDevice device must match a
 // zeros tensor created on the default device.
 TEST_F(LazyOpsTest, TestZeros) {
   const auto reference_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor reference = torch::zeros({2, 2}, reference_options);
   ForEachDevice([&](const torch::Device& device) {
     const auto device_options =
         torch::TensorOptions(torch::kFloat).device(device);
     torch::Tensor lazy_tensor = torch::zeros({2, 2}, device_options);
     AllClose(reference, lazy_tensor);
   });
 }

 // torch::ones created directly on each ForEachDevice device must match a ones
 // tensor created on the default device.
 TEST_F(LazyOpsTest, TestOnes) {
   const auto reference_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor reference = torch::ones({2, 2}, reference_options);
   ForEachDevice([&](const torch::Device& device) {
     const auto device_options =
         torch::TensorOptions(torch::kFloat).device(device);
     torch::Tensor lazy_tensor = torch::ones({2, 2}, device_options);
     AllClose(reference, lazy_tensor);
   });
 }

 // ones_like on each ForEachDevice device must produce the same tensor as on
 // the default device.
 TEST_F(LazyOpsTest, TestOnesLike) {
   torch::Tensor a = torch::rand(
       {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor b = torch::ones_like(a);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_a = CopyToDevice(a, device);
     torch::Tensor lazy_b = torch::ones_like(lazy_a);
     // The input must be unchanged by the op.
     AllClose(a, lazy_a);
     // Compare the op's outputs; previously only the inputs were compared, so
     // ones_like itself was never verified.
     AllClose(b, lazy_b);
   });
 }

 // ones_like with explicit TensorOptions must produce the same tensor for the
 // default-device input and for its ForEachDevice copy.
 // NOTE(review): both calls pass options targeting DefaultDevice(), including
 // the one on the ForEachDevice copy — confirm that is intended.
 TEST_F(LazyOpsTest, TestOnesLikeOptions) {
   torch::Tensor a = torch::rand(
       {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor b = torch::ones_like(
       a, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_a = CopyToDevice(a, device);
     torch::Tensor lazy_b = torch::ones_like(
         lazy_a, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
     // The input must be unchanged by the op.
     AllClose(a, lazy_a);
     // Compare the op's outputs; previously only the inputs were compared, so
     // ones_like itself was never verified.
     AllClose(b, lazy_b);
   });
 }

 // torch::full created directly on each ForEachDevice device must match the
 // same fill created on the default device.
 TEST_F(LazyOpsTest, TestFull) {
   const auto reference_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor reference = torch::full({2, 2}, 3.1165, reference_options);
   ForEachDevice([&](const torch::Device& device) {
     const auto device_options =
         torch::TensorOptions(torch::kFloat).device(device);
     torch::Tensor lazy_tensor = torch::full({2, 2}, 3.1165, device_options);
     AllClose(reference, lazy_tensor);
   });
 }

 // full_like on each ForEachDevice device must produce the same tensor as on
 // the default device.
 TEST_F(LazyOpsTest, TestFullLike) {
   torch::Tensor a = torch::rand(
       {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor b = torch::full_like(a, 3.1165);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_a = CopyToDevice(a, device);
     torch::Tensor lazy_b = torch::full_like(lazy_a, 3.1165);
     // The input must be unchanged by the op.
     AllClose(a, lazy_a);
     // Compare the op's outputs; previously only the inputs were compared, so
     // full_like itself was never verified.
     AllClose(b, lazy_b);
   });
 }

 // full_like with explicit TensorOptions must produce the same tensor for the
 // default-device input and for its ForEachDevice copy.
 // NOTE(review): both calls pass options targeting DefaultDevice(), including
 // the one on the ForEachDevice copy — confirm that is intended.
 TEST_F(LazyOpsTest, TestFullLikeOptions) {
   torch::Tensor a = torch::rand(
       {2, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor b = torch::full_like(
       a, 3.1165, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_a = CopyToDevice(a, device);
     torch::Tensor lazy_b = torch::full_like(
         lazy_a,
         3.1165,
         torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
     // The input must be unchanged by the op.
     AllClose(a, lazy_a);
     // Compare the op's outputs; previously only the inputs were compared, so
     // full_like itself was never verified.
     AllClose(b, lazy_b);
   });
 }

 // arange with an ascending and a descending (start, end, step) triple must
 // match between the default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestARange) {
   const std::vector<std::vector<float>> cases{
       {0.0, 100.0, 0.5}, {0.0, -100.0, -0.5}};
   for (const auto& bounds : cases) {
     const float start = bounds[0];
     const float end = bounds[1];
     const float step = bounds[2];
     torch::Tensor expected = torch::arange(
         start,
         end,
         step,
         torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_output = torch::arange(
           start, end, step, torch::TensorOptions(torch::kFloat).device(device));
       AllClose(expected, lazy_output);
     });
   }
 }

 // arange_out writing into an existing tensor must give matching results for
 // the default-device tensor and for its ForEachDevice copy, for an ascending
 // and a descending (start, end, step) triple.
 TEST_F(LazyOpsTest, TestARangeOut) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor out = torch::randn({4}, options);
   const std::vector<std::vector<float>> cases{
       {0.0, 100.0, 0.5}, {0.0, -100.0, -0.5}};
   for (const auto& bounds : cases) {
     const float start = bounds[0];
     const float end = bounds[1];
     const float step = bounds[2];
     torch::Tensor expected = torch::arange_out(out, start, end, step);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_out = CopyToDevice(out, device);
       torch::Tensor lazy_result =
           torch::arange_out(lazy_out, start, end, step);
       AllClose(expected, lazy_result);
     });
   }
 }

 // _dim_arange over dimension 1 of a {2, 2} tensor must match between the
 // default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestDimARange) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor prototype = torch::rand({2, 2}, options);
   torch::Tensor expected = torch::_dim_arange(prototype, 1);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_prototype = CopyToDevice(prototype, device);
     torch::Tensor lazy_output = torch::_dim_arange(lazy_prototype, 1);
     AllClose(expected, lazy_output);
   });
 }

 // bartlett_window (periodic and non-periodic) created on each ForEachDevice
 // device must be close to the one created on the default device.
 TEST_F(LazyOpsTest, TestBartlettWindow) {
   const int length = 10;
   for (bool is_periodic : {false, true}) {
     ForEachDevice([&](const torch::Device& device) {
       const auto reference_options =
           torch::TensorOptions(torch::kFloat).device(DefaultDevice());
       torch::Tensor expected =
           torch::bartlett_window(length, is_periodic, reference_options);

       const auto device_options =
           torch::TensorOptions(torch::kFloat).device(device);
       torch::Tensor lazy_window =
           torch::bartlett_window(length, is_periodic, device_options);
       AllClose(expected, lazy_window, /*rtol=*/1e-5, /*atol=*/1e-7);
     });
   }
 }

 // blackman_window (periodic and non-periodic) created on each ForEachDevice
 // device must be close to the one created on the default device.
 TEST_F(LazyOpsTest, TestBlackmanWindow) {
   const int length = 10;
   for (bool is_periodic : {false, true}) {
     ForEachDevice([&](const torch::Device& device) {
       const auto reference_options =
           torch::TensorOptions(torch::kFloat).device(DefaultDevice());
       torch::Tensor expected =
           torch::blackman_window(length, is_periodic, reference_options);
       const auto device_options =
           torch::TensorOptions(torch::kFloat).device(device);
       torch::Tensor lazy_window =
           torch::blackman_window(length, is_periodic, device_options);
       AllClose(expected, lazy_window, /*rtol=*/1e-5, /*atol=*/1e-7);
     });
   }
 }

 // hamming_window with explicit alpha/beta (periodic and non-periodic) created
 // on each ForEachDevice device must match the default-device window.
 TEST_F(LazyOpsTest, TestHammingWindow) {
   const double alpha_coeff = 0.54;
   const double beta_coeff = 0.46;
   const int length = 10;
   for (bool is_periodic : {false, true}) {
     ForEachDevice([&](const torch::Device& device) {
       const auto reference_options =
           torch::TensorOptions(torch::kFloat).device(DefaultDevice());
       torch::Tensor expected = torch::hamming_window(
           length, is_periodic, alpha_coeff, beta_coeff, reference_options);
       const auto device_options =
           torch::TensorOptions(torch::kFloat).device(device);
       torch::Tensor lazy_window = torch::hamming_window(
           length, is_periodic, alpha_coeff, beta_coeff, device_options);
       AllClose(expected, lazy_window);
     });
   }
 }

 // hann_window (periodic and non-periodic) created on each ForEachDevice device
 // must match the one created on the default device.
 TEST_F(LazyOpsTest, TestHannWindow) {
   const int length = 10;
   for (bool is_periodic : {false, true}) {
     ForEachDevice([&](const torch::Device& device) {
       const auto reference_options =
           torch::TensorOptions(torch::kFloat).device(DefaultDevice());
       torch::Tensor expected =
           torch::hann_window(length, is_periodic, reference_options);
       const auto device_options =
           torch::TensorOptions(torch::kFloat).device(device);
       torch::Tensor lazy_window =
           torch::hann_window(length, is_periodic, device_options);
       AllClose(expected, lazy_window);
     });
   }
 }

 // log_sigmoid on an input filled via uniform_(-1.0, 1.0) must be close between
 // the default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestLogSigmoid) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::empty({2, 2}, options);
   input.uniform_(-1.0, 1.0);
   torch::Tensor expected = torch::log_sigmoid(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_output = torch::log_sigmoid(lazy_input);
     AllClose(expected, lazy_output, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // Both tensors returned by log_sigmoid_forward must be close between the
 // default device and each ForEachDevice device.
 TEST_F(LazyOpsTest, TestLogSigmoidForward) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::empty({2, 2}, options);
   input.uniform_(-1.0, 1.0);
   auto expected = torch::log_sigmoid_forward(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     auto lazy_results = torch::log_sigmoid_forward(lazy_input);
     // First element of the returned tuple.
     AllClose(
         std::get<0>(expected),
         std::get<0>(lazy_results),
         /*rtol=*/1e-3,
         /*atol=*/1e-5);
     // Second element of the returned tuple.
     AllClose(
         std::get<1>(expected),
         std::get<1>(lazy_results),
         /*rtol=*/1e-3,
         /*atol=*/1e-5);
   });
 }

 TEST_F(LazyOpsTest, TestLogsumexp) {
   // Reduces a {3, 4, 3} tensor with torch::logsumexp over two dimension sets
   // (one positive, one negative indexing), with and without keepdim.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({3, 4, 3}, opts);
   const std::vector<std::vector<int64_t>> dim_sets = {{0, 1}, {-3, -2}};
   for (const auto& dims : dim_sets) {
     for (bool keepdim : {false, true}) {
       torch::Tensor expected = torch::logsumexp(input, dims, keepdim);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor device_input = CopyToDevice(input, device);
         torch::Tensor actual = torch::logsumexp(device_input, dims, keepdim);
         AllClose(expected, actual);
       });
     }
   }
 }

 TEST_F(LazyOpsTest, TestSiLU) {
   // Compares torch::silu on a random 2x2 tensor across devices, then calls
   // ExpectCounterChanged for "lazy::silu_out" with GetIgnoredCounters().
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, opts);
   torch::Tensor expected = torch::silu(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::silu(device_input);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
   ExpectCounterChanged("lazy::silu_out", GetIgnoredCounters());
 }

 TEST_F(LazyOpsTest, TestSigmoid) {
   // torch::sigmoid on a random 2x2 tensor: device result vs. reference,
   // compared with rtol 1e-3 / atol 1e-5.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2}, opts);
   torch::Tensor expected = torch::sigmoid(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::sigmoid(device_input);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 TEST_F(LazyOpsTest, TestMatmul_1x1) {
   // torch::matmul of two 1-D tensors of size 4.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({4}, opts);
   torch::Tensor rhs = torch::rand({4}, opts);
   torch::Tensor expected = torch::matmul(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_lhs = CopyToDevice(lhs, device);
     torch::Tensor device_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual = torch::matmul(device_lhs, device_rhs);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestMatmul_2x1) {
   // torch::matmul of a {3, 4} matrix with a size-4 vector.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor matrix = torch::rand({3, 4}, opts);
   torch::Tensor vec = torch::rand({4}, opts);
   torch::Tensor expected = torch::matmul(matrix, vec);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_matrix = CopyToDevice(matrix, device);
     torch::Tensor device_vec = CopyToDevice(vec, device);
     torch::Tensor actual = torch::matmul(device_matrix, device_vec);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestMatmul_1x2) {
   // torch::matmul of a size-4 vector with a {4, 3} matrix.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor vec = torch::rand({4}, opts);
   torch::Tensor matrix = torch::rand({4, 3}, opts);
   torch::Tensor expected = torch::matmul(vec, matrix);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_vec = CopyToDevice(vec, device);
     torch::Tensor device_matrix = CopyToDevice(matrix, device);
     torch::Tensor actual = torch::matmul(device_vec, device_matrix);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestMatmul_2x2) {
   // torch::matmul of a {2, 4} matrix with a {4, 3} matrix, compared with
   // rtol 1e-3 / atol 1e-4.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({2, 4}, opts);
   torch::Tensor rhs = torch::rand({4, 3}, opts);
   torch::Tensor expected = torch::matmul(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_lhs = CopyToDevice(lhs, device);
     torch::Tensor device_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual = torch::matmul(device_lhs, device_rhs);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-4);
   });
 }

 TEST_F(LazyOpsTest, TestMatmulBcast) {
   // torch::matmul with operands of different rank: {4, 2, 3, 2, 4} times
   // {2, 1, 4, 3}.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({4, 2, 3, 2, 4}, opts);
   torch::Tensor rhs = torch::rand({2, 1, 4, 3}, opts);
   torch::Tensor expected = torch::matmul(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_lhs = CopyToDevice(lhs, device);
     torch::Tensor device_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual = torch::matmul(device_lhs, device_rhs);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestDot) {
   // torch::dot of two random size-4 vectors.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({4}, opts);
   torch::Tensor rhs = torch::rand({4}, opts);
   torch::Tensor expected = torch::dot(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_lhs = CopyToDevice(lhs, device);
     torch::Tensor device_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual = torch::dot(device_lhs, device_rhs);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestTensorDot) {
   // torch::tensordot of {6, 4, 8} and {4, 7, 8}, pairing dims {1, 2} of the
   // first operand with dims {0, 2} of the second.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({6, 4, 8}, opts);
   torch::Tensor rhs = torch::rand({4, 7, 8}, opts);
   const std::vector<int64_t> lhs_dims = {1, 2};
   const std::vector<int64_t> rhs_dims = {0, 2};
   torch::Tensor expected = torch::tensordot(lhs, rhs, lhs_dims, rhs_dims);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_lhs = CopyToDevice(lhs, device);
     torch::Tensor device_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual =
         torch::tensordot(device_lhs, device_rhs, lhs_dims, rhs_dims);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestGer) {
   // torch::ger of a size-4 vector with a size-5 vector.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({4}, opts);
   torch::Tensor rhs = torch::rand({5}, opts);
   torch::Tensor expected = torch::ger(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_lhs = CopyToDevice(lhs, device);
     torch::Tensor device_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual = torch::ger(device_lhs, device_rhs);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestMv) {
   // torch::mv of a {4, 3} matrix with a size-3 vector.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor matrix = torch::rand({4, 3}, opts);
   torch::Tensor vec = torch::rand({3}, opts);
   torch::Tensor expected = torch::mv(matrix, vec);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_matrix = CopyToDevice(matrix, device);
     torch::Tensor device_vec = CopyToDevice(vec, device);
     torch::Tensor actual = torch::mv(device_matrix, device_vec);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestMvOut) {
   // Out-variant of mv: the result is written into a pre-allocated size-4
   // tensor. On each device the output buffer is created with the options of
   // the copied vector.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor matrix = torch::rand({4, 3}, opts);
   torch::Tensor vec = torch::rand({3}, opts);
   torch::Tensor expected = torch::empty({4}, opts);
   torch::mv_out(expected, matrix, vec);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_matrix = CopyToDevice(matrix, device);
     torch::Tensor device_vec = CopyToDevice(vec, device);
     torch::Tensor actual = torch::empty({4}, device_vec.options());
     torch::mv_out(actual, device_matrix, device_vec);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestBatchAddBatchMatMul) {
   // torch::baddbmm with a {3, 6, 5} addend and batches {3, 6, 4} x {3, 4, 5}.
   // The two scalars are forwarded in the order (beta, alpha).
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor addend = torch::rand({3, 6, 5}, opts);
   torch::Tensor batch1 = torch::rand({3, 6, 4}, opts);
   torch::Tensor batch2 = torch::rand({3, 4, 5}, opts);
   torch::Scalar alpha = 0.5;
   torch::Scalar beta = 1.5;
   torch::Tensor expected =
       torch::baddbmm(addend, batch1, batch2, beta, alpha);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_addend = CopyToDevice(addend, device);
     torch::Tensor device_batch1 = CopyToDevice(batch1, device);
     torch::Tensor device_batch2 = CopyToDevice(batch2, device);
     torch::Tensor actual = torch::baddbmm(
         device_addend, device_batch1, device_batch2, beta, alpha);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-4);
   });
 }

 TEST_F(LazyOpsTest, TestBatchAddBatchMatMulInPlace) {
   // In-place baddbmm_. The device copies are taken first; only afterwards is
   // the reference tensor updated in place, so both sides start from the same
   // values. Both the returned tensor and the mutated operand are compared.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor self = torch::rand({3, 6, 5}, opts);
   torch::Tensor batch1 = torch::rand({3, 6, 4}, opts);
   torch::Tensor batch2 = torch::rand({3, 4, 5}, opts);
   torch::Scalar alpha = 0.5;
   torch::Scalar beta = 1.5;
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_self = CopyToDevice(self, device);
     torch::Tensor device_batch1 = CopyToDevice(batch1, device);
     torch::Tensor device_batch2 = CopyToDevice(batch2, device);
     torch::Tensor expected = self.baddbmm_(batch1, batch2, beta, alpha);
     torch::Tensor actual =
         device_self.baddbmm_(device_batch1, device_batch2, beta, alpha);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-4);
     AllClose(self, device_self, /*rtol=*/1e-3, /*atol=*/1e-4);
   });
 }

 TEST_F(LazyOpsTest, TestBatchMatMul) {
   // torch::bmm of {3, 6, 4} with {3, 4, 5}, compared with rtol 1e-3 /
   // atol 1e-4.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({3, 6, 4}, opts);
   torch::Tensor rhs = torch::rand({3, 4, 5}, opts);
   torch::Tensor expected = torch::bmm(lhs, rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_lhs = CopyToDevice(lhs, device);
     torch::Tensor device_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual = torch::bmm(device_lhs, device_rhs);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-4);
   });
 }

 TEST_F(LazyOpsTest, TestChainMatMul) {
   // torch::chain_matmul over four matrices with shapes
   // {5, 4} x {4, 6} x {6, 2} x {2, 7}.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor m0 = torch::rand({5, 4}, opts);
   torch::Tensor m1 = torch::rand({4, 6}, opts);
   torch::Tensor m2 = torch::rand({6, 2}, opts);
   torch::Tensor m3 = torch::rand({2, 7}, opts);
   torch::Tensor expected = torch::chain_matmul({m0, m1, m2, m3});
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_m0 = CopyToDevice(m0, device);
     torch::Tensor device_m1 = CopyToDevice(m1, device);
     torch::Tensor device_m2 = CopyToDevice(m2, device);
     torch::Tensor device_m3 = CopyToDevice(m3, device);
     torch::Tensor actual =
         torch::chain_matmul({device_m0, device_m1, device_m2, device_m3});
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-4);
   });
 }

 TEST_F(LazyOpsTest, TestLinear) {
   // torch::linear with and without a bias term; both variants are evaluated
   // on the device before either comparison is made.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 4}, opts);
   torch::Tensor weight = torch::rand({3, 4}, opts);
   torch::Tensor bias = torch::rand({3}, opts);
   torch::Tensor expected_no_bias = torch::linear(input, weight);
   torch::Tensor expected_biased = torch::linear(input, weight, bias);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_input = CopyToDevice(input, device);
     torch::Tensor device_weight = CopyToDevice(weight, device);
     torch::Tensor device_bias = CopyToDevice(bias, device);
     torch::Tensor actual_no_bias = torch::linear(device_input, device_weight);
     torch::Tensor actual_biased =
         torch::linear(device_input, device_weight, device_bias);
     AllClose(expected_no_bias, actual_no_bias, /*rtol=*/1e-2, /*atol=*/1e-4);
     AllClose(expected_biased, actual_biased, /*rtol=*/1e-2, /*atol=*/1e-4);
   });
 }

 TEST_F(LazyOpsTest, TestPinverse) {
   // torch::pinverse of a random {4, 6} matrix; only rtol is overridden
   // (1e-4), atol is left at AllClose's default.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor matrix = torch::rand({4, 6}, opts);
   torch::Tensor expected = torch::pinverse(matrix);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_matrix = CopyToDevice(matrix, device);
     torch::Tensor actual = torch::pinverse(device_matrix);
     AllClose(expected, actual, /*rtol=*/1e-4);
   });
 }

 TEST_F(LazyOpsTest, TestEinsumOuter) {
   // torch::einsum with "i,j->ij" on two size-5 vectors.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({5}, opts);
   torch::Tensor rhs = torch::rand({5}, opts);
   std::string equation = "i,j->ij";
   torch::Tensor expected = torch::einsum(equation, {lhs, rhs});
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_lhs = CopyToDevice(lhs, device);
     torch::Tensor device_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual = torch::einsum(equation, {device_lhs, device_rhs});
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestEinsumOuterBackward) {
   // Runs TestBackward on the "i,j->ij" einsum with two size-5 inputs that
   // both require gradients (rtol 1e-3 / atol 1e-4).
   const auto grad_opts = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
   torch::Tensor lhs = torch::rand({5}, grad_opts);
   torch::Tensor rhs = torch::rand({5}, grad_opts);
   std::string equation = "i,j->ij";
   auto einsum_fn =
       [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
     return torch::einsum(equation, inputs);
   };
   ForEachDevice([&](const torch::Device& device) {
     TestBackward(
         {lhs, rhs}, device, einsum_fn, /*rtol=*/1e-3, /*atol=*/1e-4);
   });
 }

 TEST_F(LazyOpsTest, TestEinsumBatchMatMul) {
   // torch::einsum with "bij,bjk->bik" on {3, 2, 5} and {3, 5, 4}.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({3, 2, 5}, opts);
   torch::Tensor rhs = torch::rand({3, 5, 4}, opts);
   std::string equation = "bij,bjk->bik";
   torch::Tensor expected = torch::einsum(equation, {lhs, rhs});
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_lhs = CopyToDevice(lhs, device);
     torch::Tensor device_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual = torch::einsum(equation, {device_lhs, device_rhs});
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestEinsumPyTorchLowerBilinear) {
   // Three-operand einsum "bn,anm,bm->ba". The tensors are created in the
   // order middle, left, right but passed to einsum as (left, middle, right).
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor middle = torch::rand({3, 5, 4}, opts);
   torch::Tensor left = torch::rand({2, 5}, opts);
   torch::Tensor right = torch::rand({2, 4}, opts);
   std::string equation = "bn,anm,bm->ba";
   torch::Tensor expected = torch::einsum(equation, {left, middle, right});
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_left = CopyToDevice(left, device);
     torch::Tensor device_middle = CopyToDevice(middle, device);
     torch::Tensor device_right = CopyToDevice(right, device);
     torch::Tensor actual =
         torch::einsum(equation, {device_left, device_middle, device_right});
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestEinsumPyTorchLowerDiagonal) {
   // Single-operand einsum "ii->i" on a random {3, 3} tensor.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor matrix = torch::rand({3, 3}, opts);
   std::string equation = "ii->i";
   torch::Tensor expected = torch::einsum(equation, {matrix});
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_matrix = CopyToDevice(matrix, device);
     torch::Tensor actual = torch::einsum(equation, {device_matrix});
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestEinsumPyTorchLowerBatchDiagonal) {
   // Single-operand einsum "...ii->...i" on a random {4, 3, 3} tensor.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor batch = torch::rand({4, 3, 3}, opts);
   std::string equation = "...ii->...i";
   torch::Tensor expected = torch::einsum(equation, {batch});
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_batch = CopyToDevice(batch, device);
     torch::Tensor actual = torch::einsum(equation, {device_batch});
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestEinsumPyTorchLowerBatchPermute) {
   // Single-operand einsum "...ij->...ji" on a random {2, 3, 4, 5} tensor.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor batch = torch::rand({2, 3, 4, 5}, opts);
   std::string equation = "...ij->...ji";
   torch::Tensor expected = torch::einsum(equation, {batch});
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_batch = CopyToDevice(batch, device);
     torch::Tensor actual = torch::einsum(equation, {device_batch});
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestEinsumPyTorchLowerRepeatedAxis) {
   // Two-operand einsum "ijj,k->ik" on {2, 3, 3} and {4}; the first operand
   // repeats the index j.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor lhs = torch::rand({2, 3, 3}, opts);
   torch::Tensor rhs = torch::rand({4}, opts);
   std::string equation = "ijj,k->ik";
   torch::Tensor expected = torch::einsum(equation, {lhs, rhs});
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_lhs = CopyToDevice(lhs, device);
     torch::Tensor device_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual = torch::einsum(equation, {device_lhs, device_rhs});
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestBilinear) {
   // torch::bilinear with batch 16, input feature sizes 4 and 6, and 8 output
   // features. The reference result is computed inside the per-device
   // callback, after the device copies have been made.
   int batch_size = 16;
   int in1_features = 4;
   int in2_features = 6;
   int out_features = 8;
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input1 = torch::rand({batch_size, in1_features}, opts);
   torch::Tensor input2 = torch::rand({batch_size, in2_features}, opts);
   torch::Tensor weight =
       torch::rand({out_features, in1_features, in2_features}, opts);
   torch::Tensor bias = torch::rand({out_features}, opts);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_input1 = CopyToDevice(input1, device);
     torch::Tensor device_input2 = CopyToDevice(input2, device);
     torch::Tensor device_weight = CopyToDevice(weight, device);
     torch::Tensor device_bias = CopyToDevice(bias, device);
     torch::Tensor expected = torch::bilinear(input1, input2, weight, bias);
     torch::Tensor actual = torch::bilinear(
         device_input1, device_input2, device_weight, device_bias);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestUpsampleNearest2D) {
   // torch::upsample_nearest2d from a {2, 2, 5, 5} input to an explicit
   // {8, 8} output size.
   int batch_size = 2;
   int chans = 2;
   int h = 5;
   int w = 5;
   int uh = 8;
   int uw = 8;
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({batch_size, chans, h, w}, opts);
   // Shared call site so reference and device paths use identical arguments.
   const auto upsample = [&](const torch::Tensor& tensor) {
     return torch::upsample_nearest2d(tensor, {uh, uw});
   };
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_input = CopyToDevice(input, device);
     torch::Tensor expected = upsample(input);
     torch::Tensor actual = upsample(device_input);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestUpsampleNearest2DBackward) {
   // Runs TestBackward on upsample_nearest2d ({5, 5} -> {8, 8}); a fresh
   // random input requiring gradients is drawn for every device.
   int batch_size = 2;
   int chans = 2;
   int h = 5;
   int w = 5;
   int uh = 8;
   int uw = 8;
   auto upsample_fn =
       [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
     return torch::upsample_nearest2d(inputs[0], {uh, uw});
   };
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor input = torch::rand(
         {batch_size, chans, h, w},
         torch::TensorOptions(torch::kFloat)
             .device(DefaultDevice())
             .requires_grad(true));
     TestBackward({input}, device, upsample_fn);
   });
 }

 TEST_F(LazyOpsTest, TestUpsampleNearest2DWithScale) {
   // torch::upsample_nearest2d driven by scale factors (2.5, 3.4) instead of
   // an explicit output size (c10::nullopt is passed for the size).
   int batch_size = 2;
   int chans = 2;
   int h = 5;
   int w = 5;
   double scale_h = 2.5;
   double scale_w = 3.4;
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({batch_size, chans, h, w}, opts);
   // Shared call site so reference and device paths use identical arguments.
   const auto upsample = [&](const torch::Tensor& tensor) {
     return torch::upsample_nearest2d(
         tensor, c10::nullopt, at::ArrayRef<double>{scale_h, scale_w});
   };
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_input = CopyToDevice(input, device);
     torch::Tensor expected = upsample(input);
     torch::Tensor actual = upsample(device_input);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestUpsampleNearest2DBackwardWithScale) {
   // Runs TestBackward on upsample_nearest2d configured through scale factors
   // (2.5, 3.4); a fresh random input requiring gradients is drawn per device.
   int batch_size = 2;
   int chans = 2;
   int h = 5;
   int w = 5;
   double scale_h = 2.5;
   double scale_w = 3.4;
   auto upsample_fn =
       [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
     return torch::upsample_nearest2d(
         inputs[0], c10::nullopt, at::ArrayRef<double>{scale_h, scale_w});
   };
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor input = torch::rand(
         {batch_size, chans, h, w},
         torch::TensorOptions(torch::kFloat)
             .device(DefaultDevice())
             .requires_grad(true));
     TestBackward({input}, device, upsample_fn);
   });
 }

 TEST_F(LazyOpsTest, TestUpsampleBilinear2D) {
   // torch::upsample_bilinear2d from {5, 5} to {8, 8}, once with
   // align_corners and once without; a new random input is drawn per setting.
   int batch_size = 2;
   int chans = 2;
   int h = 5;
   int w = 5;
   int uh = 8;
   int uw = 8;
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   for (bool align_corners : {true, false}) {
     torch::Tensor input = torch::rand({batch_size, chans, h, w}, opts);
     // Shared call site so reference and device paths use identical arguments.
     const auto upsample = [&](const torch::Tensor& tensor) {
       return torch::upsample_bilinear2d(tensor, {uh, uw}, align_corners);
     };
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor device_input = CopyToDevice(input, device);
       torch::Tensor expected = upsample(input);
       torch::Tensor actual = upsample(device_input);
       AllClose(expected, actual);
     });
   }
 }

 TEST_F(LazyOpsTest, TestUpsampleBilinear2DBackward) {
   // Runs TestBackward on upsample_bilinear2d ({5, 5} -> {8, 8}) for both
   // align_corners settings; the input is re-drawn for every device.
   int batch_size = 2;
   int chans = 2;
   int h = 5;
   int w = 5;
   int uh = 8;
   int uw = 8;
   for (bool align_corners : {true, false}) {
     auto upsample_fn =
         [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
       return torch::upsample_bilinear2d(inputs[0], {uh, uw}, align_corners);
     };
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor input = torch::rand(
           {batch_size, chans, h, w},
           torch::TensorOptions(torch::kFloat)
               .device(DefaultDevice())
               .requires_grad(true));
       TestBackward({input}, device, upsample_fn);
     });
   }
 }

 TEST_F(LazyOpsTest, TestAddCMul) {
   // torch::addcmul on three random 2x2 tensors with scalar value 3.1165.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor base = torch::rand({2, 2}, opts);
   torch::Tensor tensor1 = torch::rand({2, 2}, opts);
   torch::Tensor tensor2 = torch::rand({2, 2}, opts);
   torch::Tensor expected = torch::addcmul(base, tensor1, tensor2, 3.1165);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_base = CopyToDevice(base, device);
     torch::Tensor device_tensor1 = CopyToDevice(tensor1, device);
     torch::Tensor device_tensor2 = CopyToDevice(tensor2, device);
     torch::Tensor actual =
         torch::addcmul(device_base, device_tensor1, device_tensor2, 3.1165);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestAddCDiv) {
   // torch::addcdiv on 2x2 tensors with scalar value 3.1165. The divisor is
   // built as abs(rand) + 1.0.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor base = torch::rand({2, 2}, opts);
   torch::Tensor numerator = torch::rand({2, 2}, opts);
   torch::Tensor divisor = torch::abs(torch::rand({2, 2}, opts)) + 1.0;
   torch::Tensor expected = torch::addcdiv(base, numerator, divisor, 3.1165);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_base = CopyToDevice(base, device);
     torch::Tensor device_numerator = CopyToDevice(numerator, device);
     torch::Tensor device_divisor = CopyToDevice(divisor, device);
     torch::Tensor actual = torch::addcdiv(
         device_base, device_numerator, device_divisor, 3.1165);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestAddCDivWithBroadcast) {
   // torch::addcdiv with differently shaped operands: {1, 3}, {3, 1} and a
   // {1, 3} divisor built as abs(rand) + 1.0.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor base = torch::rand({1, 3}, opts);
   torch::Tensor numerator = torch::rand({3, 1}, opts);
   torch::Tensor divisor = torch::abs(torch::rand({1, 3}, opts)) + 1.0;
   torch::Tensor expected = torch::addcdiv(base, numerator, divisor, 3.1165);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_base = CopyToDevice(base, device);
     torch::Tensor device_numerator = CopyToDevice(numerator, device);
     torch::Tensor device_divisor = CopyToDevice(divisor, device);
     torch::Tensor actual = torch::addcdiv(
         device_base, device_numerator, device_divisor, 3.1165);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestSize) {
   // torch::size must report the same extent for the reference tensor and its
   // device copy along every dimension index in [-rank, rank).
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 1, 4, 6}, opts);
   int rank = input.dim();
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     int dim = -rank;
     while (dim < rank) {
       EXPECT_EQ(torch::size(input, dim), torch::size(lazy_input, dim));
       ++dim;
     }
   });
 }

 TEST_F(LazyOpsTest, TestSelect) {
   // Runs TestBackward on torch::select(input, dim, 0) for every dimension
   // index in [-rank, rank) of a {14, 24, 8} input. A fresh random input that
   // requires gradients is drawn for each device.
   std::vector<int64_t> input_sizes = {14, 24, 8};
   // Explicit cast: size() is unsigned, the loop needs negative indices.
   int rank = static_cast<int>(input_sizes.size());
   for (int dim = -rank; dim < rank; ++dim) {
     auto testfn =
         [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
       return torch::select(inputs[0], dim, 0);
     };
     ForEachDevice([&](const torch::Device& device) {
       // The input is pinned to DefaultDevice() like in the other backward
       // tests of this file, instead of relying on the implicit default.
       TestBackward(
           {torch::rand(
               input_sizes,
               torch::TensorOptions(torch::kFloat)
                   .device(DefaultDevice())
                   .requires_grad(true))},
           device,
           testfn);
     });
   }
 }

 TEST_F(LazyOpsTest, TestBernoulliScalarProb) {
   // torch::bernoulli with scalar probability 0.1 over 1000 elements: the mean
   // of the device output must fall strictly between 0.06 and 0.14.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::zeros(1000, opts);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_input = CopyToDevice(input, device);
     torch::Tensor device_output = torch::bernoulli(device_input, 0.1);
     const double total = device_output.sum().item().toDouble();
     double frac = total / input.numel();
     EXPECT_GT(frac, 0.06);
     EXPECT_LT(frac, 0.14);
   });
 }

 TEST_F(LazyOpsTest, TestBernoulliTensorProb) {
   // torch::bernoulli where the input tensor itself holds the probabilities
   // (1000 entries of 0.1): the mean of the device output must fall strictly
   // between 0.06 and 0.14.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   std::vector<float> prob_values(1000, 0.1);
   torch::Tensor input = torch::tensor(prob_values, opts);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_input = CopyToDevice(input, device);
     torch::Tensor device_output = torch::bernoulli(device_input);
     const double total = device_output.sum().item().toDouble();
     double frac = total / input.numel();
     EXPECT_GT(frac, 0.06);
     EXPECT_LT(frac, 0.14);
   });
 }

 TEST_F(LazyOpsTest, TestBernoulliScalarProbInPlace) {
   // In-place bernoulli_(0.1) on the device copy of 1000 zeros: the mean of
   // the mutated tensor must fall strictly between 0.06 and 0.14.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::zeros(1000, opts);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_input = CopyToDevice(input, device);
     device_input.bernoulli_(0.1);
     const double total = device_input.sum().item().toDouble();
     double frac = total / input.numel();
     EXPECT_GT(frac, 0.06);
     EXPECT_LT(frac, 0.14);
   });
 }

 TEST_F(LazyOpsTest, TestBernoulliTensorProbInPlace) {
   // In-place bernoulli_ taking the probability as a scalar tensor (0.1)
   // that is also copied to the device: the mean of the mutated tensor must
   // fall strictly between 0.06 and 0.14.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::zeros(1000, opts);
   torch::Tensor prob = torch::scalar_tensor(0.1, opts);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_input = CopyToDevice(input, device);
     torch::Tensor device_prob = CopyToDevice(prob, device);
     device_input.bernoulli_(device_prob);
     const double total = device_input.sum().item().toDouble();
     double frac = total / input.numel();
     EXPECT_GT(frac, 0.06);
     EXPECT_LT(frac, 0.14);
   });
 }

 TEST_F(LazyOpsTest, TestDropout) {
   // torch::dropout with p = 0.1 in training mode on a {17, 21} tensor: the
   // fraction of non-zero outputs must fall strictly between 0.86 and 0.94.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor a = torch::rand({17, 21}, opts);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_input = CopyToDevice(a, device);
     torch::Tensor device_output =
         torch::dropout(device_input, 0.1, /*train=*/true);
     // Count the surviving (non-zero) entries on the CPU.
     const double nonzero =
         device_output.cpu().ne(0.0f).sum().item().toDouble();
     double prob = nonzero / a.numel();
     EXPECT_GT(prob, 0.86);
     EXPECT_LT(prob, 0.94);
   });
 }

 TEST_F(LazyOpsTest, TestDropoutInPlace) {
   // In-place torch::dropout_ with p = 0.1 in training mode on a {17, 21}
   // tensor: the fraction of non-zero entries left in the mutated tensor must
   // fall strictly between 0.85 and 0.94.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor a = torch::rand({17, 21}, opts);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_input = CopyToDevice(a, device);
     torch::dropout_(device_input, 0.1, /*train=*/true);
     // Count the surviving (non-zero) entries on the CPU.
     const double nonzero =
         device_input.cpu().ne(0.0f).sum().item().toDouble();
     double prob = nonzero / a.numel();
     EXPECT_GT(prob, 0.85);
     EXPECT_LT(prob, 0.94);
   });
 }

 TEST_F(LazyOpsTest, TestRandperm) {
   // Creates randperm(5) directly on torch::kLazy as int64, copies it to the
   // CPU and checks that the n copied values pass torch::lazy::IsPermutation.
   unsigned n = 5;
   const auto lazy_long_opts =
       torch::TensorOptions(torch::kLong).device(torch::kLazy);
   torch::Tensor shuffle = torch::randperm(n, lazy_long_opts);
   torch::Tensor shuffle_cpu = CopyToDevice(shuffle, torch::kCPU);
   const int64_t* first = shuffle_cpu.data_ptr<int64_t>();
   const int64_t* last = shuffle_cpu.data_ptr<int64_t>() + n;
   std::vector<int64_t> shuffle_data(first, last);
   EXPECT_TRUE(
       shuffle_data.size() == n && torch::lazy::IsPermutation(shuffle_data));
 }

 TEST_F(LazyOpsTest, TestSlice) {
   // torch::slice(t, 1, 0, 16, 1) on a {32, 24, 16} tensor, reference vs.
   // device copy.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({32, 24, 16}, opts);
   torch::Tensor expected = torch::slice(input, 1, 0, 16, 1);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::slice(device_input, 1, 0, 16, 1);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestTake) {
   // torch::take from a 4x4 float tensor using five int64 indices drawn by
   // torch::randint(16, ...).
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const auto long_opts =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor values = torch::rand({4, 4}, float_opts);
   torch::Tensor indices = torch::randint(16, {5}, long_opts);
   torch::Tensor expected = torch::take(values, indices);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor device_values = CopyToDevice(values, device);
     torch::Tensor device_indices = CopyToDevice(indices, device);
     torch::Tensor actual = torch::take(device_values, device_indices);
     AllClose(expected, actual);
   });
 }

 TEST_F(LazyOpsTest, TestTakeBackward) {
   // Runs TestBackward on torch::take. For every device a fresh 4x4 float
   // tensor (requires_grad) and five int64 indices from randint(16, ...) are
   // drawn, in that order.
   auto take_fn =
       [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
     return torch::take(inputs[0], inputs[1]);
   };
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor values = torch::rand(
         {4, 4},
         torch::TensorOptions(torch::kFloat)
             .device(DefaultDevice())
             .requires_grad(true));
     torch::Tensor indices = torch::randint(
         16, {5}, torch::TensorOptions(torch::kLong).device(DefaultDevice()));
     TestBackward({values, indices}, device, take_fn);
   });
 }

 TEST_F(LazyOpsTest, TestStack) {
   // torch::stack of three {2, 4, 3} tensors along every dimension index in
   // [-rank, rank), where rank is the input rank plus one.
   const auto opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor first = torch::rand({2, 4, 3}, opts);
   torch::Tensor second = torch::rand({2, 4, 3}, opts);
   torch::Tensor third = torch::rand({2, 4, 3}, opts);
   int rank = first.dim() + 1;
   for (int dim = -rank; dim < rank; ++dim) {
     torch::Tensor expected = torch::stack({first, second, third}, dim);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor device_first = CopyToDevice(first, device);
       torch::Tensor device_second = CopyToDevice(second, device);
       torch::Tensor device_third = CopyToDevice(third, device);
       torch::Tensor actual =
           torch::stack({device_first, device_second, device_third}, dim);
       AllClose(expected, actual);
     });
   }
 }

 // Concatenates tensors that differ only in dimension 1, using both the
 // positive (1) and negative (-2) spelling of that dimension.
 TEST_F(LazyOpsTest, TestCat) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor first = torch::rand({2, 1, 3}, float_opts);
   torch::Tensor second = torch::rand({2, 2, 3}, float_opts);
   torch::Tensor third = torch::rand({2, 3, 3}, float_opts);
   for (int axis : {1, -2}) {
     torch::Tensor expected = torch::cat({first, second, third}, axis);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_first = CopyToDevice(first, device);
       torch::Tensor lazy_second = CopyToDevice(second, device);
       torch::Tensor lazy_third = CopyToDevice(third, device);
       torch::Tensor lazy_joined =
           torch::cat({lazy_first, lazy_second, lazy_third}, axis);
       // Shape and dtype must agree before the values are compared.
       EXPECT_TRUE(
           expected.sizes() == lazy_joined.sizes() &&
           expected.dtype() == lazy_joined.dtype());
       AllClose(expected, lazy_joined);
     });
   }
 }

 // Unbinds a {4, 3, 7} tensor along every dimension in [-rank, rank) and
 // compares the resulting slices one by one.
 TEST_F(LazyOpsTest, TestUnbind) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor source = torch::rand({4, 3, 7}, float_opts);
   int source_rank = source.dim();
   for (int axis = -source_rank; axis < source_rank; ++axis) {
     std::vector<torch::Tensor> expected = torch::unbind(source, axis);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       std::vector<torch::Tensor> lazy_slices =
           torch::unbind(lazy_source, axis);
       // Slice counts must match before indexing both vectors in lockstep.
       ASSERT_EQ(expected.size(), lazy_slices.size());
       for (size_t idx = 0; idx < expected.size(); ++idx) {
         AllClose(expected[idx], lazy_slices[idx]);
       }
     });
   }
 }

 // Checks Tensor::repeat for every combination of the repeat vectors and
 // input shapes listed below.
 TEST_F(LazyOpsTest, TestRepeat) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   std::vector<std::vector<int64_t>> all_repeats = {{4, 2}, {4, 2, 3}};
   std::vector<std::vector<int64_t>> all_shapes = {{3}, {2, 4}};
   for (const auto& repeat_counts : all_repeats) {
     for (const auto& shape : all_shapes) {
       torch::Tensor source = torch::rand(shape, float_opts);
       torch::Tensor expected = source.repeat(repeat_counts);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_source = CopyToDevice(source, device);
         torch::Tensor lazy_repeated = lazy_source.repeat(repeat_counts);
         AllClose(expected, lazy_repeated);
       });
     }
   }
 }

 // Gathers along dimension 1 with a hand-filled index, for both values of
 // sparse_grad.
 TEST_F(LazyOpsTest, TestGather) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const auto long_opts =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor source = torch::rand({3, 3}, float_opts);
   torch::Tensor index = torch::empty({3, 3}, long_opts);
   // Every index value is (row + col) % 3, so it stays inside [0, 3).
   for (int row = 0; row < 3; row++) {
     for (int col = 0; col < 3; col++) {
       index[row][col] = (row + col) % 3;
     }
   }
   for (bool sparse_grad : {false, true}) {
     torch::Tensor expected = torch::gather(source, 1, index, sparse_grad);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor lazy_index = CopyToDevice(index, device);
       torch::Tensor lazy_gathered =
           torch::gather(lazy_source, 1, lazy_index, sparse_grad);
       AllClose(expected, lazy_gathered);
     });
   }
 }

 // Scatters a {3, 5} source into a {3, 5} base along dimensions 0 and 1. The
 // index tensor is refilled for each dimension so its values stay below the
 // size of that dimension.
 TEST_F(LazyOpsTest, TestScatter) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const auto long_opts =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor base = torch::rand({3, 5}, float_opts);
   torch::Tensor src = torch::rand({3, 5}, float_opts);
   torch::Tensor index = torch::empty({3, 5}, long_opts);
   for (int axis = 0; axis < 2; ++axis) {
     const int64_t axis_size = index.sizes()[axis];
     for (int row = 0; row < 3; row++) {
       for (int col = 0; col < 5; col++) {
         index[row][col] = (row + col) % axis_size;
       }
     }
     torch::Tensor expected = torch::scatter(base, axis, index, src);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_base = CopyToDevice(base, device);
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor lazy_index = CopyToDevice(index, device);
       torch::Tensor lazy_scattered =
           torch::scatter(lazy_base, axis, lazy_index, lazy_src);
       AllClose(expected, lazy_scattered);
     });
   }
 }

 // Rank-1 scatter: writes two source values into positions 1 and 3 of a
 // five-element base.
 TEST_F(LazyOpsTest, TestScatterR1) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const auto long_opts =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor base = torch::rand({5}, float_opts);
   torch::Tensor src = torch::rand({2}, float_opts);
   torch::Tensor index = torch::empty({2}, long_opts);
   index[0] = 1;
   index[1] = 3;
   torch::Tensor expected = torch::scatter(base, 0, index, src);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_base = CopyToDevice(base, device);
     torch::Tensor lazy_src = CopyToDevice(src, device);
     torch::Tensor lazy_index = CopyToDevice(index, device);
     torch::Tensor lazy_scattered =
         torch::scatter(lazy_base, 0, lazy_index, lazy_src);
     AllClose(expected, lazy_scattered);
   });
 }

 // Rank-3 scatter along dimension 1: a {3, 4, 2} source is written into a
 // {3, 5, 2} base using indices in [0, 4).
 TEST_F(LazyOpsTest, TestScatterR3) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const auto long_opts =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor base = torch::rand({3, 5, 2}, float_opts);
   torch::Tensor src = torch::rand({3, 4, 2}, float_opts);
   torch::Tensor index = torch::empty({3, 4, 2}, long_opts);
   for (int x = 0; x < 3; x++) {
     for (int y = 0; y < 4; y++) {
       for (int z = 0; z < 2; z++) {
         index[x][y][z] = (x + y + z) % 4;
       }
     }
   }
   torch::Tensor expected = torch::scatter(base, 1, index, src);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_base = CopyToDevice(base, device);
     torch::Tensor lazy_src = CopyToDevice(src, device);
     torch::Tensor lazy_index = CopyToDevice(index, device);
     torch::Tensor lazy_scattered =
         torch::scatter(lazy_base, 1, lazy_index, lazy_src);
     AllClose(expected, lazy_scattered);
   });
 }

 // Scatter where the source ({8, 8}) is larger than both the base and the
 // index ({4, 4}), along dimensions 0 and 1.
 TEST_F(LazyOpsTest, TestScatterBiggerSource) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const auto long_opts =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor base = torch::rand({4, 4}, float_opts);
   torch::Tensor src = torch::rand({8, 8}, float_opts);
   torch::Tensor index = torch::empty({4, 4}, long_opts);
   for (int row = 0; row < 4; row++) {
     for (int col = 0; col < 4; col++) {
       index[row][col] = (row + col) % 4;
     }
   }
   for (int axis = 0; axis < 2; ++axis) {
     torch::Tensor expected = torch::scatter(base, axis, index, src);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_base = CopyToDevice(base, device);
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor lazy_index = CopyToDevice(index, device);
       torch::Tensor lazy_scattered =
           torch::scatter(lazy_base, axis, lazy_index, lazy_src);
       AllClose(expected, lazy_scattered);
     });
   }
 }

 // Scatter with a scalar value (1.0f) instead of a source tensor, along
 // dimensions 0 and 1.
 TEST_F(LazyOpsTest, TestScatterScalar) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const auto long_opts =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor base = torch::rand({4, 4}, float_opts);
   torch::Scalar fill_value = 1.0f;
   torch::Tensor index = torch::empty({4, 4}, long_opts);
   for (int row = 0; row < 4; row++) {
     for (int col = 0; col < 4; col++) {
       index[row][col] = (row + col) % 4;
     }
   }
   for (int axis = 0; axis < 2; ++axis) {
     torch::Tensor expected = torch::scatter(base, axis, index, fill_value);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_base = CopyToDevice(base, device);
       torch::Tensor lazy_index = CopyToDevice(index, device);
       // The scalar is passed as-is; only the tensors are copied to `device`.
       torch::Tensor lazy_scattered =
           torch::scatter(lazy_base, axis, lazy_index, fill_value);
       AllClose(expected, lazy_scattered);
     });
   }
 }

 // Scatter with the "add" reduction along dimensions 0 and 1, followed by the
 // counter expectations for aten:: and lazy::scatter_out.
 TEST_F(LazyOpsTest, TestScatterReduceAdd) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const auto long_opts =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor base = torch::rand({3, 5}, float_opts);
   torch::Tensor src = torch::rand({3, 5}, float_opts);
   torch::Tensor index = torch::empty({3, 5}, long_opts);
   for (int axis = 0; axis < 2; ++axis) {
     // Refill the index so its values stay below the size of `axis`.
     const int64_t axis_size = index.sizes()[axis];
     for (int row = 0; row < 3; row++) {
       for (int col = 0; col < 5; col++) {
         index[row][col] = (row + col) % axis_size;
       }
     }
     torch::Tensor expected = torch::scatter(base, axis, index, src, "add");
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_base = CopyToDevice(base, device);
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor lazy_index = CopyToDevice(index, device);
       torch::Tensor lazy_scattered =
           torch::scatter(lazy_base, axis, lazy_index, lazy_src, "add");
       AllClose(expected, lazy_scattered);
     });
   }

   ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
   ExpectCounterChanged("lazy::scatter_out", GetIgnoredCounters());
 }

 // Checks torch::scatter_add along dimensions 0 and 1 with an index that is
 // refilled per dimension.
 TEST_F(LazyOpsTest, TestScatterAdd) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const auto long_opts =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor base = torch::rand({3, 5}, float_opts);
   torch::Tensor src = torch::rand({3, 5}, float_opts);
   torch::Tensor index = torch::empty({3, 5}, long_opts);
   for (int axis = 0; axis < 2; ++axis) {
     const int64_t axis_size = index.sizes()[axis];
     for (int row = 0; row < 3; row++) {
       for (int col = 0; col < 5; col++) {
         index[row][col] = (row + col) % axis_size;
       }
     }
     torch::Tensor expected = torch::scatter_add(base, axis, index, src);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_base = CopyToDevice(base, device);
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor lazy_index = CopyToDevice(index, device);
       torch::Tensor lazy_summed =
           torch::scatter_add(lazy_base, axis, lazy_index, lazy_src);
       AllClose(expected, lazy_summed);
     });
   }
 }

 // In-place scatter_add_: checks both the returned tensor and the mutated
 // base tensor against their DefaultDevice() counterparts.
 TEST_F(LazyOpsTest, TestScatterAddInPlace) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const auto long_opts =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor src = torch::rand({4, 4}, float_opts);
   torch::Tensor index = torch::empty({4, 4}, long_opts);
   for (int row = 0; row < 4; row++) {
     for (int col = 0; col < 4; col++) {
       index[row][col] = (row + col) % 4;
     }
   }
   for (int axis = 0; axis < 2; ++axis) {
     ForEachDevice([&](const torch::Device& device) {
       // A fresh base is drawn per device because the op mutates it.
       torch::Tensor base = torch::rand({4, 4}, float_opts);
       // Copy the base to the device before the in-place op below changes it.
       torch::Tensor lazy_base = CopyToDevice(base, device);
       torch::Tensor lazy_src = CopyToDevice(src, device);
       torch::Tensor lazy_index = CopyToDevice(index, device);
       torch::Tensor expected = base.scatter_add_(axis, index, src);
       torch::Tensor lazy_result =
           lazy_base.scatter_add_(axis, lazy_index, lazy_src);
       AllClose(expected, lazy_result);
       AllClose(base, lazy_base);
     });
   }
 }

 // index_select over several value dtypes and two index dtypes, addressing
 // both dimensions of a {3, 4} tensor through positive and negative numbers.
 TEST_F(LazyOpsTest, TestIndexSelect) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto value_opts =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor source;
     if (isFloatingType(dtype)) {
       source = torch::rand({3, 4}, value_opts);
     } else {
       source = torch::randint(100, {3, 4}, value_opts);
     }
     for (torch::ScalarType index_dtype : {torch::kInt, torch::kLong}) {
       torch::Tensor index = torch::empty(
           {2}, torch::TensorOptions(index_dtype).device(DefaultDevice()));
       index[0] = 0;
       index[1] = 2;
       // An offset of -2 turns dimensions 0 and 1 into -2 and -1.
       for (auto dim_offset : {-2, 0}) {
         torch::Tensor expected_first =
             torch::index_select(source, 0 + dim_offset, index);
         torch::Tensor expected_second =
             torch::index_select(source, 1 + dim_offset, index);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_source = CopyToDevice(source, device);
           torch::Tensor lazy_index = CopyToDevice(index, device);
           torch::Tensor lazy_first =
               torch::index_select(lazy_source, 0 + dim_offset, lazy_index);
           torch::Tensor lazy_second =
               torch::index_select(lazy_source, 1 + dim_offset, lazy_index);
           AllEqual(expected_first, lazy_first);
           AllEqual(expected_second, lazy_second);
         });
       }
     }
   }
 }

 // index_select where the index is a scalar tensor holding 2, over several
 // value dtypes and both dimensions of a {3, 4} tensor.
 TEST_F(LazyOpsTest, TestIndexSelectRank0) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto value_opts =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor source;
     if (isFloatingType(dtype)) {
       source = torch::rand({3, 4}, value_opts);
     } else {
       source = torch::randint(100, {3, 4}, value_opts);
     }
     torch::Tensor scalar_index = torch::scalar_tensor(
         2, torch::TensorOptions(torch::kLong).device(DefaultDevice()));
     torch::Tensor expected_rows = torch::index_select(source, 0, scalar_index);
     torch::Tensor expected_cols = torch::index_select(source, 1, scalar_index);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor lazy_index = CopyToDevice(scalar_index, device);
       torch::Tensor lazy_rows = torch::index_select(lazy_source, 0, lazy_index);
       torch::Tensor lazy_cols = torch::index_select(lazy_source, 1, lazy_index);
       AllEqual(expected_rows, lazy_rows);
       AllEqual(expected_cols, lazy_cols);
     });
   }
 }

 // Inverts a random 5x5 matrix and compares with loosened tolerances. The
 // test is skipped when IsCuda() is true.
 TEST_F(LazyOpsTest, TestInverse) {
   if (IsCuda()) {
     // TODO(whc) debug failure on cuda, lazy_b comes back transposed
     GTEST_SKIP();
   }
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor matrix = torch::randn({5, 5}, float_opts);
   torch::Tensor expected = torch::inverse(matrix);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_matrix = CopyToDevice(matrix, device);
     torch::Tensor lazy_inverse = torch::inverse(lazy_matrix);
     AllClose(expected, lazy_inverse, /*rtol=*/1e-3, /*atol=*/1e-4);
   });
 }

 // isnan over a four-element tensor containing one NaN, followed by the
 // counter expectations for aten:: and lazy::isnan.
 TEST_F(LazyOpsTest, TestIsnan) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor values =
       torch::tensor({1.0, 2.0, std::nan("1"), 4.0}, float_opts);
   torch::Tensor expected = torch::isnan(values);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_values = CopyToDevice(values, device);
     torch::Tensor lazy_mask = torch::isnan(lazy_values);
     AllEqual(expected, lazy_mask);
   });
   ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
   ExpectCounterChanged("lazy::isnan", GetIgnoredCounters());
 }

 // Expands a {3, 4} tensor to {2, 3, 4}, adding a leading dimension.
 TEST_F(LazyOpsTest, TestExpand) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor source = torch::rand({3, 4}, float_opts);
   torch::Tensor expected = source.expand({2, 3, 4}, /*implicit=*/false);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor lazy_expanded =
         lazy_source.expand({2, 3, 4}, /*implicit=*/false);
     AllClose(expected, lazy_expanded);
   });
 }

 // Expands a {3, 1} tensor to {3, 4}, growing the trailing size-1 dimension.
 TEST_F(LazyOpsTest, TestExpandBack) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor source = torch::rand({3, 1}, float_opts);
   torch::Tensor expected = source.expand({3, 4}, /*implicit=*/false);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor lazy_expanded =
         lazy_source.expand({3, 4}, /*implicit=*/false);
     AllClose(expected, lazy_expanded);
   });
 }

 // Calls torch::native::expand_as with a {3, 4} tensor and a {2, 3, 4}
 // tensor as the second argument.
 TEST_F(LazyOpsTest, TestExpandAs) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor source = torch::rand({3, 4}, float_opts);
   torch::Tensor shape_like = torch::rand({2, 3, 4}, float_opts);
   torch::Tensor expected = torch::native::expand_as(source, shape_like);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor lazy_shape_like = CopyToDevice(shape_like, device);
     torch::Tensor lazy_expanded =
         torch::native::expand_as(lazy_source, lazy_shape_like);
     AllClose(expected, lazy_expanded);
   });
 }

 // Builds a 5x5 eye directly on each device and compares it with the one
 // built on DefaultDevice().
 TEST_F(LazyOpsTest, TestEye) {
   int side = 5;
   ForEachDevice([&](const torch::Device& device) {
     const auto reference_opts =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     const auto device_opts = torch::TensorOptions(torch::kFloat).device(device);
     torch::Tensor expected = torch::eye(side, reference_opts);
     torch::Tensor lazy_eye = torch::eye(side, device_opts);
     AllClose(expected, lazy_eye);
   });
 }

 // Rectangular eye with more columns (5) than rows (3).
 TEST_F(LazyOpsTest, TestEyeWide) {
   int rows = 3;
   int columns = 5;
   ForEachDevice([&](const torch::Device& device) {
     const auto reference_opts =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     const auto device_opts = torch::TensorOptions(torch::kFloat).device(device);
     torch::Tensor expected = torch::eye(rows, columns, reference_opts);
     torch::Tensor lazy_eye = torch::eye(rows, columns, device_opts);
     AllClose(expected, lazy_eye);
   });
 }

 // Rectangular eye with more rows (5) than columns (3).
 TEST_F(LazyOpsTest, TestEyeNarrow) {
   int rows = 5;
   int columns = 3;
   ForEachDevice([&](const torch::Device& device) {
     const auto reference_opts =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     const auto device_opts = torch::TensorOptions(torch::kFloat).device(device);
     torch::Tensor expected = torch::eye(rows, columns, reference_opts);
     torch::Tensor lazy_eye = torch::eye(rows, columns, device_opts);
     AllClose(expected, lazy_eye);
   });
 }

 // broadcast_tensors over a {2, 1, 1} and a {2, 1} tensor; every output is
 // compared element-wise.
 TEST_F(LazyOpsTest, TestBroadcastTensors) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor first = torch::rand({2, 1, 1}, float_opts);
   torch::Tensor second = torch::rand({2, 1}, float_opts);
   std::vector<torch::Tensor> expected =
       torch::broadcast_tensors({first, second});
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_first = CopyToDevice(first, device);
     torch::Tensor lazy_second = CopyToDevice(second, device);
     std::vector<torch::Tensor> lazy_outputs =
         torch::broadcast_tensors({lazy_first, lazy_second});
     // Output counts must match before indexing both vectors in lockstep.
     ASSERT_EQ(expected.size(), lazy_outputs.size());
     for (size_t idx = 0; idx < expected.size(); ++idx) {
       AllClose(expected[idx], lazy_outputs[idx]);
     }
   });
 }

 // torch::index with a single long index tensor (values in [-3, 3)) over
 // several value dtypes.
 TEST_F(LazyOpsTest, TestOneIndex) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto value_opts =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const auto long_opts =
         torch::TensorOptions(torch::kLong).device(DefaultDevice());
     torch::Tensor source;
     if (isFloatingType(dtype)) {
       source = torch::rand({4, 3, 5, 6, 7}, value_opts);
     } else {
       source = torch::randint(100, {4, 3, 5, 6, 7}, value_opts);
     }
     torch::Tensor index = torch::randint(-3, 3, {2, 4, 3}, long_opts);
     torch::Tensor expected = torch::index(source, {index});
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor lazy_index = CopyToDevice(index, device);
       torch::Tensor lazy_indexed = torch::index(lazy_source, {lazy_index});
       AllEqual(expected, lazy_indexed);
     });
   }
 }

 // Same as the single-index case, but the index passed to the device-side
 // call is `indices.cpu()` rather than a copy made with CopyToDevice.
 TEST_F(LazyOpsTest, TestOneIndexTransfer) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto value_opts =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const auto long_opts =
         torch::TensorOptions(torch::kLong).device(DefaultDevice());
     torch::Tensor source;
     if (isFloatingType(dtype)) {
       source = torch::rand({4, 3, 5, 6, 7}, value_opts);
     } else {
       source = torch::randint(100, {4, 3, 5, 6, 7}, value_opts);
     }
     torch::Tensor index = torch::randint(-3, 3, {2, 4, 3}, long_opts);
     torch::Tensor expected = torch::index(source, {index});
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor lazy_indexed = torch::index(lazy_source, {index.cpu()});
       AllEqual(expected, lazy_indexed);
     });
   }
 }

 // nonzero over a {4, 2} tensor with three hand-placed non-zero entries.
 TEST_F(LazyOpsTest, TestNonzero) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor values = torch::zeros({4, 2}, float_opts);
   values[0][1] = 1.0;
   values[1][0] = 2.0;
   values[3][1] = 3.0;
   torch::Tensor expected = torch::nonzero(values);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_values = CopyToDevice(values, device);
     torch::Tensor lazy_positions = torch::nonzero(lazy_values);
     AllClose(expected, lazy_positions);

     // With the "nonzero" experiment enabled, no aten:: counter may change.
     if (DebugUtil::ExperimentEnabled("nonzero")) {
       ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
     }
     ResetCounters();
   });
 }

 // masked_select on a {3, 5} tensor with a random bool mask of shape {5}.
 TEST_F(LazyOpsTest, TestMaskedSelect) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const auto bool_opts =
       torch::TensorOptions(torch::kBool).device(DefaultDevice());
   torch::Tensor source = torch::rand({3, 5}, float_opts);
   torch::Tensor mask = torch::randint(0, 2, {5}, bool_opts);
   torch::Tensor expected = torch::masked_select(source, mask);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_source = CopyToDevice(source, device);
     torch::Tensor lazy_mask = CopyToDevice(mask, device);
     torch::Tensor lazy_selected = torch::masked_select(lazy_source, lazy_mask);
     AllClose(expected, lazy_selected);

     // With the "masked_select" experiment enabled, no aten:: counter may
     // change.
     if (DebugUtil::ExperimentEnabled("masked_select")) {
       ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
     }
     ResetCounters();
   });
 }

 // masked_scatter on a {3, 5} tensor with a same-shaped random bool mask and
 // a 15-element source.
 TEST_F(LazyOpsTest, TestMaskedScatter) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const auto bool_opts =
       torch::TensorOptions(torch::kBool).device(DefaultDevice());
   torch::Tensor base = torch::rand({3, 5}, float_opts);
   torch::Tensor mask = torch::randint(0, 2, {3, 5}, bool_opts);
   torch::Tensor src = torch::rand({15}, float_opts);
   torch::Tensor expected = torch::masked_scatter(base, mask, src);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_base = CopyToDevice(base, device);
     torch::Tensor lazy_mask = CopyToDevice(mask, device);
     torch::Tensor lazy_src = CopyToDevice(src, device);
     torch::Tensor lazy_scattered =
         torch::masked_scatter(lazy_base, lazy_mask, lazy_src);
     AllClose(expected, lazy_scattered);

     // With the "masked_scatter" experiment enabled, no aten:: counter may
     // change.
     if (DebugUtil::ExperimentEnabled("masked_scatter")) {
       ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
     }
     ResetCounters();
   });
 }

 // torch::index with three index slots where the first is a
 // default-constructed tensor and the other two are long index tensors.
 TEST_F(LazyOpsTest, TestMultiIndexHeadNull) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto value_opts =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const auto long_opts =
         torch::TensorOptions(torch::kLong).device(DefaultDevice());
     torch::Tensor source;
     if (isFloatingType(dtype)) {
       source = torch::rand({4, 3, 5, 6, 7}, value_opts);
     } else {
       source = torch::randint(100, {4, 3, 5, 6, 7}, value_opts);
     }
     torch::Tensor no_index;
     torch::Tensor index_a = torch::randint(-3, 3, {2, 4, 3}, long_opts);
     torch::Tensor index_b = torch::randint(-3, 3, {2, 4, 3}, long_opts);
     torch::Tensor expected = torch::index(source, {no_index, index_a, index_b});
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor lazy_index_a = CopyToDevice(index_a, device);
       torch::Tensor lazy_index_b = CopyToDevice(index_b, device);
       // The default-constructed slot is passed through without a copy.
       torch::Tensor lazy_indexed =
           torch::index(lazy_source, {no_index, lazy_index_a, lazy_index_b});
       AllEqual(expected, lazy_indexed);
     });
   }
 }

 // torch::index with three index slots where the middle one is a
 // default-constructed tensor.
 TEST_F(LazyOpsTest, TestMultiIndexMiddleNull) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto value_opts =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const auto long_opts =
         torch::TensorOptions(torch::kLong).device(DefaultDevice());
     torch::Tensor source;
     if (isFloatingType(dtype)) {
       source = torch::rand({4, 3, 5, 6, 7}, value_opts);
     } else {
       source = torch::randint(100, {4, 3, 5, 6, 7}, value_opts);
     }
     torch::Tensor index_a = torch::randint(-3, 3, {2, 4, 3}, long_opts);
     torch::Tensor no_index;
     torch::Tensor index_b = torch::randint(-3, 3, {2, 4, 3}, long_opts);
     torch::Tensor expected = torch::index(source, {index_a, no_index, index_b});
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor lazy_index_a = CopyToDevice(index_a, device);
       torch::Tensor lazy_index_b = CopyToDevice(index_b, device);
       // The default-constructed slot is passed through without a copy.
       torch::Tensor lazy_indexed =
           torch::index(lazy_source, {lazy_index_a, no_index, lazy_index_b});
       AllEqual(expected, lazy_indexed);
     });
   }
 }

 // torch::index with three index slots where the last one is a
 // default-constructed tensor.
 TEST_F(LazyOpsTest, TestMultiIndexTailNull) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto value_opts =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const auto long_opts =
         torch::TensorOptions(torch::kLong).device(DefaultDevice());
     torch::Tensor source;
     if (isFloatingType(dtype)) {
       source = torch::rand({4, 3, 5, 6, 7}, value_opts);
     } else {
       source = torch::randint(100, {4, 3, 5, 6, 7}, value_opts);
     }
     torch::Tensor index_a = torch::randint(-3, 3, {2, 4, 3}, long_opts);
     torch::Tensor no_index;
     torch::Tensor index_b = torch::randint(-3, 3, {2, 4, 3}, long_opts);
     torch::Tensor expected = torch::index(source, {index_a, index_b, no_index});
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor lazy_index_a = CopyToDevice(index_a, device);
       torch::Tensor lazy_index_b = CopyToDevice(index_b, device);
       // The default-constructed slot is passed through without a copy.
       torch::Tensor lazy_indexed =
           torch::index(lazy_source, {lazy_index_a, lazy_index_b, no_index});
       AllEqual(expected, lazy_indexed);
     });
   }
 }

 // torch::index with two index tensors of shapes {2, 4, 3} and {2, 1, 3};
 // the second has size 1 in its middle dimension.
 TEST_F(LazyOpsTest, TestMultiIndexMiddleBroadcast) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto value_opts =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const auto long_opts =
         torch::TensorOptions(torch::kLong).device(DefaultDevice());
     torch::Tensor source;
     if (isFloatingType(dtype)) {
       source = torch::rand({4, 3, 5, 6, 7}, value_opts);
     } else {
       source = torch::randint(100, {4, 3, 5, 6, 7}, value_opts);
     }
     torch::Tensor index_a = torch::randint(-3, 3, {2, 4, 3}, long_opts);
     torch::Tensor index_b = torch::randint(-3, 3, {2, 1, 3}, long_opts);
     torch::Tensor expected = torch::index(source, {index_a, index_b});
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor lazy_index_a = CopyToDevice(index_a, device);
       torch::Tensor lazy_index_b = CopyToDevice(index_b, device);
       torch::Tensor lazy_indexed =
           torch::index(lazy_source, {lazy_index_a, lazy_index_b});
       AllEqual(expected, lazy_indexed);
     });
   }
 }

 // torch::index with two index tensors of different ranks: {2, 1, 3} and
 // {2, 1}.
 TEST_F(LazyOpsTest, TestMultiIndexTailBroadcast) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto value_opts =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const auto long_opts =
         torch::TensorOptions(torch::kLong).device(DefaultDevice());
     torch::Tensor source;
     if (isFloatingType(dtype)) {
       source = torch::rand({4, 3, 5, 6, 7}, value_opts);
     } else {
       source = torch::randint(100, {4, 3, 5, 6, 7}, value_opts);
     }
     torch::Tensor index_a = torch::randint(-3, 3, {2, 1, 3}, long_opts);
     torch::Tensor index_b = torch::randint(-3, 3, {2, 1}, long_opts);
     torch::Tensor expected = torch::index(source, {index_a, index_b});
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor lazy_index_a = CopyToDevice(index_a, device);
       torch::Tensor lazy_index_b = CopyToDevice(index_b, device);
       torch::Tensor lazy_indexed =
           torch::index(lazy_source, {lazy_index_a, lazy_index_b});
       AllEqual(expected, lazy_indexed);
     });
   }
 }

 // torch::index on a {2, 2} tensor using a random bool mask of the same
 // shape, over several value dtypes.
 TEST_F(LazyOpsTest, TestMaskIndex) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto value_opts =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const auto bool_opts =
         torch::TensorOptions(torch::kBool).device(DefaultDevice());
     torch::Tensor source;
     if (isFloatingType(dtype)) {
       source = torch::rand({2, 2}, value_opts);
     } else {
       source = torch::randint(100, {2, 2}, value_opts);
     }
     torch::Tensor mask = torch::randint(0, 2, {2, 2}, bool_opts);
     torch::Tensor expected = torch::index(source, {mask});
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_source = CopyToDevice(source, device);
       torch::Tensor lazy_mask = CopyToDevice(mask, device);
       torch::Tensor lazy_indexed = torch::index(lazy_source, {lazy_mask});
       AllEqual(expected, lazy_indexed);
     });
   }
 }

 // Out-of-place index_put with one long index tensor, over several value
 // dtypes and both settings of `accumulate`.
 TEST_F(LazyOpsTest, TestOneIndexPut) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto value_opts =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const auto long_opts =
         torch::TensorOptions(torch::kLong).device(DefaultDevice());
     const bool is_float = isFloatingType(dtype);
     torch::Tensor target;
     if (is_float) {
       target = torch::rand({4, 3, 5, 6, 7}, value_opts);
     } else {
       target = torch::randint(100, {4, 3, 5, 6, 7}, value_opts);
     }
     torch::Tensor index = torch::randint(-3, 3, {2, 4, 3}, long_opts);
     torch::Tensor updates;
     if (is_float) {
       updates = torch::rand({3, 5, 6, 7}, value_opts);
     } else {
       updates = torch::randint(100, {3, 5, 6, 7}, value_opts);
     }
     for (bool accumulate : {false, true}) {
       // NOTE(review): GTEST_SKIP() leaves the test body, so when IsCuda() is
       // true the dtypes after the first are never exercised - confirm this
       // is intended.
       if (accumulate && IsCuda()) {
         GTEST_SKIP();
       }
       torch::Tensor expected =
           torch::index_put(target, {index}, updates, accumulate);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_target = CopyToDevice(target, device);
         torch::Tensor lazy_index = CopyToDevice(index, device);
         torch::Tensor lazy_updates = CopyToDevice(updates, device);
         torch::Tensor lazy_put = torch::index_put(
             lazy_target, {lazy_index}, lazy_updates, accumulate);
         AllEqual(expected, lazy_put);
       });
     }
   }
 }

 // In-place index_put_ with one long index tensor and all-ones updates;
 // checks both the returned tensor and the mutated target.
 TEST_F(LazyOpsTest, TestOneIndexPutInPlace) {
   const auto long_opts =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor index = torch::randint(-3, 3, {2, 4, 3}, long_opts);
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto value_opts =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor updates = torch::ones({3, 5, 6, 7}, value_opts);
     for (bool accumulate : {false, true}) {
       // NOTE(review): GTEST_SKIP() leaves the test body, so when IsCuda() is
       // true the dtypes after the first are never exercised - confirm this
       // is intended.
       if (accumulate && IsCuda()) {
         GTEST_SKIP();
       }
       ForEachDevice([&](const torch::Device& device) {
         // A fresh target is drawn per device because the op mutates it.
         torch::Tensor target;
         if (isFloatingType(dtype)) {
           target = torch::rand({4, 3, 5, 6, 7}, value_opts);
         } else {
           target = torch::randint(100, {4, 3, 5, 6, 7}, value_opts);
         }
         // Copy the target to the device before it is modified in place.
         torch::Tensor lazy_target = CopyToDevice(target.clone(), device);
         torch::Tensor lazy_index = CopyToDevice(index, device);
         torch::Tensor lazy_updates = CopyToDevice(updates, device);
         torch::Tensor expected =
             torch::index_put_(target, {index}, updates, accumulate);
         torch::Tensor lazy_put = torch::index_put_(
             lazy_target, {lazy_index}, lazy_updates, accumulate);
         AllEqual(expected, lazy_put);
         AllEqual(target, lazy_target);
       });
     }
   }
 }

 // index_put where params and values are copied to the device under test but
 // the index tensor is passed as created on the default device.
 TEST_F(LazyOpsTest, TestOneIndexPutTransfer) {
   const auto index_options =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor indices = torch::randint(-3, 3, {2, 4, 3}, index_options);
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor params = isFloatingType(dtype)
         ? torch::rand({4, 3, 5, 6, 7}, data_options)
         : torch::randint(100, {4, 3, 5, 6, 7}, data_options);
     torch::Tensor values = torch::ones({3, 5, 6, 7}, data_options);
     for (bool accumulate : {false, true}) {
       if (accumulate && IsCuda()) {
         GTEST_SKIP();
       }
       torch::Tensor expected =
           torch::index_put(params, {indices}, values, accumulate);
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor dev_params = CopyToDevice(params, dev);
         torch::Tensor dev_values = CopyToDevice(values, dev);
         // `indices` is intentionally not copied to `dev` here.
         torch::Tensor actual =
             torch::index_put(dev_params, {indices}, dev_values, accumulate);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // index_put with two index tensors of identical shape {2, 4, 3}.
 TEST_F(LazyOpsTest, TestMultiIndexPut) {
   const auto index_options =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor first_index = torch::randint(-3, 3, {2, 4, 3}, index_options);
   torch::Tensor second_index =
       torch::randint(-3, 3, {2, 4, 3}, index_options);
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor params = isFloatingType(dtype)
         ? torch::rand({4, 3, 5, 6, 7}, data_options)
         : torch::randint(100, {4, 3, 5, 6, 7}, data_options);
     torch::Tensor values = torch::ones({5, 6, 7}, data_options);
     for (bool accumulate : {false, true}) {
       if (accumulate && IsCuda()) {
         GTEST_SKIP();
       }
       torch::Tensor expected = torch::index_put(
           params, {first_index, second_index}, values, accumulate);
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor dev_params = CopyToDevice(params, dev);
         torch::Tensor dev_first = CopyToDevice(first_index, dev);
         torch::Tensor dev_second = CopyToDevice(second_index, dev);
         torch::Tensor dev_values = CopyToDevice(values, dev);
         torch::Tensor actual = torch::index_put(
             dev_params, {dev_first, dev_second}, dev_values, accumulate);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // index_put with three index slots where the first slot is a
 // default-constructed (undefined) tensor.
 TEST_F(LazyOpsTest, TestMultiIndexPutHeadNull) {
   const auto index_options =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor first_index = torch::randint(-3, 3, {2, 4, 3}, index_options);
   torch::Tensor second_index =
       torch::randint(-3, 3, {2, 4, 3}, index_options);
   // Undefined tensor used as the leading index entry.
   torch::Tensor no_index;
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor params = isFloatingType(dtype)
         ? torch::rand({4, 3, 3, 6, 7}, data_options)
         : torch::randint(100, {4, 3, 3, 6, 7}, data_options);
     torch::Tensor values = torch::ones({3, 6, 7}, data_options);
     for (bool accumulate : {false, true}) {
       if (accumulate && IsCuda()) {
         GTEST_SKIP();
       }
       torch::Tensor expected = torch::index_put(
           params, {no_index, first_index, second_index}, values, accumulate);
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor dev_params = CopyToDevice(params, dev);
         torch::Tensor dev_first = CopyToDevice(first_index, dev);
         torch::Tensor dev_second = CopyToDevice(second_index, dev);
         torch::Tensor dev_values = CopyToDevice(values, dev);
         torch::Tensor actual = torch::index_put(
             dev_params,
             {no_index, dev_first, dev_second},
             dev_values,
             accumulate);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // index_put with three index slots where the middle slot is a
 // default-constructed (undefined) tensor.
 TEST_F(LazyOpsTest, TestMultiIndexPutMiddleNull) {
   const auto index_options =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor first_index = torch::randint(-3, 3, {2, 4, 3}, index_options);
   torch::Tensor second_index =
       torch::randint(-3, 3, {2, 4, 3}, index_options);
   // Undefined tensor used as the middle index entry.
   torch::Tensor no_index;
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor params = isFloatingType(dtype)
         ? torch::rand({4, 3, 3, 6, 7}, data_options)
         : torch::randint(100, {4, 3, 3, 6, 7}, data_options);
     torch::Tensor values = torch::ones({3, 6, 7}, data_options);
     for (bool accumulate : {false, true}) {
       if (accumulate && IsCuda()) {
         GTEST_SKIP();
       }
       torch::Tensor expected = torch::index_put(
           params, {first_index, no_index, second_index}, values, accumulate);
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor dev_params = CopyToDevice(params, dev);
         torch::Tensor dev_first = CopyToDevice(first_index, dev);
         torch::Tensor dev_second = CopyToDevice(second_index, dev);
         torch::Tensor dev_values = CopyToDevice(values, dev);
         torch::Tensor actual = torch::index_put(
             dev_params,
             {dev_first, no_index, dev_second},
             dev_values,
             accumulate);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // index_put with three index slots where the last slot is a
 // default-constructed (undefined) tensor.
 TEST_F(LazyOpsTest, TestMultiIndexPutTailNull) {
   const auto index_options =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor first_index = torch::randint(-3, 3, {2, 4, 3}, index_options);
   torch::Tensor second_index =
       torch::randint(-3, 3, {2, 4, 3}, index_options);
   // Undefined tensor used as the trailing index entry.
   torch::Tensor no_index;
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor params = isFloatingType(dtype)
         ? torch::rand({4, 3, 3, 6, 7}, data_options)
         : torch::randint(100, {4, 3, 3, 6, 7}, data_options);
     torch::Tensor values = torch::ones({3, 6, 7}, data_options);
     for (bool accumulate : {false, true}) {
       if (accumulate && IsCuda()) {
         GTEST_SKIP();
       }
       torch::Tensor expected = torch::index_put(
           params, {first_index, second_index, no_index}, values, accumulate);
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor dev_params = CopyToDevice(params, dev);
         torch::Tensor dev_first = CopyToDevice(first_index, dev);
         torch::Tensor dev_second = CopyToDevice(second_index, dev);
         torch::Tensor dev_values = CopyToDevice(values, dev);
         torch::Tensor actual = torch::index_put(
             dev_params,
             {dev_first, dev_second, no_index},
             dev_values,
             accumulate);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // index_put with two index tensors whose shapes differ in the middle
 // dimension ({2, 4, 3} versus {2, 1, 3}).
 TEST_F(LazyOpsTest, TestMultiIndexPutMiddleBroadcast) {
   const auto index_options =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor first_index = torch::randint(-3, 3, {2, 4, 3}, index_options);
   torch::Tensor second_index =
       torch::randint(-3, 3, {2, 1, 3}, index_options);
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor params = isFloatingType(dtype)
         ? torch::rand({4, 3, 5, 6, 7}, data_options)
         : torch::randint(100, {4, 3, 5, 6, 7}, data_options);
     torch::Tensor values = torch::ones({5, 6, 7}, data_options);
     for (bool accumulate : {false, true}) {
       if (accumulate && IsCuda()) {
         GTEST_SKIP();
       }
       torch::Tensor expected = torch::index_put(
           params, {first_index, second_index}, values, accumulate);
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor dev_params = CopyToDevice(params, dev);
         torch::Tensor dev_first = CopyToDevice(first_index, dev);
         torch::Tensor dev_second = CopyToDevice(second_index, dev);
         torch::Tensor dev_values = CopyToDevice(values, dev);
         torch::Tensor actual = torch::index_put(
             dev_params, {dev_first, dev_second}, dev_values, accumulate);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // index_put with two index tensors of different rank ({2, 1, 3} versus
 // {2, 1}).
 TEST_F(LazyOpsTest, TestMultiIndexPutTailBroadcast) {
   const auto index_options =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor first_index = torch::randint(-3, 3, {2, 1, 3}, index_options);
   torch::Tensor second_index = torch::randint(-3, 3, {2, 1}, index_options);
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor params = isFloatingType(dtype)
         ? torch::rand({4, 3, 5, 6, 7}, data_options)
         : torch::randint(100, {4, 3, 5, 6, 7}, data_options);
     torch::Tensor values = torch::ones({5, 6, 7}, data_options);
     for (bool accumulate : {false, true}) {
       if (accumulate && IsCuda()) {
         GTEST_SKIP();
       }
       torch::Tensor expected = torch::index_put(
           params, {first_index, second_index}, values, accumulate);
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor dev_params = CopyToDevice(params, dev);
         torch::Tensor dev_first = CopyToDevice(first_index, dev);
         torch::Tensor dev_second = CopyToDevice(second_index, dev);
         torch::Tensor dev_values = CopyToDevice(values, dev);
         torch::Tensor actual = torch::index_put(
             dev_params, {dev_first, dev_second}, dev_values, accumulate);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // index_put driven by a boolean mask (built from the byte tensor {0, 1})
 // instead of integer indices.
 TEST_F(LazyOpsTest, TestMaskIndexPut) {
   torch::Tensor mask_bytes = torch::tensor(
       {0, 1}, torch::TensorOptions(torch::kByte).device(DefaultDevice()));
   torch::Tensor mask = mask_bytes.to(torch::kBool);
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor params = isFloatingType(dtype)
         ? torch::rand({2, 2}, data_options)
         : torch::randint(100, {2, 2}, data_options);
     torch::Tensor values = torch::ones({2}, data_options);
     for (bool accumulate : {false, true}) {
       torch::Tensor expected =
           torch::index_put(params, {mask}, values, accumulate);
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor dev_params = CopyToDevice(params, dev);
         torch::Tensor dev_mask = CopyToDevice(mask, dev);
         torch::Tensor dev_values = CopyToDevice(values, dev);
         torch::Tensor actual =
             torch::index_put(dev_params, {dev_mask}, dev_values, accumulate);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // Exercises torch::_index_put_impl_ with unsafe=true. It is an in-place op,
 // so both the returned tensor and the mutated input are compared.
 TEST_F(LazyOpsTest, TestIndexPutImpl) {
   const auto index_options =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   torch::Tensor indices = torch::randint(-3, 3, {2, 4, 3}, index_options);
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor values = torch::ones({3, 5, 6, 7}, data_options);
     for (bool accumulate : {false, true}) {
       if (accumulate && IsCuda()) {
         GTEST_SKIP();
       }
       ForEachDevice([&](const torch::Device& dev) {
         // Fresh input per device because the eager op below mutates it.
         torch::Tensor params = isFloatingType(dtype)
             ? torch::rand({4, 3, 5, 6, 7}, data_options)
             : torch::randint(100, {4, 3, 5, 6, 7}, data_options);
         // Copy the untouched input to the device before mutating `params`.
         torch::Tensor dev_params = CopyToDevice(params.clone(), dev);
         torch::Tensor dev_indices = CopyToDevice(indices, dev);
         torch::Tensor dev_values = CopyToDevice(values, dev);
         torch::Tensor expected = torch::_index_put_impl_(
             params, {indices}, values, accumulate, /*unsafe=*/true);
         torch::Tensor actual = torch::_index_put_impl_(
             dev_params,
             {dev_indices},
             dev_values,
             accumulate,
             /*unsafe=*/true);
         AllEqual(expected, actual);
         AllEqual(params, dev_params);
       });
     }
   }
 }

 // Out-of-place index_fill with a Scalar fill value, checked along every
 // dimension (negative and non-negative) of a {3, 4, 5} tensor.
 TEST_F(LazyOpsTest, TestIndexFillWithScalar) {
   torch::Tensor index = torch::tensor(
       {0, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice()));
   torch::Scalar fill_value = 42;
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor base = isFloatingType(dtype)
         ? torch::rand({3, 4, 5}, data_options)
         : torch::randint(100, {3, 4, 5}, data_options);
     const int rank = static_cast<int>(base.dim());
     for (int dim = -rank; dim < rank; ++dim) {
       torch::Tensor expected =
           torch::index_fill(base, dim, index, fill_value);
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor dev_base = CopyToDevice(base, dev);
         torch::Tensor dev_index = CopyToDevice(index, dev);
         torch::Tensor actual =
             torch::index_fill(dev_base, dim, dev_index, fill_value);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // In-place index_fill_ with a Scalar fill value. Both the returned tensor
 // and the mutated base are compared between the eager and the lazy run.
 TEST_F(LazyOpsTest, TestIndexFillWithScalarInPlace) {
   torch::Tensor index = torch::tensor(
       {0, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice()));
   torch::Scalar fill_value = 42;
   // Rank of the {3, 4, 5} base tensor created inside the device callback.
   const int rank = 3;
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     for (int dim = -rank; dim < rank; ++dim) {
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor base = isFloatingType(dtype)
             ? torch::rand({3, 4, 5}, data_options)
             : torch::randint(100, {3, 4, 5}, data_options);
         // Copy the untouched base to the device before mutating it.
         torch::Tensor dev_base = CopyToDevice(base.clone(), dev);
         torch::Tensor dev_index = CopyToDevice(index, dev);
         torch::Tensor expected = base.index_fill_(dim, index, fill_value);
         torch::Tensor actual =
             dev_base.index_fill_(dim, dev_index, fill_value);
         AllEqual(expected, actual);
         AllEqual(base, dev_base);
       });
     }
   }
 }

 // Out-of-place index_fill where the fill value is a 0-dim tensor of the
 // same dtype as the base.
 TEST_F(LazyOpsTest, TestIndexFillWithTensor) {
   torch::Tensor index = torch::tensor(
       {0, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice()));
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor base = isFloatingType(dtype)
         ? torch::rand({3, 4, 5}, data_options)
         : torch::randint(100, {3, 4, 5}, data_options);
     torch::Tensor fill_value = torch::scalar_tensor(42, data_options);
     const int rank = static_cast<int>(base.dim());
     for (int dim = -rank; dim < rank; ++dim) {
       torch::Tensor expected =
           torch::index_fill(base, dim, index, fill_value);
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor dev_base = CopyToDevice(base, dev);
         torch::Tensor dev_index = CopyToDevice(index, dev);
         torch::Tensor dev_fill = CopyToDevice(fill_value, dev);
         torch::Tensor actual =
             torch::index_fill(dev_base, dim, dev_index, dev_fill);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // In-place index_fill_ where the fill value is a 0-dim tensor. Both the
 // returned tensor and the mutated base are compared.
 TEST_F(LazyOpsTest, TestIndexFillWithTensorInPlace) {
   torch::Tensor index = torch::tensor(
       {0, 2}, torch::TensorOptions(torch::kLong).device(DefaultDevice()));
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor fill_value = torch::scalar_tensor(42, data_options);
     // Rank of the {3, 4, 5} base tensor created inside the device callback.
     const int rank = 3;
     for (int dim = -rank; dim < rank; ++dim) {
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor base = isFloatingType(dtype)
             ? torch::rand({3, 4, 5}, data_options)
             : torch::randint(100, {3, 4, 5}, data_options);
         // Copy the untouched base to the device before mutating it.
         torch::Tensor dev_base = CopyToDevice(base.clone(), dev);
         torch::Tensor dev_index = CopyToDevice(index, dev);
         torch::Tensor dev_fill = CopyToDevice(fill_value, dev);
         torch::Tensor expected = base.index_fill_(dim, index, fill_value);
         torch::Tensor actual = dev_base.index_fill_(dim, dev_index, dev_fill);
         AllEqual(expected, actual);
         AllEqual(base, dev_base);
       });
     }
   }
 }

 // index_fill where the index itself is a 0-dim tensor (the single position
 // 2) and the fill value is a 0-dim tensor.
 TEST_F(LazyOpsTest, TestIndexFillRank0) {
   torch::Tensor index = torch::scalar_tensor(
       2, torch::TensorOptions(torch::kLong).device(DefaultDevice()));
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     torch::Tensor base = isFloatingType(dtype)
         ? torch::rand({3, 4, 5}, data_options)
         : torch::randint(100, {3, 4, 5}, data_options);
     torch::Tensor fill_value = torch::scalar_tensor(42, data_options);
     const int rank = static_cast<int>(base.dim());
     for (int dim = -rank; dim < rank; ++dim) {
       torch::Tensor expected =
           torch::index_fill(base, dim, index, fill_value);
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor dev_base = CopyToDevice(base, dev);
         torch::Tensor dev_index = CopyToDevice(index, dev);
         torch::Tensor dev_fill = CopyToDevice(fill_value, dev);
         torch::Tensor actual =
             torch::index_fill(dev_base, dim, dev_index, dev_fill);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // Out-of-place index_add along every dimension of a {5, 3, 7} tensor, for
 // both int32 and int64 index tensors. Results are compared with AllClose.
 TEST_F(LazyOpsTest, TestIndexAdd) {
   const int index_count = 10;
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const bool is_float = isFloatingType(dtype);
     torch::Tensor base = is_float
         ? torch::rand({5, 3, 7}, data_options)
         : torch::randint(100, {5, 3, 7}, data_options);
     const int rank = static_cast<int>(base.dim());
     for (int dim = -rank; dim < rank; ++dim) {
       for (torch::ScalarType index_dtype : {torch::kInt, torch::kLong}) {
         torch::Tensor index = torch::randint(
             0,
             base.size(dim),
             {index_count},
             torch::TensorOptions(index_dtype).device(DefaultDevice()));
         // The source tensor has the shape of `base`, except that the indexed
         // dimension holds one slice per index entry.
         const auto base_sizes = base.sizes();
         std::vector<int64_t> source_sizes(
             base_sizes.begin(), base_sizes.end());
         const int canonical_dim = dim >= 0 ? dim : dim + rank;
         source_sizes[canonical_dim] = index_count;
         torch::Tensor source = is_float
             ? torch::rand(source_sizes, data_options)
             : torch::randint(100, source_sizes, data_options);
         torch::Tensor expected = torch::index_add(base, dim, index, source);
         ForEachDevice([&](const torch::Device& dev) {
           torch::Tensor dev_base = CopyToDevice(base, dev);
           torch::Tensor dev_index = CopyToDevice(index, dev);
           torch::Tensor dev_source = CopyToDevice(source, dev);
           torch::Tensor actual =
               torch::index_add(dev_base, dim, dev_index, dev_source);
           AllClose(expected, actual);
         });
       }
     }
   }
 }

 // In-place index_add_ along every dimension of a {5, 3, 7} tensor. Both the
 // returned tensor and the mutated base are compared with AllClose.
 TEST_F(LazyOpsTest, TestIndexAddInPlace) {
   const int index_count = 10;
   // Rank of the {5, 3, 7} base tensor created inside the device callback.
   const int rank = 3;
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const bool is_float = isFloatingType(dtype);
     for (int dim = -rank; dim < rank; ++dim) {
       ForEachDevice([&](const torch::Device& dev) {
         // Creation order (base, index, source) is kept: each call draws
         // random numbers.
         torch::Tensor base = is_float
             ? torch::rand({5, 3, 7}, data_options)
             : torch::randint(100, {5, 3, 7}, data_options);
         torch::Tensor index = torch::randint(
             0,
             base.size(dim),
             {index_count},
             torch::TensorOptions(torch::kLong).device(DefaultDevice()));
         const auto base_sizes = base.sizes();
         std::vector<int64_t> source_sizes(
             base_sizes.begin(), base_sizes.end());
         const int canonical_dim = dim >= 0 ? dim : dim + rank;
         source_sizes[canonical_dim] = index_count;
         torch::Tensor source = is_float
             ? torch::rand(source_sizes, data_options)
             : torch::randint(100, source_sizes, data_options);
         // Copy the untouched base to the device before mutating it.
         torch::Tensor dev_base = CopyToDevice(base.clone(), dev);
         torch::Tensor dev_index = CopyToDevice(index, dev);
         torch::Tensor dev_source = CopyToDevice(source, dev);
         torch::Tensor expected = base.index_add_(dim, index, source);
         torch::Tensor actual = dev_base.index_add_(dim, dev_index, dev_source);
         AllClose(expected, actual);
         AllClose(base, dev_base);
       });
     }
   }
 }

 // index_add where the index is a 0-dim tensor, so the source has extent 1
 // along the indexed dimension.
 TEST_F(LazyOpsTest, TestIndexAddRank0) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const bool is_float = isFloatingType(dtype);
     torch::Tensor base = is_float
         ? torch::rand({5, 3, 7}, data_options)
         : torch::randint(100, {5, 3, 7}, data_options);
     const int rank = static_cast<int>(base.dim());
     for (int dim = -rank; dim < rank; ++dim) {
       // An empty size list produces a 0-dim index tensor.
       torch::Tensor index = torch::randint(
           0,
           base.size(dim),
           at::IntArrayRef{},
           torch::TensorOptions(torch::kLong).device(DefaultDevice()));
       const auto base_sizes = base.sizes();
       std::vector<int64_t> source_sizes(base_sizes.begin(), base_sizes.end());
       const int canonical_dim = dim >= 0 ? dim : dim + rank;
       source_sizes[canonical_dim] = 1;
       torch::Tensor source = is_float
           ? torch::rand(source_sizes, data_options)
           : torch::randint(100, source_sizes, data_options);
       torch::Tensor expected = torch::index_add(base, dim, index, source);
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor dev_base = CopyToDevice(base, dev);
         torch::Tensor dev_index = CopyToDevice(index, dev);
         torch::Tensor dev_source = CopyToDevice(source, dev);
         torch::Tensor actual =
             torch::index_add(dev_base, dim, dev_index, dev_source);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // Out-of-place index_copy along every dimension of a {5, 3, 7} tensor. The
 // index is a random permutation of the indexed dimension, and the source
 // has the same shape as the base.
 TEST_F(LazyOpsTest, TestIndexCopy) {
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const bool is_float = isFloatingType(dtype);
     torch::Tensor base = is_float
         ? torch::rand({5, 3, 7}, data_options)
         : torch::randint(100, {5, 3, 7}, data_options);
     const int rank = static_cast<int>(base.dim());
     for (int dim = -rank; dim < rank; ++dim) {
       torch::Tensor index = torch::randperm(
           base.size(dim),
           torch::TensorOptions(torch::kLong).device(DefaultDevice()));
       torch::Tensor source = is_float
           ? torch::rand(base.sizes(), data_options)
           : torch::randint(100, base.sizes(), data_options);
       torch::Tensor expected = torch::index_copy(base, dim, index, source);
       ForEachDevice([&](const torch::Device& dev) {
         torch::Tensor dev_base = CopyToDevice(base, dev);
         torch::Tensor dev_index = CopyToDevice(index, dev);
         torch::Tensor dev_source = CopyToDevice(source, dev);
         torch::Tensor actual =
             torch::index_copy(dev_base, dim, dev_index, dev_source);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // In-place index_copy_ along every dimension of a {5, 3, 7} tensor. The
 // whole test is skipped when IsCuda() is true. Both the returned tensor and
 // the mutated base are compared.
 TEST_F(LazyOpsTest, TestIndexCopyInPlace) {
   if (IsCuda()) {
     GTEST_SKIP();
   }
   const int index_count = 10;
   // Rank of the {5, 3, 7} base tensor created inside the device callback.
   const int rank = 3;
   for (torch::ScalarType dtype :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     const auto data_options =
         torch::TensorOptions(dtype).device(DefaultDevice());
     const bool is_float = isFloatingType(dtype);
     for (int dim = -rank; dim < rank; ++dim) {
       ForEachDevice([&](const torch::Device& dev) {
         // Creation order (base, index, source) is kept: each call draws
         // random numbers.
         torch::Tensor base = is_float
             ? torch::rand({5, 3, 7}, data_options)
             : torch::randint(100, {5, 3, 7}, data_options);
         torch::Tensor index = torch::randint(
             0,
             base.size(dim),
             {index_count},
             torch::TensorOptions(torch::kLong).device(DefaultDevice()));
         const auto base_sizes = base.sizes();
         std::vector<int64_t> source_sizes(
             base_sizes.begin(), base_sizes.end());
         const int canonical_dim = dim >= 0 ? dim : dim + rank;
         source_sizes[canonical_dim] = index_count;
         torch::Tensor source = is_float
             ? torch::rand(source_sizes, data_options)
             : torch::randint(100, source_sizes, data_options);
         // Copy the untouched base to the device before mutating it.
         torch::Tensor dev_base = CopyToDevice(base.clone(), dev);
         torch::Tensor dev_index = CopyToDevice(index, dev);
         torch::Tensor dev_source = CopyToDevice(source, dev);
         torch::Tensor expected = base.index_copy_(dim, index, source);
         torch::Tensor actual =
             dev_base.index_copy_(dim, dev_index, dev_source);
         AllEqual(expected, actual);
         AllEqual(base, dev_base);
       });
     }
   }
 }

 // index_copy with a rank-0 index tensor: for several dtypes and every
 // dimension of a {5, 3, 7} tensor, the result computed from the device
 // copies must equal the result computed from the original tensors.
 TEST_F(LazyOpsTest, TestIndexCopyRank0) {
   for (torch::ScalarType scalar_type :
        {torch::kFloat,
         torch::kByte,
         torch::kChar,
         torch::kShort,
         torch::kInt,
         torch::kLong}) {
     // Random tensor of the given shape: torch::rand for floating dtypes,
     // torch::randint with an upper bound of 100 otherwise.
     auto make_random = [&](at::IntArrayRef sizes) -> torch::Tensor {
       const auto options =
           torch::TensorOptions(scalar_type).device(DefaultDevice());
       if (isFloatingType(scalar_type)) {
         return torch::rand(sizes, options);
       }
       return torch::randint(100, sizes, options);
     };
     const torch::Tensor base = make_random({5, 3, 7});
     const int rank = base.dim();
     for (int dim = -rank; dim < rank; ++dim) {
       // An empty size list yields a rank-0 index tensor.
       const torch::Tensor index = torch::randint(
           0,
           base.size(dim),
           at::IntArrayRef{},
           torch::TensorOptions(torch::kLong).device(DefaultDevice()));
       // The value tensor has extent 1 along the copied dimension.
       std::vector<int64_t> value_sizes = base.sizes().vec();
       const int canonical_dim = (dim + rank) % rank;
       value_sizes[canonical_dim] = 1;
       const torch::Tensor value = make_random(value_sizes);
       const torch::Tensor expected =
           torch::index_copy(base, dim, index, value);
       ForEachDevice([&](const torch::Device& device) {
         const torch::Tensor lazy_base = CopyToDevice(base, device);
         const torch::Tensor lazy_index = CopyToDevice(index, device);
         const torch::Tensor lazy_value = CopyToDevice(value, device);
         const torch::Tensor actual =
             torch::index_copy(lazy_base, dim, lazy_index, lazy_value);
         AllEqual(expected, actual);
       });
     }
   }
 }

 // torch::relu applied to the device copy must match the result computed on
 // the original tensor.
 TEST_F(LazyOpsTest, TestRelu) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 1, 4, 6}, options);
   const torch::Tensor expected = torch::relu(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::relu(lazy_in);
     AllClose(expected, actual);
   });
 }

 // In-place relu_: the device copy is taken before the original is modified,
 // then both the returned tensors and the mutated inputs are compared.
 TEST_F(LazyOpsTest, TestReluInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager_in = torch::rand({2, 1, 4, 6}, options);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor expected = torch::relu_(eager_in);
     const torch::Tensor actual = torch::relu_(lazy_in);
     AllClose(expected, actual);
     AllClose(eager_in, lazy_in);
   });
 }

 // torch::hardshrink on the device copy must match the result computed on
 // the original tensor.
 TEST_F(LazyOpsTest, TestHardshrink) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::randn({10}, options);
   const torch::Tensor expected = torch::hardshrink(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::hardshrink(lazy_in);
     AllClose(expected, actual);
   });
 }

 // torch::hardsigmoid on the device copy must match the result computed on
 // the original tensor.
 TEST_F(LazyOpsTest, TestHardSigmoid) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::randn({10}, options);
   const torch::Tensor expected = torch::hardsigmoid(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::hardsigmoid(lazy_in);
     AllClose(expected, actual);
   });
 }

 // In-place hardsigmoid_: a fresh random input is drawn for every device,
 // copied to the device, and then both copies are updated and compared.
 TEST_F(LazyOpsTest, TestHardSigmoidInPlace) {
   ForEachDevice([&](const torch::Device& device) {
     const auto options =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor eager_in = torch::randn({10}, options);
     torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor expected = torch::hardsigmoid_(eager_in);
     const torch::Tensor actual = torch::hardsigmoid_(lazy_in);
     AllClose(eager_in, lazy_in);
     AllClose(expected, actual);
   });
 }

 // Hands a hardsigmoid forward function and a random input that requires
 // grad to the TestBackward helper, once per device.
 TEST_F(LazyOpsTest, TestHardSigmoidBackward) {
   auto forward =
       [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
     return torch::hardsigmoid(inputs[0]);
   };
   ForEachDevice([&](const torch::Device& device) {
     const auto grad_options = torch::TensorOptions(torch::kFloat)
                                   .device(DefaultDevice())
                                   .requires_grad(true);
     TestBackward({torch::randn({10}, grad_options)}, device, forward);
   });
 }

 // torch::softshrink on the device copy must match the result computed on
 // the original tensor.
 TEST_F(LazyOpsTest, TestSoftshrink) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::randn({10}, options);
   const torch::Tensor expected = torch::softshrink(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::softshrink(lazy_in);
     AllClose(expected, actual);
   });
 }

 // torch::hardtanh on the device copy must match the result computed on the
 // original tensor.
 TEST_F(LazyOpsTest, TestHardtanh) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::randn({10}, options);
   const torch::Tensor expected = torch::hardtanh(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::hardtanh(lazy_in);
     AllClose(expected, actual);
   });
 }

 // In-place hardtanh_: the device copy is taken before the original is
 // modified, then both the returned tensors and the inputs are compared.
 TEST_F(LazyOpsTest, TestHardtanhInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager_in = torch::randn({10}, options);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor expected = torch::hardtanh_(eager_in);
     const torch::Tensor actual = torch::hardtanh_(lazy_in);
     AllClose(expected, actual);
     AllClose(eager_in, lazy_in);
   });
 }

 // torch::leaky_relu with an explicit negative slope: the device result
 // must match the result computed on the original tensor.
 TEST_F(LazyOpsTest, TestLeakyRelu) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 1, 4, 6}, options);
   const double slope = 0.01;
   const torch::Tensor expected = torch::leaky_relu(eager_in, slope);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::leaky_relu(lazy_in, slope);
     AllClose(expected, actual);
   });
 }

 // In-place leaky_relu_: the device copy is taken before the original is
 // modified, then both the returned tensors and the inputs are compared.
 TEST_F(LazyOpsTest, TestLeakyReluInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager_in = torch::rand({2, 1, 4, 6}, options);
   const double slope = 0.01;
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor expected = torch::leaky_relu_(eager_in, slope);
     const torch::Tensor actual = torch::leaky_relu_(lazy_in, slope);
     AllClose(expected, actual);
     AllClose(eager_in, lazy_in);
   });
 }

 // torch::exp on the device copy must match the original-tensor result
 // within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestExp) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 2}, options);
   const torch::Tensor expected = torch::exp(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::exp(lazy_in);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::expm1 on the device copy must match the original-tensor result
 // within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestExpm1) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 2}, options);
   const torch::Tensor expected = torch::expm1(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::expm1(lazy_in);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::log on the device copy must match the original-tensor result
 // within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestLog) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 2}, options);
   const torch::Tensor expected = torch::log(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::log(lazy_in);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::log2 on the device copy must match the original-tensor result
 // within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestLog2) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 2}, options);
   const torch::Tensor expected = torch::log2(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::log2(lazy_in);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::log10 on the device copy must match the original-tensor result
 // within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestLog10) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 2}, options);
   const torch::Tensor expected = torch::log10(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::log10(lazy_in);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::log1p on the device copy must match the original-tensor result
 // within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestLog1p) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 2}, options);
   const torch::Tensor expected = torch::log1p(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::log1p(lazy_in);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::erf on the device copy must match the original-tensor result
 // within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestErf) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::randn({2, 2}, options);
   const torch::Tensor expected = torch::erf(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::erf(lazy_in);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::erfc on the device copy must match the original-tensor result
 // within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestErfc) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::randn({2, 2}, options);
   const torch::Tensor expected = torch::erfc(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::erfc(lazy_in);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::erfinv on the device copy must match the original-tensor result
 // within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestErfinv) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 2}, options);
   const torch::Tensor expected = torch::erfinv(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::erfinv(lazy_in);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::sqrt on the device copy of a non-negative input must match the
 // original-tensor result within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestSqrt) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::abs(torch::rand({2, 2}, options));
   const torch::Tensor expected = torch::sqrt(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::sqrt(lazy_in);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::rsqrt on the device copy of a non-negative input must match the
 // original-tensor result within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestRsqrt) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::abs(torch::rand({2, 2}, options));
   const torch::Tensor expected = torch::rsqrt(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::rsqrt(lazy_in);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::reciprocal on the device copy must match the original-tensor
 // result within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestReciprocal) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::randn({2, 2}, options);
   const torch::Tensor expected = torch::reciprocal(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::reciprocal(lazy_in);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::pow(tensor, scalar) with a fractional exponent: the device result
 // must match the original-tensor result within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestPowTensorScalar) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor base = torch::rand({2, 2}, options);
   const torch::Scalar exponent = 4.09;
   const torch::Tensor expected = torch::pow(base, exponent);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_base = CopyToDevice(base, device);
     const torch::Tensor actual = torch::pow(lazy_base, exponent);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // In-place pow_ with a scalar exponent: a clone of base is copied to the
 // device before base is modified, then results and inputs are compared.
 TEST_F(LazyOpsTest, TestPowTensorScalarInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor base = torch::rand({2, 2}, options);
   const torch::Scalar exponent = 4.09;
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_base = CopyToDevice(base.clone(), device);
     const torch::Tensor expected = base.pow_(exponent);
     const torch::Tensor actual = lazy_base.pow_(exponent);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
     AllClose(base, lazy_base, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::pow(tensor, tensor) with same-shaped operands: the device result
 // must match the original-tensor result within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestPowTensorTensor) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor base = torch::abs(torch::rand({4, 2}, options));
   const torch::Tensor exponent = torch::rand({4, 2}, options);
   const torch::Tensor expected = torch::pow(base, exponent);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_base = CopyToDevice(base, device);
     const torch::Tensor lazy_exponent = CopyToDevice(exponent, device);
     const torch::Tensor actual = torch::pow(lazy_base, lazy_exponent);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // In-place pow_ with a tensor exponent: a clone of base is copied to the
 // device before base is modified, then results and inputs are compared.
 TEST_F(LazyOpsTest, TestPowTensorTensorInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor base = torch::abs(torch::rand({4, 2}, options));
   const torch::Tensor exponent = torch::rand({4, 2}, options);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_base = CopyToDevice(base.clone(), device);
     const torch::Tensor expected = base.pow_(exponent);
     const torch::Tensor lazy_exponent = CopyToDevice(exponent, device);
     const torch::Tensor actual = lazy_base.pow_(lazy_exponent);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
     AllClose(base, lazy_base, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::pow(tensor, tensor) where the {4, 1} exponent is broadcast against
 // the {4, 2} base; the device result must match the original-tensor result.
 TEST_F(LazyOpsTest, TestPowTensorTensorBroadcast) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor base = torch::abs(torch::rand({4, 2}, options));
   const torch::Tensor exponent = torch::rand({4, 1}, options);
   const torch::Tensor expected = torch::pow(base, exponent);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_base = CopyToDevice(base, device);
     const torch::Tensor lazy_exponent = CopyToDevice(exponent, device);
     const torch::Tensor actual = torch::pow(lazy_base, lazy_exponent);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::pow(scalar, tensor): the result computed with the device copy of
 // the exponent must match the result computed with the original exponent
 // within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestPowScalarTensor) {
   torch::Scalar base = 3.5;
   // Create the exponent with explicit float options on DefaultDevice(), as
   // the other pow tests in this file do, instead of relying on the global
   // default dtype and device.
   torch::Tensor exponent = torch::rand(
       {4, 2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor result = torch::pow(base, exponent);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_exponent = CopyToDevice(exponent, device);
     torch::Tensor lazy_result = torch::pow(base, lazy_exponent);
     AllClose(result, lazy_result, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::pow(tensor, scalar) with an integer exponent: the device result
 // must match the original-tensor result within rtol=1e-3 / atol=1e-5.
 TEST_F(LazyOpsTest, TestPowIntExponent) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor base = torch::abs(torch::rand({4, 2}, options));
   const torch::Scalar exponent = 3;
   const torch::Tensor expected = torch::pow(base, exponent);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_base = CopyToDevice(base, device);
     const torch::Tensor actual = torch::pow(lazy_base, exponent);
     AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // torch::fmod(tensor, scalar): the device result must match the result
 // computed on the original tensor.
 TEST_F(LazyOpsTest, TestFmodScalar) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor dividend = torch::rand({2, 2}, options) * 100.0;
   const torch::Scalar divisor = 2.0;
   const torch::Tensor expected = torch::fmod(dividend, divisor);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
     const torch::Tensor actual = torch::fmod(lazy_dividend, divisor);
     AllClose(expected, actual);
   });
 }

 // In-place fmod_ with a scalar divisor: a fresh dividend is drawn for every
 // device and copied there before both copies are updated and compared.
 TEST_F(LazyOpsTest, TestFmodScalarInPlace) {
   const torch::Scalar divisor = 2.0;
   ForEachDevice([&](const torch::Device& device) {
     const auto options =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor dividend = torch::rand({2, 2}, options) * 100.0;
     torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
     const torch::Tensor expected = dividend.fmod_(divisor);
     const torch::Tensor actual = lazy_dividend.fmod_(divisor);
     AllClose(expected, actual);
     AllClose(dividend, lazy_dividend);
   });
 }

 // torch::fmod(tensor, tensor): the device result must match the result
 // computed on the original tensors.
 TEST_F(LazyOpsTest, TestFmodTensor) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor dividend = torch::rand({2, 2}, options) * 100.0;
   const torch::Tensor divisor = torch::rand({2, 2}, options) * 10.0;
   const torch::Tensor expected = torch::fmod(dividend, divisor);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
     const torch::Tensor lazy_divisor = CopyToDevice(divisor, device);
     const torch::Tensor actual = torch::fmod(lazy_dividend, lazy_divisor);
     AllClose(expected, actual);
   });
 }

 // In-place fmod_ with a tensor divisor: the divisor is shared, while a
 // fresh dividend is drawn for every device and copied there before both
 // copies are updated and compared.
 TEST_F(LazyOpsTest, TestFmodTensorInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor divisor = torch::rand({2, 2}, options) * 10.0;
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor dividend =
         torch::rand(
             {2, 2},
             torch::TensorOptions(torch::kFloat).device(DefaultDevice())) *
         100.0;
     torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
     const torch::Tensor expected = dividend.fmod_(divisor);
     const torch::Tensor lazy_divisor = CopyToDevice(divisor, device);
     const torch::Tensor actual = lazy_dividend.fmod_(lazy_divisor);
     AllClose(expected, actual);
     AllClose(dividend, lazy_dividend);
   });
 }

 // torch::remainder(tensor, scalar) with a negative divisor: the device
 // result must match the result computed on the original tensor.
 TEST_F(LazyOpsTest, TestRemainderScalar) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor dividend = torch::randn({2, 2}, options) * 100.0;
   const torch::Scalar divisor = -2.0;
   const torch::Tensor expected = torch::remainder(dividend, divisor);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
     const torch::Tensor actual = torch::remainder(lazy_dividend, divisor);
     AllClose(expected, actual);
   });
 }

 // In-place remainder_ with a negative scalar divisor: a fresh dividend is
 // drawn for every device and copied there before both copies are updated
 // and compared.
 TEST_F(LazyOpsTest, TestRemainderScalarInPlace) {
   const torch::Scalar divisor = -2.0;
   ForEachDevice([&](const torch::Device& device) {
     const auto options =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor dividend = torch::randn({2, 2}, options) * 100.0;
     torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
     const torch::Tensor expected = dividend.remainder_(divisor);
     const torch::Tensor actual = lazy_dividend.remainder_(divisor);
     AllClose(expected, actual);
     AllClose(dividend, lazy_dividend);
   });
 }

 // torch::remainder(tensor, tensor): the device result must match the
 // original-tensor result within rtol=1e-4 / atol=1e-6.
 TEST_F(LazyOpsTest, TestRemainderTensor) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor dividend = torch::randn({2, 2}, options) * 100.0;
   const torch::Tensor divisor = torch::randn({2, 2}, options) * 10.0;
   const torch::Tensor expected = torch::remainder(dividend, divisor);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
     const torch::Tensor lazy_divisor = CopyToDevice(divisor, device);
     const torch::Tensor actual =
         torch::remainder(lazy_dividend, lazy_divisor);
     AllClose(expected, actual, /*rtol=*/1e-4, /*atol=*/1e-6);
   });
 }

 // In-place remainder_ with a tensor divisor: the divisor is shared, while
 // a fresh dividend is drawn for every device and copied there before both
 // copies are updated and compared (rtol=1e-4 / atol=1e-6).
 TEST_F(LazyOpsTest, TestRemainderTensorInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor divisor = torch::randn({2, 2}, options) * 10.0;
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor dividend =
         torch::randn(
             {2, 2},
             torch::TensorOptions(torch::kFloat).device(DefaultDevice())) *
         100.0;
     torch::Tensor lazy_dividend = CopyToDevice(dividend, device);
     const torch::Tensor expected = dividend.remainder_(divisor);
     const torch::Tensor lazy_divisor = CopyToDevice(divisor, device);
     const torch::Tensor actual = lazy_dividend.remainder_(lazy_divisor);
     AllClose(expected, actual, /*rtol=*/1e-4, /*atol=*/1e-6);
     AllClose(dividend, lazy_dividend, /*rtol=*/1e-4, /*atol=*/1e-6);
   });
 }

 // torch::where with a byte condition that is set on the diagonal of a 3x3
 // grid: the device result must match the original-tensor result.
 TEST_F(LazyOpsTest, TestWhere) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor on_true = torch::rand({3, 3}, float_options);
   const torch::Tensor on_false = torch::rand({3, 3}, float_options);
   torch::Tensor condition = torch::empty(
       {3, 3}, torch::TensorOptions(torch::kByte).device(DefaultDevice()));
   // Every element is written, so the uninitialized storage is never read.
   for (int row = 0; row < 3; ++row) {
     for (int col = 0; col < 3; ++col) {
       condition[row][col] = (row == col);
     }
   }
   const torch::Tensor expected = torch::where(condition, on_true, on_false);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_on_true = CopyToDevice(on_true, device);
     const torch::Tensor lazy_on_false = CopyToDevice(on_false, device);
     const torch::Tensor lazy_condition = CopyToDevice(condition, device);
     const torch::Tensor actual =
         torch::where(lazy_condition, lazy_on_true, lazy_on_false);
     AllClose(expected, actual);
   });
 }

 // torch::where where the "false" operand is a rank-0 zero tensor that is
 // broadcast against the 3x3 condition and "true" operand.
 TEST_F(LazyOpsTest, TestWhereBroadcast) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor on_true = torch::rand({3, 3}, float_options);
   const torch::Tensor on_false = torch::zeros({}, float_options);
   torch::Tensor condition = torch::empty(
       {3, 3}, torch::TensorOptions(torch::kByte).device(DefaultDevice()));
   // Every element is written, so the uninitialized storage is never read.
   for (int row = 0; row < 3; ++row) {
     for (int col = 0; col < 3; ++col) {
       condition[row][col] = (row == col);
     }
   }
   const torch::Tensor expected = torch::where(condition, on_true, on_false);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_on_true = CopyToDevice(on_true, device);
     const torch::Tensor lazy_on_false = CopyToDevice(on_false, device);
     const torch::Tensor lazy_condition = CopyToDevice(condition, device);
     const torch::Tensor actual =
         torch::where(lazy_condition, lazy_on_true, lazy_on_false);
     AllClose(expected, actual);
   });
 }

 // torch::threshold with threshold 0.4 and replacement value 20: the device
 // result must match the result computed on the original tensor.
 TEST_F(LazyOpsTest, TestThreshold) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 1, 4, 6}, options);
   const float threshold = 0.4;
   const float value = 20;
   const torch::Tensor expected =
       torch::threshold(eager_in, threshold, value);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::threshold(lazy_in, threshold, value);
     AllClose(expected, actual);
   });
 }

 // Hands a threshold forward function (threshold 0.4, value 20) and a
 // random input that requires grad to the TestBackward helper, once per
 // device.
 TEST_F(LazyOpsTest, TestThresholdBackward) {
   const float threshold = 0.4;
   const float value = 20;

   auto forward =
       [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
     return torch::threshold(inputs[0], threshold, value);
   };

   ForEachDevice([&](const torch::Device& device) {
     const auto grad_options = torch::TensorOptions(torch::kFloat)
                                   .device(DefaultDevice())
                                   .requires_grad(true);
     TestBackward({torch::rand({2, 1, 4, 6}, grad_options)}, device, forward);
   });
 }

 // In-place threshold_: the reference is computed on a clone so the
 // original input stays untouched and can be copied to each device.
 TEST_F(LazyOpsTest, TestThresholdInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor pristine = torch::rand({2, 1, 4, 6}, options);
   torch::Tensor expected = pristine.clone();
   const float threshold = 0.4;
   const float value = 20;
   torch::threshold_(expected, threshold, value);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_result = CopyToDevice(pristine, device);
     torch::threshold_(lazy_result, threshold, value);
     AllClose(expected, lazy_result);
   });
 }

 // torch::elu with explicit alpha, scale and input_scale: the device result
 // must match the result computed on the original tensor.
 TEST_F(LazyOpsTest, TestElu) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 1, 4, 6}, options);
   const torch::Scalar alpha = 0.5;
   const torch::Scalar scale = 2.5;
   const torch::Scalar input_scale = 1.5;
   const torch::Tensor expected =
       torch::elu(eager_in, alpha, scale, input_scale);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual =
         torch::elu(lazy_in, alpha, scale, input_scale);
     AllClose(expected, actual);
   });
 }

 // In-place elu_: the device copy is taken before the original is modified,
 // then both the returned tensors and the inputs are compared.
 TEST_F(LazyOpsTest, TestEluInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager_in = torch::rand({2, 1, 4, 6}, options);
   const torch::Scalar alpha = 0.5;
   const torch::Scalar scale = 2.5;
   const torch::Scalar input_scale = 1.5;
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor expected =
         torch::elu_(eager_in, alpha, scale, input_scale);
     const torch::Tensor actual =
         torch::elu_(lazy_in, alpha, scale, input_scale);
     AllClose(expected, actual);
     AllClose(eager_in, lazy_in);
   });
 }

 // torch::selu on the device copy must match the result computed on the
 // original tensor.
 TEST_F(LazyOpsTest, TestSelu) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 1, 4, 6}, options);
   const torch::Tensor expected = torch::selu(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::selu(lazy_in);
     AllClose(expected, actual);
   });
 }

 // In-place selu_: the device copy is taken before the original is modified,
 // then both the returned tensors and the inputs are compared.
 TEST_F(LazyOpsTest, TestSeluInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager_in = torch::rand({2, 1, 4, 6}, options);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor expected = torch::selu_(eager_in);
     const torch::Tensor actual = torch::selu_(lazy_in);
     AllClose(expected, actual);
     AllClose(eager_in, lazy_in);
   });
 }

 // torch::celu with alpha 2.5: the device result must match the result
 // computed on the original tensor.
 TEST_F(LazyOpsTest, TestCelu) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 1, 4, 6}, options);
   const torch::Scalar alpha = 2.5;
   const torch::Tensor expected = torch::celu(eager_in, alpha);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::celu(lazy_in, alpha);
     AllClose(expected, actual);
   });
 }

 // In-place celu_: the device copy is taken before the original is modified,
 // then both the returned tensors and the inputs are compared.
 TEST_F(LazyOpsTest, TestCeluInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager_in = torch::rand({2, 1, 4, 6}, options);
   const torch::Scalar alpha = 2.5;
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor expected = torch::celu_(eager_in, alpha);
     const torch::Tensor actual = torch::celu_(lazy_in, alpha);
     AllClose(expected, actual);
     AllClose(eager_in, lazy_in);
   });
 }

 // torch::gelu on the device copy must match the result computed on the
 // original tensor.
 TEST_F(LazyOpsTest, TestGelu) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 3}, options);
   const torch::Tensor expected = torch::gelu(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::gelu(lazy_in);
     AllClose(expected, actual);
   });
 }

 // torch::addmm(bias, input, weight, beta) for beta = 1 and beta = 2: the
 // device result must match the result computed on the original tensors.
 TEST_F(LazyOpsTest, TestAddMatMul) {
   const int in_channels = 32;
   const int out_channels = 320;
   const int labels = 50;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor input =
       torch::rand({in_channels, out_channels}, options);
   const torch::Tensor weight = torch::rand({out_channels, labels}, options);
   const torch::Tensor bias = torch::rand({labels}, options);
   // Test beta != 1. through the CPU interop.
   for (double beta : {1., 2.}) {
     const torch::Tensor expected =
         torch::addmm(bias, input, weight, /*beta=*/beta);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_weight = CopyToDevice(weight, device);
       const torch::Tensor lazy_bias = CopyToDevice(bias, device);
       const torch::Tensor actual =
           torch::addmm(lazy_bias, lazy_input, lazy_weight, /*beta=*/beta);
       AllClose(expected, actual);
     });
   }
 }

 // torch::embedding lookup into a {32, 4} table with a {3, 4} index tensor:
 // the device result must match the result computed on the original tensors.
 // NOTE(review): randint's upper bound is exclusive, so with high = 31 the
 // last row (31) of the 32-row table is never looked up -- confirm whether
 // that is intentional.
 TEST_F(LazyOpsTest, TestEmbedding) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor table = torch::rand({32, 4}, float_options);
   const auto long_options =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   const torch::Tensor indices = torch::randint(0, 31, {3, 4}, long_options);
   const torch::Tensor expected = torch::embedding(
       table,
       indices,
       /*padding_idx=*/0,
       /*scale_grad_by_freq=*/false,
       /*sparse=*/false);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_table = CopyToDevice(table, device);
     const torch::Tensor lazy_indices = CopyToDevice(indices, device);
     const torch::Tensor actual = torch::embedding(
         lazy_table,
         lazy_indices,
         /*padding_idx=*/0,
         /*scale_grad_by_freq=*/false,
         /*sparse=*/false);
     AllClose(expected, actual);
   });
 }

 // torch::one_hot over ten random class ids in [0, num_classes): the device
 // result must be exactly equal to the original-tensor result.
 TEST_F(LazyOpsTest, TestOneHot) {
   const int num_classes = 5;
   const auto long_options =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   const torch::Tensor class_ids =
       torch::randint(0, num_classes, {10}, long_options);
   const torch::Tensor expected = torch::one_hot(class_ids, num_classes);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_ids = CopyToDevice(class_ids, device);
     const torch::Tensor actual = torch::one_hot(lazy_ids, num_classes);
     AllEqual(expected, actual);
   });
 }

 // torch::t on the device copy must match the result computed on the
 // original tensor.
 TEST_F(LazyOpsTest, TestTranspose) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({2, 3}, options);
   const torch::Tensor expected = torch::t(eager_in);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::t(lazy_in);
     AllClose(expected, actual);
   });
 }

 // In-place t_: the device copy is taken before the original is transposed,
 // then shapes, returned tensors and the inputs themselves are compared.
 TEST_F(LazyOpsTest, TestTransposeInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager_in = torch::rand({2, 3}, options);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor expected = eager_in.t_();
     const torch::Tensor actual = lazy_in.t_();
     EXPECT_EQ(actual.sizes(), expected.sizes());
     AllClose(expected, actual);
     AllClose(eager_in, lazy_in);
   });
 }

 // torch::reshape to {-1, 320}: the device result must match the result
 // computed on the original tensor.
 TEST_F(LazyOpsTest, TestReshape) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const torch::Tensor eager_in = torch::rand({32, 20, 4, 4}, options);
   const torch::Tensor expected = torch::reshape(eager_in, {-1, 320});
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_in = CopyToDevice(eager_in, device);
     const torch::Tensor actual = torch::reshape(lazy_in, {-1, 320});
     AllClose(expected, actual);
   });
 }

 // Checks resize_() from {2, 2, 4} down to {3, 3}.
 TEST_F(LazyOpsTest, TestResize) {
   // Only a shrinking resize_() is tested. Growing the tensor cannot be
   // compared, as we fill the new elements with zeros while pytorch fills them
   // with random garbage.
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2, 4}, float_options);
   // Keep an untouched copy: `input` itself is resized in place just below.
   torch::Tensor pristine_input = input.clone();
   input.resize_({3, 3});
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(pristine_input, device);
     lazy_input.resize_({3, 3});
     AllClose(input, lazy_input);
   });
 }

 // Resizes a {4, 4} view of an {8, 2} zero tensor to {3, 3} and checks both the
 // base tensor and the resized view against their device counterparts.
 TEST_F(LazyOpsTest, TestViewResize) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const std::vector<int64_t> view_shape = {4, 4};
   torch::Tensor input = torch::zeros({8, 2}, float_options);
   // Copy taken before the view is resized; used to seed the device side.
   torch::Tensor pristine_input = input.clone();
   torch::Tensor resized_view = input.view(view_shape);
   resized_view.resize_({3, 3});
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(pristine_input, device);
     torch::Tensor lazy_resized_view = lazy_input.view(view_shape);
     lazy_resized_view.resize_({3, 3});
     AllClose(input, lazy_input);
     AllClose(resized_view, lazy_resized_view);
   });
 }

 // Checks Tensor::view of a {32, 20, 4, 4} float tensor as {-1, 320}.
 TEST_F(LazyOpsTest, TestView) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const std::vector<int64_t> view_shape = {-1, 320};
   torch::Tensor input = torch::rand({32, 20, 4, 4}, float_options);
   torch::Tensor expected = input.view(view_shape);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_view = lazy_input.view(view_shape);
     AllClose(expected, lazy_view);
   });
 }

 // Applies an in-place add first through a {-1, 320} view and then through the
 // base tensor, and checks that view and base both match on each device.
 TEST_F(LazyOpsTest, TestViewMod) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const std::vector<int64_t> base_shape = {32, 20, 4, 4};
   const std::vector<int64_t> view_shape = {-1, 320};
   torch::Tensor input = torch::zeros(base_shape, float_options);
   torch::Tensor one = torch::tensor(1.0, float_options);
   torch::Tensor output = input.view(view_shape);
   output.add_(one, 1.0);
   input.add_(one, 1.0);
   ForEachDevice([&](const torch::Device& device) {
     // `input` has already been modified above, so the device side starts from
     // a freshly created zero tensor of the same shape.
     torch::Tensor fresh_zeros = torch::zeros(base_shape, float_options);
     torch::Tensor lazy_input = CopyToDevice(fresh_zeros, device);
     torch::Tensor lazy_one = CopyToDevice(one, device);
     torch::Tensor lazy_output = lazy_input.view(view_shape);
     lazy_output.add_(lazy_one, 1.0);
     lazy_input.add_(lazy_one, 1.0);
     AllClose(output, lazy_output);
     AllClose(input, lazy_input);
   });
 }

 // Takes two differently shaped views ({-1, 320} and {-1, 160}) of the same
 // base tensor, adds to each in place, and checks both views on each device.
 TEST_F(LazyOpsTest, TestViewModComplex) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const std::vector<int64_t> base_shape = {32, 20, 4, 4};
   const std::vector<int64_t> wide_shape = {-1, 320};
   const std::vector<int64_t> narrow_shape = {-1, 160};
   torch::Tensor input = torch::zeros(base_shape, float_options);
   torch::Tensor one = torch::tensor(1.0, float_options);
   torch::Tensor output1 = input.view(wide_shape);
   output1.add_(one, 1.0);
   torch::Tensor output2 = input.view(narrow_shape);
   output2.add_(one, 1.0);
   ForEachDevice([&](const torch::Device& device) {
     // The device side is seeded from a new zero tensor rather than from the
     // already-updated `input`.
     torch::Tensor fresh_zeros = torch::zeros(base_shape, float_options);
     torch::Tensor lazy_input = CopyToDevice(fresh_zeros, device);
     torch::Tensor lazy_one = CopyToDevice(one, device);
     torch::Tensor lazy_output1 = lazy_input.view(wide_shape);
     lazy_output1.add_(lazy_one, 1.0);
     torch::Tensor lazy_output2 = lazy_input.view(narrow_shape);
     lazy_output2.add_(lazy_one, 1.0);
     AllClose(output1, lazy_output1);
     AllClose(output2, lazy_output2);
   });
 }

 // Like TestViewModComplex, but the second view ({-1, 160}) is taken from the
 // first view ({-1, 320}) instead of from the base tensor.
 TEST_F(LazyOpsTest, TestViewOfViewMod) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const std::vector<int64_t> base_shape = {32, 20, 4, 4};
   const std::vector<int64_t> outer_shape = {-1, 320};
   const std::vector<int64_t> inner_shape = {-1, 160};
   torch::Tensor input = torch::zeros(base_shape, float_options);
   torch::Tensor one = torch::tensor(1.0, float_options);
   torch::Tensor output1 = input.view(outer_shape);
   output1.add_(one, 1.0);
   torch::Tensor output2 = output1.view(inner_shape);
   output2.add_(one, 1.0);
   ForEachDevice([&](const torch::Device& device) {
     // Seed the device side from a new zero tensor; `input` was modified above.
     torch::Tensor fresh_zeros = torch::zeros(base_shape, float_options);
     torch::Tensor lazy_input = CopyToDevice(fresh_zeros, device);
     torch::Tensor lazy_one = CopyToDevice(one, device);
     torch::Tensor lazy_output1 = lazy_input.view(outer_shape);
     lazy_output1.add_(lazy_one, 1.0);
     torch::Tensor lazy_output2 = lazy_output1.view(inner_shape);
     lazy_output2.add_(lazy_one, 1.0);
     AllClose(output1, lazy_output1);
     AllClose(output2, lazy_output2);
   });
 }

 // Chains view -> squeeze_ -> add_ on a {2, 3, 1} zero tensor and checks both
 // the resulting tensor and the base tensor against their device counterparts.
 TEST_F(LazyOpsTest, TestViewSqueezeAddInPlace) {
   torch::Tensor input = torch::zeros(
       {2, 3, 1}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   std::vector<int64_t> view_size = {2, 3, 1, 1};
   int squeeze_dim = 2;
   torch::Tensor one = torch::tensor(
       1.0, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   ForEachDevice([&](const torch::Device& device) {
     // The device copy is taken before the reference chain below runs on
     // `input`.
     torch::Tensor lazy_input = CopyToDevice(input, device);
     // Reference chain.
     torch::Tensor output = input.view(view_size);
     output.squeeze_(squeeze_dim);
     output.add_(one, 1.0);
     // Same chain on the device copy.
     torch::Tensor lazy_one = CopyToDevice(one, device);
     torch::Tensor lazy_output = lazy_input.view(view_size);
     lazy_output.squeeze_(squeeze_dim);
     lazy_output.add_(lazy_one, 1.0);
     AllClose(output, lazy_output);
     AllClose(input, lazy_input);
   });
 }

 // Checks torch::_unsafe_view of a {32, 20, 4, 4} float tensor as {-1, 320}.
 TEST_F(LazyOpsTest, TestUnsafeView) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const std::vector<int64_t> view_shape = {-1, 320};
   torch::Tensor input = torch::rand({32, 20, 4, 4}, float_options);
   torch::Tensor expected = torch::_unsafe_view(input, view_shape);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_view = torch::_unsafe_view(lazy_input, view_shape);
     AllClose(expected, lazy_view);
   });
 }

 // Checks Tensor::narrow with length 6 for positive and negative values of both
 // the dimension and the start index.
 TEST_F(LazyOpsTest, TestNarrow) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const int64_t length = 6;
   torch::Tensor source = torch::rand({8, 10, 4, 4}, float_options);
   for (int64_t dim : {1, -3}) {
     for (int64_t start : {2, -8}) {
       torch::Tensor expected = source.narrow(dim, start, length);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_source = CopyToDevice(source, device);
         torch::Tensor lazy_slice = lazy_source.narrow(dim, start, length);
         AllClose(expected, lazy_slice);
       });
     }
   }
 }

 // Adds in place into a narrowed slice (length 4) and checks that the updated
 // slice matches on each device.
 TEST_F(LazyOpsTest, TestNarrowUpdate) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const int64_t length = 4;
   for (int64_t dim : {1, -2}) {
     for (int64_t start : {2, -6}) {
       torch::Tensor base = torch::rand({3, 8, 3}, float_options);
       // Unmodified copy used to seed the device side.
       torch::Tensor base_snapshot = base.clone();
       torch::Tensor addend = torch::rand({3, 4, 3}, float_options);
       torch::Tensor slice = base.narrow(dim, start, length);
       slice.add_(addend, 1.0);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_base = CopyToDevice(base_snapshot, device);
         torch::Tensor lazy_addend = CopyToDevice(addend, device);
         torch::Tensor lazy_slice = lazy_base.narrow(dim, start, length);
         lazy_slice.add_(lazy_addend, 1.0);
         AllClose(slice, lazy_slice);
       });
     }
   }
 }

 // Adds in place into a narrowed slice and checks the *base* tensor (not the
 // slice) against its device counterpart.
 TEST_F(LazyOpsTest, TestNarrowUpdateBaseCheck) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const int64_t length = 4;
   for (int64_t dim : {0, -2}) {
     for (int64_t start : {2, -6}) {
       torch::Tensor base = torch::zeros({8, 3}, float_options);
       // Unmodified copy used to seed the device side.
       torch::Tensor base_snapshot = base.clone();
       torch::Tensor addend = torch::ones({4, 3}, float_options);
       torch::Tensor slice = base.narrow(dim, start, length);
       slice.add_(addend, 1.0);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_base = CopyToDevice(base_snapshot, device);
         torch::Tensor lazy_addend = CopyToDevice(addend, device);
         torch::Tensor lazy_slice = lazy_base.narrow(dim, start, length);
         lazy_slice.add_(lazy_addend, 1.0);
         AllClose(base, lazy_base);
       });
     }
   }
 }

 // Updates two length-2 slices of the same base tensor in place and checks
 // both slices as well as the base tensor on each device.
 TEST_F(LazyOpsTest, TestNarrowUpdateTwoSlices) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const int64_t length = 2;
   for (int64_t dim : {0, -2}) {
     for (int64_t start0 : {2, -6}) {
       for (int64_t start1 : {6, -2}) {
         torch::Tensor base = torch::zeros({8, 3}, float_options);
         // Unmodified copy used to seed the device side.
         torch::Tensor base_snapshot = base.clone();
         torch::Tensor ones = torch::ones({2, 3}, float_options);
         torch::Tensor twos = ones + 1;
         torch::Tensor first_slice = base.narrow(dim, start0, length);
         torch::Tensor second_slice = base.narrow(dim, start1, length);
         first_slice.add_(ones, 1.0);
         second_slice.add_(twos, 1.0);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_base = CopyToDevice(base_snapshot, device);
           torch::Tensor lazy_ones = CopyToDevice(ones, device);
           torch::Tensor lazy_twos = CopyToDevice(twos, device);
           torch::Tensor lazy_first = lazy_base.narrow(dim, start0, length);
           torch::Tensor lazy_second = lazy_base.narrow(dim, start1, length);
           lazy_first.add_(lazy_ones, 1.0);
           lazy_second.add_(lazy_twos, 1.0);
           AllClose(first_slice, lazy_first);
           AllClose(second_slice, lazy_second);
           AllClose(base, lazy_base);
         });
       }
     }
   }
 }

 // Narrows a tensor, views the slice as {4, 6}, adds to that view in place, and
 // checks the updated view on each device.
 TEST_F(LazyOpsTest, TestNarrowUpdateView) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const std::vector<int64_t> flat_shape = {4, 6};
   const int64_t length = 4;
   for (int64_t dim : {0, -3}) {
     for (int64_t start : {2, -6}) {
       torch::Tensor base = torch::rand({8, 2, 3}, float_options);
       // Unmodified copy used to seed the device side.
       torch::Tensor base_snapshot = base.clone();
       torch::Tensor addend = torch::rand({4, 6}, float_options);
       torch::Tensor slice = base.narrow(dim, start, length);
       torch::Tensor flat_slice = slice.view(flat_shape);
       flat_slice.add_(addend, 1.0);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_base = CopyToDevice(base_snapshot, device);
         torch::Tensor lazy_addend = CopyToDevice(addend, device);
         torch::Tensor lazy_slice = lazy_base.narrow(dim, start, length);
         torch::Tensor lazy_flat_slice = lazy_slice.view(flat_shape);
         lazy_flat_slice.add_(lazy_addend, 1.0);
         AllClose(flat_slice, lazy_flat_slice);
       });
     }
   }
 }

 // Narrows a tensor twice along the same dimension (length 6, then length 2),
 // adds into the inner slice in place, and checks the base tensor.
 TEST_F(LazyOpsTest, TestNarrowInNarrowUpdate) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const int64_t outer_length = 6;
   const int64_t inner_length = 2;
   for (int64_t dim : {1, -2}) {
     for (int64_t start0 : {1, -7}) {
       for (int64_t start1 : {1, -5}) {
         torch::Tensor base = torch::rand({3, 8, 3}, float_options);
         // Unmodified copy used to seed the device side.
         torch::Tensor base_snapshot = base.clone();
         torch::Tensor addend = torch::rand({3, 2, 3}, float_options);
         torch::Tensor outer = base.narrow(dim, start0, outer_length);
         torch::Tensor inner = outer.narrow(dim, start1, inner_length);
         inner.add_(addend, 1.0);
         ForEachDevice([&](const torch::Device& device) {
           torch::Tensor lazy_base = CopyToDevice(base_snapshot, device);
           torch::Tensor lazy_addend = CopyToDevice(addend, device);
           torch::Tensor lazy_outer =
               lazy_base.narrow(dim, start0, outer_length);
           torch::Tensor lazy_inner =
               lazy_outer.narrow(dim, start1, inner_length);
           lazy_inner.add_(lazy_addend, 1.0);
           AllClose(base, lazy_base);
         });
       }
     }
   }
 }

 // Checks Tensor::narrow_copy with length 6. The source tensor is modified in
 // place after each narrow_copy call, before the results are compared.
 TEST_F(LazyOpsTest, TestNarrowCopy) {
   const int64_t length = 6;
   for (int64_t dim : {1, -3}) {
     for (int64_t start : {2, -8}) {
       ForEachDevice([&](const torch::Device& device) {
         const auto float_options =
             torch::TensorOptions(torch::kFloat).device(DefaultDevice());
         torch::Tensor source = torch::rand({8, 10, 4, 4}, float_options);
         torch::Tensor lazy_source = CopyToDevice(source, device);
         // Reference side: copy the slice, then mutate the source.
         torch::Tensor expected = source.narrow_copy(dim, start, length);
         source.add_(1);
         // Device side: same two steps in the same order.
         torch::Tensor lazy_copy = lazy_source.narrow_copy(dim, start, length);
         lazy_source.add_(1);
         AllClose(expected, lazy_copy);
       });
     }
   }
 }

 // Checks Tensor::view_as, using a {32, 320} tensor made by torch::empty as the
 // shape template.
 TEST_F(LazyOpsTest, TestViewAs) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({32, 20, 4, 4}, float_options);
   // Created with torch::empty and no explicit options, as in the original.
   torch::Tensor shape_template = torch::empty({32, 320});
   torch::Tensor expected = input.view_as(shape_template);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_template = CopyToDevice(shape_template, device);
     torch::Tensor lazy_view = lazy_input.view_as(lazy_template);
     AllClose(expected, lazy_view);
   });
 }

 // Checks torch::log_softmax along every dimension index in [-rank, rank).
 TEST_F(LazyOpsTest, TestLogSoftmax) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const double rtol = 1e-3;
   torch::Tensor input = torch::rand({5, 3, 4, 2}, float_options);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     int num_dims = input.dim();
     // Negative indices are included in the sweep.
     for (int dim = -num_dims; dim < num_dims; ++dim) {
       torch::Tensor expected = torch::log_softmax(input, dim);
       torch::Tensor lazy_result = torch::log_softmax(lazy_input, dim);
       AllClose(expected, lazy_result, rtol);
     }
   });
 }

 // Checks torch::log_softmax with an explicit torch::kDouble dtype argument
 // along every dimension index in [-rank, rank).
 TEST_F(LazyOpsTest, TestLogSoftmaxCast) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const double rtol = 1e-3;
   torch::Tensor input = torch::rand({5, 3, 4, 2}, float_options);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     int num_dims = input.dim();
     for (int dim = -num_dims; dim < num_dims; ++dim) {
       torch::Tensor expected =
           torch::log_softmax(input, dim, torch::kDouble);
       torch::Tensor lazy_result =
           torch::log_softmax(lazy_input, dim, torch::kDouble);
       AllClose(expected, lazy_result, rtol);
     }
   });
 }

 // Checks torch::_log_softmax (half_to_float=false) along every dimension index
 // in [-rank, rank).
 TEST_F(LazyOpsTest, TestLogSoftmaxWrapper) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const double rtol = 1e-3;
   torch::Tensor input = torch::rand({10, 2, 6, 4}, float_options);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     int num_dims = input.dim();
     for (int dim = -num_dims; dim < num_dims; ++dim) {
       torch::Tensor expected =
           torch::_log_softmax(input, dim, /*half_to_float=*/false);
       torch::Tensor lazy_result =
           torch::_log_softmax(lazy_input, dim, /*half_to_float=*/false);
       AllClose(expected, lazy_result, rtol);
     }
   });
 }

 // Checks torch::softmax along every dimension index in [-rank, rank).
 TEST_F(LazyOpsTest, TestSoftmax) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const double rtol = 1e-3;
   torch::Tensor input = torch::rand({10, 2, 6, 4}, float_options);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     int num_dims = input.dim();
     // Negative indices are included in the sweep.
     for (int dim = -num_dims; dim < num_dims; ++dim) {
       torch::Tensor expected = torch::softmax(input, dim);
       torch::Tensor lazy_result = torch::softmax(lazy_input, dim);
       AllClose(expected, lazy_result, rtol);
     }
   });
 }

 // Checks torch::softmax with an explicit torch::kDouble dtype argument along
 // every dimension index in [-rank, rank).
 TEST_F(LazyOpsTest, TestSoftmaxCast) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const double rtol = 1e-3;
   torch::Tensor input = torch::rand({10, 2, 6, 4}, float_options);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     int num_dims = input.dim();
     for (int dim = -num_dims; dim < num_dims; ++dim) {
       torch::Tensor expected = torch::softmax(input, dim, torch::kDouble);
       torch::Tensor lazy_result =
           torch::softmax(lazy_input, dim, torch::kDouble);
       AllClose(expected, lazy_result, rtol);
     }
   });
 }

 // Checks torch::_softmax (half_to_float=false) along every dimension index in
 // [-rank, rank).
 TEST_F(LazyOpsTest, TestSoftmaxWrapper) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const double rtol = 1e-3;
   torch::Tensor input = torch::rand({10, 2, 6, 4}, float_options);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     int num_dims = input.dim();
     for (int dim = -num_dims; dim < num_dims; ++dim) {
       torch::Tensor expected =
           torch::_softmax(input, dim, /*half_to_float=*/false);
       torch::Tensor lazy_result =
           torch::_softmax(lazy_input, dim, /*half_to_float=*/false);
       AllClose(expected, lazy_result, rtol);
     }
   });
 }

 // Checks torch::softplus with its default arguments, using rtol=1e-4.
 TEST_F(LazyOpsTest, TestSoftplus) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   const double rtol = 1e-4;
   torch::Tensor input = torch::rand({2, 1, 4, 6}, float_options);
   torch::Tensor expected = torch::softplus(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_result = torch::softplus(lazy_input);
     AllClose(expected, lazy_result, rtol);
   });
 }

 // Sweeps torch::max_pool1d over stride, padding, ceil_mode and dilation and
 // compares the default-device result with each tested device.
 TEST_F(LazyOpsTest, TestMaxPool1D) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({1, 16, 56}, float_options);
   int kernel_size = 3;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         // Test dilation through the CPU interop.
         for (int dilation : {1, 2}) {
           // One pooling call shared by the reference and device sides, so
           // both always receive identical arguments.
           auto pool = [&](const torch::Tensor& tensor) {
             return torch::max_pool1d(
                 tensor,
                 /*kernel_size=*/{kernel_size},
                 /*stride=*/{stride},
                 /*padding=*/{padding},
                 /*dilation=*/{dilation},
                 /*ceil_mode=*/ceil_mode);
           };
           torch::Tensor expected = pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_input = CopyToDevice(input, device);
             torch::Tensor lazy_pooled = pool(lazy_input);
             AllClose(expected, lazy_pooled);
           });
         }
       }
     }
   }
 }

 // Sweeps torch::max_pool2d (square kernel) over stride, padding, ceil_mode and
 // dilation on a {1, 4, 14, 14} input.
 TEST_F(LazyOpsTest, TestMaxPool2D) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({1, 4, 14, 14}, float_options);
   int kernel_size = 3;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         // Test dilation through the CPU interop.
         for (int dilation : {1, 2}) {
           // One pooling call shared by the reference and device sides.
           auto pool = [&](const torch::Tensor& tensor) {
             return torch::max_pool2d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size},
                 /*stride=*/{stride, stride},
                 /*padding=*/{padding, padding},
                 /*dilation=*/{dilation, dilation},
                 /*ceil_mode=*/ceil_mode);
           };
           torch::Tensor expected = pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_input = CopyToDevice(input, device);
             torch::Tensor lazy_pooled = pool(lazy_input);
             AllClose(expected, lazy_pooled);
           });
         }
       }
     }
   }
 }

 // Sweeps torch::max_pool2d_with_indices over stride, padding, ceil_mode and
 // dilation; both tuple elements of the result are compared.
 TEST_F(LazyOpsTest, TestMaxPool2DWithIndices) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({1, 4, 14, 14}, float_options);
   int kernel_size = 3;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         // Test dilation through the CPU interop.
         for (int dilation : {1, 2}) {
           // One pooling call shared by the reference and device sides.
           auto pool = [&](const torch::Tensor& tensor) {
             return torch::max_pool2d_with_indices(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size},
                 /*stride=*/{stride, stride},
                 /*padding=*/{padding, padding},
                 /*dilation=*/{dilation, dilation},
                 /*ceil_mode=*/ceil_mode);
           };
           auto expected = pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_input = CopyToDevice(input, device);
             auto lazy_result = pool(lazy_input);
             // Element 0 and element 1 of the returned tuple, in that order.
             AllClose(std::get<0>(expected), std::get<0>(lazy_result));
             AllClose(std::get<1>(expected), std::get<1>(lazy_result));
           });
         }
       }
     }
   }
 }

 // Sweeps torch::max_pool2d with a non-square kernel, stride and padding (the
 // second entry of each is one larger than the first).
 TEST_F(LazyOpsTest, TestMaxPool2DNonSquare) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({1, 4, 14, 14}, float_options);
   int kernel_size = 4;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         // Test dilation through the CPU interop.
         for (int dilation : {1, 2}) {
           // One pooling call shared by the reference and device sides.
           auto pool = [&](const torch::Tensor& tensor) {
             return torch::max_pool2d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size + 1},
                 /*stride=*/{stride, stride + 1},
                 /*padding=*/{padding, padding + 1},
                 /*dilation=*/{dilation, dilation},
                 /*ceil_mode=*/ceil_mode);
           };
           torch::Tensor expected = pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_input = CopyToDevice(input, device);
             torch::Tensor lazy_pooled = pool(lazy_input);
             AllClose(expected, lazy_pooled);
           });
         }
       }
     }
   }
 }

 // Sweeps torch::max_pool3d (cubic kernel) over stride, padding, ceil_mode and
 // dilation on a {1, 1, 8, 8, 8} input.
 TEST_F(LazyOpsTest, TestMaxPool3D) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({1, 1, 8, 8, 8}, float_options);
   int kernel_size = 3;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         // Test dilation through the CPU interop.
         for (int dilation : {1, 2}) {
           // One pooling call shared by the reference and device sides.
           auto pool = [&](const torch::Tensor& tensor) {
             return torch::max_pool3d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size, kernel_size},
                 /*stride=*/{stride, stride, stride},
                 /*padding=*/{padding, padding, padding},
                 /*dilation=*/{dilation, dilation, dilation},
                 /*ceil_mode=*/ceil_mode);
           };
           torch::Tensor expected = pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_input = CopyToDevice(input, device);
             torch::Tensor lazy_pooled = pool(lazy_input);
             AllClose(expected, lazy_pooled);
           });
         }
       }
     }
   }
 }

 // Sweeps torch::max_pool3d_with_indices over stride, padding, ceil_mode and
 // dilation; both tuple elements of the result are compared.
 TEST_F(LazyOpsTest, TestMaxPool3DWithIndices) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({1, 1, 8, 8, 8}, float_options);
   int kernel_size = 3;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         // Test dilation through the CPU interop.
         for (int dilation : {1, 2}) {
           // One pooling call shared by the reference and device sides.
           auto pool = [&](const torch::Tensor& tensor) {
             return torch::max_pool3d_with_indices(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size, kernel_size},
                 /*stride=*/{stride, stride, stride},
                 /*padding=*/{padding, padding, padding},
                 /*dilation=*/{dilation, dilation, dilation},
                 /*ceil_mode=*/ceil_mode);
           };
           auto expected = pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_input = CopyToDevice(input, device);
             auto lazy_result = pool(lazy_input);
             // Element 0 and element 1 of the returned tuple, in that order.
             AllClose(std::get<0>(expected), std::get<0>(lazy_result));
             AllClose(std::get<1>(expected), std::get<1>(lazy_result));
           });
         }
       }
     }
   }
 }

 // Sweeps torch::max_pool3d with an empty stride list and a single-element
 // padding list. The `stride` loop variable is not passed to the op here.
 TEST_F(LazyOpsTest, TestMaxPool3DIncompleteAttributes) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({1, 1, 8, 8, 8}, float_options);
   int kernel_size = 3;
   for (int stride = 1; stride <= 2; ++stride) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         // Test dilation through the CPU interop.
         for (int dilation : {1, 2}) {
           // One pooling call shared by the reference and device sides.
           auto pool = [&](const torch::Tensor& tensor) {
             return torch::max_pool3d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size, kernel_size},
                 /*stride=*/{},
                 /*padding=*/{padding},
                 /*dilation=*/{dilation, dilation, dilation},
                 /*ceil_mode=*/ceil_mode);
           };
           torch::Tensor expected = pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_input = CopyToDevice(input, device);
             torch::Tensor lazy_pooled = pool(lazy_input);
             AllClose(expected, lazy_pooled);
           });
         }
       }
     }
   }
 }

 // Sweeps torch::max_pool3d with a non-cubic kernel, stride and padding (the
 // middle entry of each is one larger than the other two).
 TEST_F(LazyOpsTest, TestMaxPool3DNonSquare) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({1, 1, 8, 8, 8}, float_options);
   int kernel_size = 4;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         // Test dilation through the CPU interop.
         for (int dilation : {1, 2}) {
           // One pooling call shared by the reference and device sides.
           auto pool = [&](const torch::Tensor& tensor) {
             return torch::max_pool3d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size + 1, kernel_size},
                 /*stride=*/{stride, stride + 1, stride},
                 /*padding=*/{padding, padding + 1, padding},
                 /*dilation=*/{dilation, dilation, dilation},
                 /*ceil_mode=*/ceil_mode);
           };
           torch::Tensor expected = pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_input = CopyToDevice(input, device);
             torch::Tensor lazy_pooled = pool(lazy_input);
             AllClose(expected, lazy_pooled);
           });
         }
       }
     }
   }
 }

 // Same sweep as TestMaxPool2D, but on a 3-D {4, 14, 14} input (no leading
 // batch dimension).
 TEST_F(LazyOpsTest, TestMaxPool2DNoBatch) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 14, 14}, float_options);
   int kernel_size = 3;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         // Test dilation through the CPU interop.
         for (int dilation : {1, 2}) {
           // One pooling call shared by the reference and device sides.
           auto pool = [&](const torch::Tensor& tensor) {
             return torch::max_pool2d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size},
                 /*stride=*/{stride, stride},
                 /*padding=*/{padding, padding},
                 /*dilation=*/{dilation, dilation},
                 /*ceil_mode=*/ceil_mode);
           };
           torch::Tensor expected = pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_input = CopyToDevice(input, device);
             torch::Tensor lazy_pooled = pool(lazy_input);
             AllClose(expected, lazy_pooled);
           });
         }
       }
     }
   }
 }

 // Same sweep as TestMaxPool3D, but on a 4-D {1, 8, 8, 8} input (no leading
 // batch dimension).
 TEST_F(LazyOpsTest, TestMaxPool3DNoBatch) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({1, 8, 8, 8}, float_options);
   int kernel_size = 3;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         // Test dilation through the CPU interop.
         for (int dilation : {1, 2}) {
           // One pooling call shared by the reference and device sides.
           auto pool = [&](const torch::Tensor& tensor) {
             return torch::max_pool3d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size, kernel_size},
                 /*stride=*/{stride, stride, stride},
                 /*padding=*/{padding, padding, padding},
                 /*dilation=*/{dilation, dilation, dilation},
                 /*ceil_mode=*/ceil_mode);
           };
           torch::Tensor expected = pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_input = CopyToDevice(input, device);
             torch::Tensor lazy_pooled = pool(lazy_input);
             AllClose(expected, lazy_pooled);
           });
         }
       }
     }
   }
 }

 // Sweeps torch::avg_pool1d over stride, padding, count_include_pad and
 // ceil_mode on a {4, 1, 28} input.
 TEST_F(LazyOpsTest, TestAvgPool1D) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 1, 28}, float_options);
   int kernel_size = 2;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       for (bool count_include_pad : {true, false}) {
         // Test ceil_mode=true through the CPU interop.
         for (bool ceil_mode : {false, true}) {
           // One pooling call shared by the reference and device sides.
           auto pool = [&](const torch::Tensor& tensor) {
             return torch::avg_pool1d(
                 tensor,
                 /*kernel_size=*/{kernel_size},
                 /*stride=*/{stride},
                 /*padding=*/{padding},
                 /*ceil_mode=*/ceil_mode,
                 /*count_include_pad=*/count_include_pad);
           };
           torch::Tensor expected = pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_input = CopyToDevice(input, device);
             torch::Tensor lazy_pooled = pool(lazy_input);
             AllClose(expected, lazy_pooled);
           });
         }
       }
     }
   }
 }

 // Compares avg_pool2d (square kernel) on each lazy device with the result
 // computed on the default device, over all stride / padding /
 // count_include_pad / ceil_mode combinations.
 TEST_F(LazyOpsTest, TestAvgPool2D) {
   const int kernel_size = 2;
   torch::Tensor input = torch::rand(
       {2, 1, 14, 14},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       for (bool count_include_pad : {true, false}) {
         // ceil_mode=true is exercised through the CPU interop.
         for (bool ceil_mode : {false, true}) {
           // One op invocation shared by the reference and the lazy run.
           auto run_pool = [&](const torch::Tensor& tensor) {
             return torch::avg_pool2d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size},
                 /*stride=*/{stride, stride},
                 /*padding=*/{padding, padding},
                 /*ceil_mode=*/ceil_mode,
                 /*count_include_pad=*/count_include_pad);
           };
           torch::Tensor expected = run_pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor actual = run_pool(CopyToDevice(input, device));
             // The lazy result is moved to the CPU explicitly before comparing.
             AllClose(expected, actual.to(torch::kCPU));
           });
         }
       }
     }
   }
 }

 // Compares avg_pool2d with a non-square kernel, stride and padding (the
 // second dimension is one larger than the first) between lazy devices and
 // the default device.
 TEST_F(LazyOpsTest, TestAvgPool2DNonSquare) {
   const int kernel_size = 4;
   torch::Tensor input = torch::rand(
       {2, 1, 14, 14},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       for (bool count_include_pad : {true, false}) {
         // ceil_mode=true is exercised through the CPU interop.
         for (bool ceil_mode : {false, true}) {
           // One op invocation shared by the reference and the lazy run.
           auto run_pool = [&](const torch::Tensor& tensor) {
             return torch::avg_pool2d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size + 1},
                 /*stride=*/{stride, stride + 1},
                 /*padding=*/{padding, padding + 1},
                 /*ceil_mode=*/ceil_mode,
                 /*count_include_pad=*/count_include_pad);
           };
           torch::Tensor expected = run_pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor actual = run_pool(CopyToDevice(input, device));
             AllClose(expected, actual);
           });
         }
       }
     }
   }
 }

 // Compares avg_pool3d (cubic kernel) on each lazy device with the result
 // computed on the default device, over all stride / padding /
 // count_include_pad / ceil_mode combinations.
 TEST_F(LazyOpsTest, TestAvgPool3D) {
   const int kernel_size = 2;
   torch::Tensor input = torch::rand(
       {1, 1, 7, 7, 7},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       for (bool count_include_pad : {true, false}) {
         // ceil_mode=true is exercised through the CPU interop.
         for (bool ceil_mode : {false, true}) {
           // One op invocation shared by the reference and the lazy run.
           auto run_pool = [&](const torch::Tensor& tensor) {
             return torch::avg_pool3d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size, kernel_size},
                 /*stride=*/{stride, stride, stride},
                 /*padding=*/{padding, padding, padding},
                 /*ceil_mode=*/ceil_mode,
                 /*count_include_pad=*/count_include_pad);
           };
           torch::Tensor expected = run_pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor actual = run_pool(CopyToDevice(input, device));
             AllClose(expected, actual);
           });
         }
       }
     }
   }
 }

 // Compares avg_pool3d between lazy devices and the default device when the
 // stride argument is passed as an empty list.
 TEST_F(LazyOpsTest, TestAvgPool3DIncompleteAttributes) {
   const int kernel_size = 2;
   torch::Tensor input = torch::rand(
       {1, 1, 7, 7, 7},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   // The stride loop variable is not forwarded to the op; every call below
   // passes an empty stride list.
   for (int stride = 1; stride <= 2; ++stride) {
     for (int padding : {0, 1}) {
       for (bool count_include_pad : {true, false}) {
         // ceil_mode=true is exercised through the CPU interop.
         for (bool ceil_mode : {false, true}) {
           // One op invocation shared by the reference and the lazy run.
           auto run_pool = [&](const torch::Tensor& tensor) {
             return torch::avg_pool3d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size, kernel_size},
                 /*stride=*/{},
                 /*padding=*/{padding, padding, padding},
                 /*ceil_mode=*/ceil_mode,
                 /*count_include_pad=*/count_include_pad);
           };
           torch::Tensor expected = run_pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor actual = run_pool(CopyToDevice(input, device));
             AllClose(expected, actual);
           });
         }
       }
     }
   }
 }

 // Compares avg_pool3d with a non-cubic kernel, stride and padding (the
 // middle dimension is one larger than the others) between lazy devices and
 // the default device.
 TEST_F(LazyOpsTest, TestAvgPool3DNonSquare) {
   const int kernel_size = 4;
   torch::Tensor input = torch::rand(
       {1, 1, 7, 7, 7},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       for (bool count_include_pad : {true, false}) {
         // ceil_mode=true is exercised through the CPU interop.
         for (bool ceil_mode : {false, true}) {
           // One op invocation shared by the reference and the lazy run.
           auto run_pool = [&](const torch::Tensor& tensor) {
             return torch::avg_pool3d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size + 1, kernel_size},
                 /*stride=*/{stride, stride + 1, stride},
                 /*padding=*/{padding, padding + 1, padding},
                 /*ceil_mode=*/ceil_mode,
                 /*count_include_pad=*/count_include_pad);
           };
           torch::Tensor expected = run_pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor actual = run_pool(CopyToDevice(input, device));
             AllClose(expected, actual);
           });
         }
       }
     }
   }
 }

 // Compares avg_pool2d on a 3-D input ({1, 7, 7}) between lazy devices and
 // the default device.
 TEST_F(LazyOpsTest, TestAvgPool2DNoBatch) {
   const int kernel_size = 2;
   torch::Tensor input = torch::rand(
       {1, 7, 7}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       for (bool count_include_pad : {true, false}) {
         // ceil_mode=true is exercised through the CPU interop.
         for (bool ceil_mode : {false, true}) {
           // One op invocation shared by the reference and the lazy run.
           auto run_pool = [&](const torch::Tensor& tensor) {
             return torch::avg_pool2d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size},
                 /*stride=*/{stride, stride},
                 /*padding=*/{padding, padding},
                 /*ceil_mode=*/ceil_mode,
                 /*count_include_pad=*/count_include_pad);
           };
           torch::Tensor expected = run_pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor actual = run_pool(CopyToDevice(input, device));
             AllClose(expected, actual);
           });
         }
       }
     }
   }
 }

 // Compares avg_pool3d on a 4-D input ({1, 7, 7, 7}) between lazy devices
 // and the default device.
 TEST_F(LazyOpsTest, TestAvgPool3DNoBatch) {
   const int kernel_size = 2;
   torch::Tensor input = torch::rand(
       {1, 7, 7, 7},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       for (bool count_include_pad : {true, false}) {
         // ceil_mode=true is exercised through the CPU interop.
         for (bool ceil_mode : {false, true}) {
           // One op invocation shared by the reference and the lazy run.
           auto run_pool = [&](const torch::Tensor& tensor) {
             return torch::avg_pool3d(
                 tensor,
                 /*kernel_size=*/{kernel_size, kernel_size, kernel_size},
                 /*stride=*/{stride, stride, stride},
                 /*padding=*/{padding, padding, padding},
                 /*ceil_mode=*/ceil_mode,
                 /*count_include_pad=*/count_include_pad);
           };
           torch::Tensor expected = run_pool(input);
           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor actual = run_pool(CopyToDevice(input, device));
             AllClose(expected, actual);
           });
         }
       }
     }
   }
 }

 // Compares adaptive_avg_pool2d between lazy devices and the default device
 // for square output sizes 7 and 4.
 TEST_F(LazyOpsTest, TestAdaptiveAvgPool2D) {
   torch::Tensor input = torch::rand(
       {4, 1, 28, 28},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int64_t output_size : {7, 4}) {
     // One op invocation shared by the reference and the lazy run.
     auto run_pool = [&](const torch::Tensor& tensor) {
       return torch::adaptive_avg_pool2d(tensor, {output_size, output_size});
     };
     torch::Tensor expected = run_pool(input);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor actual = run_pool(CopyToDevice(input, device));
       AllClose(expected, actual);
     });
   }
 }

 // Compares adaptive_avg_pool3d between lazy devices and the default device
 // for cubic output sizes 7 and 4.
 TEST_F(LazyOpsTest, TestAdaptiveAvgPool3D) {
   torch::Tensor input = torch::rand(
       {9, 4, 56, 28, 28},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int64_t output_size : {7, 4}) {
     // One op invocation shared by the reference and the lazy run.
     auto run_pool = [&](const torch::Tensor& tensor) {
       return torch::adaptive_avg_pool3d(
           tensor, {output_size, output_size, output_size});
     };
     torch::Tensor expected = run_pool(input);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor actual = run_pool(CopyToDevice(input, device));
       AllClose(expected, actual);
     });
   }
 }

 // Compares adaptive_avg_pool3d on a 4-D input ({3, 56, 28, 28}) between
 // lazy devices and the default device for cubic output sizes 7 and 4.
 TEST_F(LazyOpsTest, TestAdaptiveAvgPool3DNoBatch) {
   torch::Tensor input = torch::rand(
       {3, 56, 28, 28},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int64_t output_size : {7, 4}) {
     // One op invocation shared by the reference and the lazy run.
     auto run_pool = [&](const torch::Tensor& tensor) {
       return torch::adaptive_avg_pool3d(
           tensor, {output_size, output_size, output_size});
     };
     torch::Tensor expected = run_pool(input);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor actual = run_pool(CopyToDevice(input, device));
       AllClose(expected, actual);
     });
   }
 }

 // Compares adaptive_avg_pool2d on a 3-D input ({1, 56, 56}) between lazy
 // devices and the default device for square output sizes 7 and 8.
 TEST_F(LazyOpsTest, TestAdaptiveAvgPool2DNoBatch) {
   torch::Tensor input = torch::rand(
       {1, 56, 56}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int64_t output_size : {7, 8}) {
     // One op invocation shared by the reference and the lazy run.
     auto run_pool = [&](const torch::Tensor& tensor) {
       return torch::adaptive_avg_pool2d(tensor, {output_size, output_size});
     };
     torch::Tensor expected = run_pool(input);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor actual = run_pool(CopyToDevice(input, device));
       AllClose(expected, actual);
     });
   }
 }

 // Runs max_pool2d_with_indices on the default device, then checks that
 // max_unpool2d of the pooled values/indices gives the same result on lazy
 // devices as on the default device.
 TEST_F(LazyOpsTest, TestMaxUnpool2D) {
   const int kernel_size = 2;
   torch::Tensor input = torch::rand(
       {2, 2, 8, 8},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // ceil_mode=true is exercised through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         // Dilation is exercised through the CPU interop.
         for (int dilation : {1, 2}) {
           auto pooled = torch::max_pool2d_with_indices(
               input,
               /*kernel_size=*/{kernel_size, kernel_size},
               /*stride=*/{stride, stride},
               /*padding=*/{padding, padding},
               /*dilation=*/{dilation, dilation},
               /*ceil_mode=*/ceil_mode);
           torch::Tensor output = std::get<0>(pooled);
           torch::Tensor indices = std::get<1>(pooled);

           // Unpool back to the spatial extent of the original input.
           std::vector<int64_t> output_size({input.size(2), input.size(3)});
           auto run_unpool = [&](const torch::Tensor& values,
                                 const torch::Tensor& positions) {
             return torch::max_unpool2d(values, positions, output_size);
           };
           at::Tensor utensor = run_unpool(output, indices);

           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_values = CopyToDevice(output, device);
             torch::Tensor lazy_positions = CopyToDevice(indices, device);
             at::Tensor lazy_utensor = run_unpool(lazy_values, lazy_positions);
             AllClose(utensor, lazy_utensor);
           });
         }
       }
     }
   }
 }

 // Runs max_pool3d_with_indices on the default device, then checks that
 // max_unpool3d of the pooled values/indices gives the same result on lazy
 // devices as on the default device.
 TEST_F(LazyOpsTest, TestMaxUnpool3D) {
   const int kernel_size = 2;
   torch::Tensor input = torch::rand(
       {1, 1, 4, 4, 4},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // ceil_mode=true is exercised through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         // Dilation is exercised through the CPU interop.
         for (int dilation : {1, 2}) {
           auto pooled = torch::max_pool3d_with_indices(
               input,
               /*kernel_size=*/{kernel_size, kernel_size, kernel_size},
               /*stride=*/{stride, stride, stride},
               /*padding=*/{padding, padding, padding},
               /*dilation=*/{dilation, dilation, dilation},
               /*ceil_mode=*/ceil_mode);
           torch::Tensor output = std::get<0>(pooled);
           torch::Tensor indices = std::get<1>(pooled);

           // Unpool back to the spatial extent of the original input.
           std::vector<int64_t> output_size(
               {input.size(2), input.size(3), input.size(4)});
           auto run_unpool = [&](const torch::Tensor& values,
                                 const torch::Tensor& positions) {
             return torch::max_unpool3d(
                 values,
                 positions,
                 output_size,
                 /*stride=*/{stride, stride, stride},
                 /*padding=*/{padding, padding, padding});
           };
           at::Tensor utensor = run_unpool(output, indices);

           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_values = CopyToDevice(output, device);
             torch::Tensor lazy_positions = CopyToDevice(indices, device);
             at::Tensor lazy_utensor = run_unpool(lazy_values, lazy_positions);
             AllClose(utensor, lazy_utensor);
           });
         }
       }
     }
   }
 }

 // Compares nll_loss between lazy devices and the default device across
 // ignore_index values, optional class weights and reduction modes.
 // Currently skipped unconditionally (see the TODO below).
 TEST_F(LazyOpsTest, TestNllLoss) {
   // TODO(whc) debug divide-by-zero failure under ASAN
   GTEST_SKIP();

   const int batch = 6;
   const int classes = 2;
   // TODO(asuhan): Fix the torch::kDouble case.
   for (auto dtype : {torch::kFloat}) {
     for (int ignore_index : {-1, 0, 1, 5}) {
       for (bool def_weight : {false, true}) {
         torch::Tensor input = torch::rand(
             {batch, classes},
             torch::TensorOptions(dtype).device(DefaultDevice()));
         // The lower bound drops to ignore_index when that is negative.
         torch::Tensor target = torch::randint(
             std::min(ignore_index, 0),
             classes,
             {batch},
             torch::TensorOptions(torch::kLong).device(DefaultDevice()));
         // Stays undefined unless def_weight is set.
         torch::Tensor weight;
         if (def_weight) {
           weight = torch::rand(
               {classes}, torch::TensorOptions(dtype).device(DefaultDevice()));
         }
         for (torch::Reduction::Reduction reduction :
              {torch::Reduction::Mean,
               torch::Reduction::Sum,
               torch::Reduction::None}) {
           // One op invocation shared by the reference and the lazy run.
           auto run_loss = [&](const torch::Tensor& self_arg,
                               const torch::Tensor& target_arg,
                               const torch::Tensor& weight_arg) {
             return torch::nll_loss(
                 /*self=*/self_arg,
                 /*target=*/target_arg,
                 /*weight=*/weight_arg,
                 /*reduction=*/reduction,
                 /*ignore_index=*/ignore_index);
           };
           torch::Tensor expected = run_loss(input, target, weight);

           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_input = CopyToDevice(input, device);
             torch::Tensor lazy_target = CopyToDevice(target, device);
             torch::Tensor lazy_weight;
             if (def_weight) {
               lazy_weight = CopyToDevice(weight, device);
             }
             torch::Tensor actual =
                 run_loss(lazy_input, lazy_target, lazy_weight);
             AllClose(expected, actual);
           });
         }
       }
     }
   }
 }

 // Compares nll_loss2d between lazy devices and the default device across
 // ignore_index values, optional class weights and reduction modes.
 TEST_F(LazyOpsTest, TestNllLoss2d) {
   const int batch = 6;
   const int classes = 2;
   const int height = 3;
   const int width = 3;
   // TODO(asuhan): Fix the torch::kDouble case.
   for (auto dtype : {torch::kFloat}) {
     for (int ignore_index : {-1, 0, 1, 5}) {
       for (bool def_weight : {false, true}) {
         torch::Tensor input = torch::rand(
             {batch, classes, height, width},
             torch::TensorOptions(dtype).device(DefaultDevice()));
         // The lower bound drops to ignore_index when that is negative.
         torch::Tensor target = torch::randint(
             std::min(ignore_index, 0),
             classes,
             {batch, height, width},
             torch::TensorOptions(torch::kLong).device(DefaultDevice()));
         // Stays undefined unless def_weight is set.
         torch::Tensor weight;
         if (def_weight) {
           weight = torch::rand(
               {classes}, torch::TensorOptions(dtype).device(DefaultDevice()));
         }
         for (torch::Reduction::Reduction reduction :
              {torch::Reduction::Mean,
               torch::Reduction::Sum,
               torch::Reduction::None}) {
           // One op invocation shared by the reference and the lazy run.
           auto run_loss = [&](const torch::Tensor& self_arg,
                               const torch::Tensor& target_arg,
                               const torch::Tensor& weight_arg) {
             return torch::nll_loss2d(
                 /*self=*/self_arg,
                 /*target=*/target_arg,
                 /*weight=*/weight_arg,
                 /*reduction=*/reduction,
                 /*ignore_index=*/ignore_index);
           };
           torch::Tensor expected = run_loss(input, target, weight);

           ForEachDevice([&](const torch::Device& device) {
             torch::Tensor lazy_input = CopyToDevice(input, device);
             torch::Tensor lazy_target = CopyToDevice(target, device);
             torch::Tensor lazy_weight;
             if (def_weight) {
               lazy_weight = CopyToDevice(weight, device);
             }
             torch::Tensor actual =
                 run_loss(lazy_input, lazy_target, lazy_weight);
             AllClose(expected, actual);
           });
         }
       }
     }
   }
 }

 // Compares smooth_l1_loss between lazy devices and the default device for
 // each reduction mode and for beta values 0.25 and 1.
 TEST_F(LazyOpsTest, TestSmoothL1Loss) {
   torch::Tensor input = torch::randn(
       {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor target = torch::randn(
       {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (torch::Reduction::Reduction reduction :
        {torch::Reduction::None,
         torch::Reduction::Mean,
         torch::Reduction::Sum}) {
     for (double beta : {0.25, 1.}) {
       // One op invocation shared by the reference and the lazy run.
       auto run_loss = [&](const torch::Tensor& prediction,
                           const torch::Tensor& truth) {
         return torch::smooth_l1_loss(prediction, truth, reduction, beta);
       };
       torch::Tensor expected = run_loss(input, target);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_input = CopyToDevice(input, device);
         torch::Tensor lazy_target = CopyToDevice(target, device);
         torch::Tensor actual = run_loss(lazy_input, lazy_target);
         AllClose(expected, actual);
       });
     }
   }
 }

 // Compares l1_loss between lazy devices and the default device for each
 // reduction mode.
 TEST_F(LazyOpsTest, TestL1Loss) {
   torch::Tensor input = torch::randn(
       {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor target = torch::randn(
       {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (torch::Reduction::Reduction reduction :
        {torch::Reduction::None,
         torch::Reduction::Mean,
         torch::Reduction::Sum}) {
     // One op invocation shared by the reference and the lazy run.
     auto run_loss = [&](const torch::Tensor& prediction,
                         const torch::Tensor& truth) {
       return torch::l1_loss(prediction, truth, reduction);
     };
     torch::Tensor expected = run_loss(input, target);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor lazy_target = CopyToDevice(target, device);
       torch::Tensor actual = run_loss(lazy_input, lazy_target);
       AllClose(expected, actual);
     });
   }
 }

 // Runs TestBackward on l1_loss for each reduction mode; only the first
 // input is created with requires_grad.
 TEST_F(LazyOpsTest, TestL1LossBackward) {
   for (torch::Reduction::Reduction reduction :
        {torch::Reduction::None,
         torch::Reduction::Mean,
         torch::Reduction::Sum}) {
     auto testfn =
         [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
       return torch::l1_loss(args[0], args[1], reduction);
     };
     ForEachDevice([&](const torch::Device& device) {
       // Fresh random operands are drawn for every device.
       torch::Tensor prediction = torch::rand(
           {2, 4},
           torch::TensorOptions(torch::kFloat)
               .device(DefaultDevice())
               .requires_grad(true));
       torch::Tensor truth = torch::rand(
           {2, 4},
           torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
       TestBackward({prediction, truth}, device, testfn);
     });
   }
 }

 // Compares mse_loss between lazy devices and the default device for each
 // reduction mode.
 TEST_F(LazyOpsTest, TestMseLoss) {
   torch::Tensor input = torch::randn(
       {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor target = torch::randn(
       {2, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   for (torch::Reduction::Reduction reduction :
        {torch::Reduction::None,
         torch::Reduction::Mean,
         torch::Reduction::Sum}) {
     // One op invocation shared by the reference and the lazy run.
     auto run_loss = [&](const torch::Tensor& prediction,
                         const torch::Tensor& truth) {
       return torch::mse_loss(prediction, truth, reduction);
     };
     torch::Tensor expected = run_loss(input, target);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor lazy_target = CopyToDevice(target, device);
       torch::Tensor actual = run_loss(lazy_input, lazy_target);
       AllClose(expected, actual);
     });
   }
 }

 // Runs TestBackward on mse_loss for each reduction mode; only the first
 // input is created with requires_grad.
 TEST_F(LazyOpsTest, TestMseLossBackward) {
   for (torch::Reduction::Reduction reduction :
        {torch::Reduction::None,
         torch::Reduction::Mean,
         torch::Reduction::Sum}) {
     auto testfn =
         [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
       return torch::mse_loss(args[0], args[1], reduction);
     };
     ForEachDevice([&](const torch::Device& device) {
       // Fresh random operands are drawn for every device.
       torch::Tensor prediction = torch::rand(
           {2, 4},
           torch::TensorOptions(torch::kFloat)
               .device(DefaultDevice())
               .requires_grad(true));
       torch::Tensor truth = torch::rand(
           {2, 4},
           torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
       TestBackward({prediction, truth}, device, testfn);
     });
   }
 }

 // Compares batch_norm on a {2, 3, 4} input between lazy devices and the
 // default device, in training and eval mode, with and without weight/bias.
 TEST_F(LazyOpsTest, TestBatchNorm1D) {
   const int num_features = 3;
   torch::Tensor input = torch::rand(
       {2, num_features, 4},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor weight = torch::rand(
       {num_features},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor bias = torch::rand(
       {num_features},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor running_mean = torch::zeros(
       {num_features},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor running_var = torch::ones(
       {num_features},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const double momentum = 0.1;
   const double eps = 0.5;
   // Default-constructed tensor standing in for an absent weight/bias.
   torch::Tensor undef;
   for (bool training : {true, false}) {
     for (bool undef_weight_bias : {false, true}) {
       // One op invocation shared by the reference and the lazy run.
       auto run_norm = [&](const torch::Tensor& in,
                           const torch::Tensor& scale,
                           const torch::Tensor& shift,
                           const torch::Tensor& mean,
                           const torch::Tensor& var) {
         return torch::batch_norm(
             /*input=*/in,
             /*weight=*/scale,
             /*bias=*/shift,
             /*running_mean=*/mean,
             /*running_var=*/var,
             /*training=*/training,
             /*momentum=*/momentum,
             /*eps=*/eps,
             /*cudnn_enabled=*/false);
       };
       torch::Tensor expected = run_norm(
           input,
           undef_weight_bias ? undef : weight,
           undef_weight_bias ? undef : bias,
           running_mean,
           running_var);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_input = CopyToDevice(input, device);
         torch::Tensor lazy_weight =
             undef_weight_bias ? undef : CopyToDevice(weight, device);
         torch::Tensor lazy_bias =
             undef_weight_bias ? undef : CopyToDevice(bias, device);
         torch::Tensor lazy_running_mean = CopyToDevice(running_mean, device);
         torch::Tensor lazy_running_var = CopyToDevice(running_var, device);
         torch::Tensor actual = run_norm(
             lazy_input,
             lazy_weight,
             lazy_bias,
             lazy_running_mean,
             lazy_running_var);
         AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
       });
     }
   }
 }

 // Compares batch_norm on a {2, 3, 4, 4} input between lazy devices and the
 // default device, in training and eval mode, with and without weight/bias.
 TEST_F(LazyOpsTest, TestBatchNorm2D) {
   const int num_features = 3;
   torch::Tensor input = torch::rand(
       {2, num_features, 4, 4},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor weight = torch::rand(
       {num_features},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor bias = torch::rand(
       {num_features},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor running_mean = torch::zeros(
       {num_features},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor running_var = torch::ones(
       {num_features},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   const double momentum = 0.1;
   const double eps = 0.5;
   // Default-constructed tensor standing in for an absent weight/bias.
   torch::Tensor undef;
   for (bool training : {true, false}) {
     for (bool undef_weight_bias : {false, true}) {
       // One op invocation shared by the reference and the lazy run.
       auto run_norm = [&](const torch::Tensor& in,
                           const torch::Tensor& scale,
                           const torch::Tensor& shift,
                           const torch::Tensor& mean,
                           const torch::Tensor& var) {
         return torch::batch_norm(
             /*input=*/in,
             /*weight=*/scale,
             /*bias=*/shift,
             /*running_mean=*/mean,
             /*running_var=*/var,
             /*training=*/training,
             /*momentum=*/momentum,
             /*eps=*/eps,
             /*cudnn_enabled=*/false);
       };
       torch::Tensor expected = run_norm(
           input,
           undef_weight_bias ? undef : weight,
           undef_weight_bias ? undef : bias,
           running_mean,
           running_var);
       ForEachDevice([&](const torch::Device& device) {
         torch::Tensor lazy_input = CopyToDevice(input, device);
         torch::Tensor lazy_weight =
             undef_weight_bias ? undef : CopyToDevice(weight, device);
         torch::Tensor lazy_bias =
             undef_weight_bias ? undef : CopyToDevice(bias, device);
         torch::Tensor lazy_running_mean = CopyToDevice(running_mean, device);
         torch::Tensor lazy_running_var = CopyToDevice(running_var, device);
         torch::Tensor actual = run_norm(
             lazy_input,
             lazy_weight,
             lazy_bias,
             lazy_running_mean,
             lazy_running_var);
         AllClose(expected, actual, /*rtol=*/1e-3, /*atol=*/1e-5);
       });
     }
   }
 }

 // Checks that a tensor copied to a lazy device reports the same dim() as
 // the original.
 TEST_F(LazyOpsTest, TestDim) {
   torch::Tensor input = torch::rand(
       {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor on_device = CopyToDevice(input, device);
     EXPECT_EQ(input.dim(), on_device.dim());
   });
 }

 // Compares torch::native::contiguous between lazy devices and the default
 // device.
 TEST_F(LazyOpsTest, TestContiguous) {
   torch::Tensor input = torch::rand(
       {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor expected = torch::native::contiguous(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor on_device = CopyToDevice(input, device);
     torch::Tensor actual = torch::native::contiguous(on_device);
     AllClose(expected, actual);
   });
 }

 // Compares squeeze() with no dim argument on a {2, 1, 3, 1} input between
 // lazy devices and the default device.
 TEST_F(LazyOpsTest, TestSqueezeAll) {
   torch::Tensor input = torch::rand(
       {2, 1, 3, 1},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor expected = torch::squeeze(input);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor on_device = CopyToDevice(input, device);
     torch::Tensor actual = torch::squeeze(on_device);
     AllClose(expected, actual);
   });
 }

 // Checks in-place squeeze_() with no dim argument: the returned tensor, the
 // mutated operand and its shape must all match between the lazy device and
 // the default device.
 TEST_F(LazyOpsTest, TestSqueezeAllInPlace) {
   ForEachDevice([&](const torch::Device& device) {
     // The operand is created per device because squeeze_ mutates it.
     torch::Tensor input = torch::rand(
         {2, 1, 3, 1},
         torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor output = input.squeeze_();
     torch::Tensor lazy_output = lazy_input.squeeze_();
     AllClose(output, lazy_output);
     AllClose(input, lazy_input);
     ASSERT_EQ(input.dim(), lazy_input.dim());
     // Compare the resulting shapes one dimension at a time.
     int64_t axis = 0;
     while (axis < input.dim()) {
       ASSERT_EQ(input.size(axis), lazy_input.size(axis));
       ++axis;
     }
   });
 }

 // Compares squeeze(dim) on a {2, 1, 3, 1} input between lazy devices and
 // the default device for every dim in [-rank, rank).
 TEST_F(LazyOpsTest, TestSqueezeOne) {
   torch::Tensor input = torch::rand(
       {2, 1, 3, 1},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   int rank = input.dim();
   for (int dim = -rank; dim < rank; ++dim) {
     // One op invocation shared by the reference and the lazy run.
     auto run_squeeze = [&](const torch::Tensor& tensor) {
       return torch::squeeze(tensor, dim);
     };
     torch::Tensor expected = run_squeeze(input);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor actual = run_squeeze(CopyToDevice(input, device));
       AllClose(expected, actual);
     });
   }
 }

 // Checks in-place squeeze_(dim) for every dim in [-4, 4): the returned
 // tensor, the mutated operand and its shape must all match between the
 // lazy device and the default device.
 TEST_F(LazyOpsTest, TestSqueezeOneInPlace) {
   const int rank = 4;
   for (int dim = -rank; dim < rank; ++dim) {
     ForEachDevice([&](const torch::Device& device) {
       // The operand is created per iteration because squeeze_ mutates it.
       torch::Tensor input = torch::rand(
           {2, 1, 3, 1},
           torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor output = input.squeeze_(dim);
       torch::Tensor lazy_output = lazy_input.squeeze_(dim);
       AllClose(output, lazy_output);
       AllClose(input, lazy_input);
       ASSERT_EQ(input.dim(), lazy_input.dim());
       // Compare the resulting shapes one dimension at a time.
       int64_t axis = 0;
       while (axis < input.dim()) {
         ASSERT_EQ(input.size(axis), lazy_input.size(axis));
         ++axis;
       }
     });
   }
 }

 // Compares unsqueeze(dim) on a {2, 3} input between lazy devices and the
 // default device for every dim in [-(input.dim() + 1), input.dim() + 1).
 TEST_F(LazyOpsTest, TestUnsqueeze) {
   torch::Tensor input = torch::rand(
       {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   int rank = input.dim() + 1;
   for (int dim = -rank; dim < rank; ++dim) {
     // One op invocation shared by the reference and the lazy run.
     auto run_unsqueeze = [&](const torch::Tensor& tensor) {
       return torch::unsqueeze(tensor, dim);
     };
     torch::Tensor expected = run_unsqueeze(input);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor actual = run_unsqueeze(CopyToDevice(input, device));
       AllClose(expected, actual);
     });
   }
 }

 // For every dim in [-(rank + 1), rank + 1) of a {2, 3} float tensor, applies
 // unsqueeze_(dim) in place to a fresh eager tensor and to its lazy copy, then
 // compares the returned tensors, the mutated inputs, and their sizes.
 TEST_F(LazyOpsTest, TestUnsqueezeInPlace) {
   const std::vector<int64_t> input_sizes = {2, 3};
   // One more than the input rank: the bound of the dim range swept below.
   const int rank = static_cast<int>(input_sizes.size()) + 1;
   for (int dim = -rank; dim < rank; ++dim) {
     ForEachDevice([&](const torch::Device& device) {
       // unsqueeze_ mutates its receiver, so the input is rebuilt on every
       // iteration. Sharing one tensor across iterations would let it gain a
       // dimension each time and the swept dim range would no longer match
       // the tensor actually under test.
       torch::Tensor input = torch::rand(
           input_sizes,
           torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor output = input.unsqueeze_(dim);
       torch::Tensor lazy_output = lazy_input.unsqueeze_(dim);
       AllClose(output, lazy_output);
       AllClose(input, lazy_input);
       ASSERT_EQ(input.dim(), lazy_input.dim());
       for (int64_t dim_idx = 0; dim_idx < input.dim(); ++dim_idx) {
         ASSERT_EQ(input.size(dim_idx), lazy_input.size(dim_idx));
       }
     });
   }
 }

 // Applies out-of-place torch::masked_fill with the scalar 42 to a {2, 3}
 // float tensor using a random {2, 3} boolean mask, and compares the eager
 // result with the lazy-device result.
 TEST_F(LazyOpsTest, TestMaskedFill) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 3}, float_options);
   const auto bool_options =
       torch::TensorOptions(torch::kBool).device(DefaultDevice());
   torch::Tensor mask = torch::randint(0, 2, {2, 3}, bool_options);
   const torch::Scalar fill_value(42);
   const torch::Tensor expected = torch::masked_fill(input, mask, fill_value);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_input = CopyToDevice(input, device);
     const torch::Tensor lazy_mask = CopyToDevice(mask, device);
     const torch::Tensor lazy_actual =
         torch::masked_fill(lazy_input, lazy_mask, fill_value);
     AllClose(expected, lazy_actual);
   });
 }

 // Applies masked_fill_ in place with the scalar 42 to a fresh {2, 3} float
 // tensor and to its lazy copy, then compares both the returned tensors and
 // the mutated inputs.
 TEST_F(LazyOpsTest, TestMaskedFillInPlace) {
   const torch::Scalar fill_value(42);
   const auto bool_options =
       torch::TensorOptions(torch::kBool).device(DefaultDevice());
   torch::Tensor mask = torch::randint(0, 2, {2, 3}, bool_options);
   ForEachDevice([&](const torch::Device& device) {
     const auto float_options =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor eager = torch::rand({2, 3}, float_options);
     torch::Tensor lazy = CopyToDevice(eager, device);
     torch::Tensor lazy_mask = CopyToDevice(mask, device);
     torch::Tensor eager_result = eager.masked_fill_(mask, fill_value);
     torch::Tensor lazy_result = lazy.masked_fill_(lazy_mask, fill_value);
     AllClose(eager_result, lazy_result);
     AllClose(eager, lazy);
   });
 }

 // Same as the plain masked_fill comparison, but the {4, 1} boolean mask has a
 // different shape from the {2, 5, 4, 3} input it is applied to.
 TEST_F(LazyOpsTest, TestMaskedFillBroadcast) {
   const auto float_options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 5, 4, 3}, float_options);
   const auto bool_options =
       torch::TensorOptions(torch::kBool).device(DefaultDevice());
   torch::Tensor mask = torch::randint(0, 2, {4, 1}, bool_options);
   const torch::Scalar fill_value(42);
   const torch::Tensor expected = torch::masked_fill(input, mask, fill_value);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_input = CopyToDevice(input, device);
     const torch::Tensor lazy_mask = CopyToDevice(mask, device);
     const torch::Tensor lazy_actual =
         torch::masked_fill(lazy_input, lazy_mask, fill_value);
     AllClose(expected, lazy_actual);
   });
 }

 // Fills a {2, 3} float tensor and its lazy copy in place with the scalar 42
 // via torch::fill_, then compares the returned tensors and the inputs.
 TEST_F(LazyOpsTest, TestFill) {
   const torch::Scalar fill_value(42);
   ForEachDevice([&](const torch::Device& device) {
     const auto options =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor eager = torch::empty({2, 3}, options);
     torch::Tensor lazy = CopyToDevice(eager, device);
     torch::Tensor eager_result = torch::fill_(eager, fill_value);
     torch::Tensor lazy_result = torch::fill_(lazy, fill_value);
     AllClose(eager_result, lazy_result);
     AllClose(eager, lazy);
   });
 }

 // Fills a {2, 3} float tensor in place using a rank-0 tensor as the fill
 // value. The lazy side uses a lazy copy of both the input and the value, and
 // the returned tensors and mutated inputs are compared with the eager side.
 TEST_F(LazyOpsTest, TestFillWithRank0) {
   torch::Tensor value = torch::scalar_tensor(42);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor input = torch::empty(
         {2, 3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor result = torch::fill_(input, value);
     torch::Tensor lazy_value = CopyToDevice(value, device);
     // Pass the lazy copy of the value so the lazy fill_ is driven by a lazy
     // rank-0 tensor rather than the eager one.
     torch::Tensor lazy_result = torch::fill_(lazy_input, lazy_value);
     AllClose(result, lazy_result);
     AllClose(input, lazy_input);
   });
 }

 // Permutes a {2, 3, 4} float tensor with each of the six orderings of its
 // dims, first with non-negative indices and then with the same indices
 // shifted by -rank, comparing eager and lazy results each time.
 TEST_F(LazyOpsTest, TestPermute) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 3, 4}, options);
   const std::vector<std::vector<int64_t>> all_orderings = {
       {0, 1, 2}, {0, 2, 1}, {1, 0, 2}, {1, 2, 0}, {2, 0, 1}, {2, 1, 0}};
   const int rank = input.dim();
   for (const auto& ordering : all_orderings) {
     // Mutable copy: the negative pass rewrites the indices in place.
     std::vector<int64_t> dims = ordering;
     for (const bool use_negative : {false, true}) {
       if (use_negative) {
         for (int64_t& d : dims) {
           d -= rank;
         }
       }
       const torch::Tensor expected = input.permute(dims);
       ForEachDevice([&](const torch::Device& device) {
         const torch::Tensor lazy_input = CopyToDevice(input, device);
         const torch::Tensor lazy_actual = lazy_input.permute(dims);
         AllClose(expected, lazy_actual);
       });
     }
   }
 }

 // For each ordering of a {2, 3, 4} zero tensor's dims (non-negative and
 // shifted by -rank), takes the permuted tensor, adds one in place to it and
 // then to the source tensor, and compares both tensors between the eager and
 // lazy sides.
 TEST_F(LazyOpsTest, TestPermuteMod) {
   const std::vector<std::vector<int64_t>> all_orderings = {
       {0, 1, 2}, {0, 2, 1}, {1, 0, 2}, {1, 2, 0}, {2, 0, 1}, {2, 1, 0}};
   const std::vector<int64_t> input_sizes = {2, 3, 4};
   const int rank = input_sizes.size();
   for (const auto& ordering : all_orderings) {
     // Mutable copy: the negative pass rewrites the indices in place.
     std::vector<int64_t> dims = ordering;
     for (const bool use_negative : {false, true}) {
       if (use_negative) {
         for (int64_t& d : dims) {
           d -= rank;
         }
       }
       const auto options =
           torch::TensorOptions(torch::kFloat).device(DefaultDevice());
       torch::Tensor eager_input = torch::zeros(input_sizes, options);
       torch::Tensor one = torch::tensor(1.0, options);
       torch::Tensor eager_output = eager_input.permute(dims);
       eager_output.add_(one, 1.0);
       eager_input.add_(one, 1.0);
       ForEachDevice([&](const torch::Device& device) {
         const auto lazy_source_options =
             torch::TensorOptions(torch::kFloat).device(DefaultDevice());
         torch::Tensor zeros = torch::zeros(input_sizes, lazy_source_options);
         torch::Tensor lazy_input = CopyToDevice(zeros, device);
         torch::Tensor lazy_one = CopyToDevice(one, device);
         torch::Tensor lazy_output = lazy_input.permute(dims);
         lazy_output.add_(lazy_one, 1.0);
         lazy_input.add_(lazy_one, 1.0);
         AllClose(eager_output, lazy_output);
         AllClose(eager_input, lazy_input);
       });
     }
   }
 }

 // Flips a {2, 3, 4} float tensor along each listed combination of dims,
 // first with non-negative indices and then with the same indices shifted by
 // -3, comparing eager and lazy results.
 TEST_F(LazyOpsTest, TestFlip) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 3, 4}, options);
   const std::vector<std::vector<int64_t>> dim_combinations = {
       {0}, {1}, {2}, {0, 1}, {1, 2}, {2, 0}, {0, 1, 2}};
   for (const auto& combination : dim_combinations) {
     // Mutable copy: the negative pass rewrites the indices in place.
     std::vector<int64_t> dims = combination;
     for (const bool use_negative : {false, true}) {
       if (use_negative) {
         for (int64_t& d : dims) {
           d -= 3;
         }
       }
       const torch::Tensor expected = torch::flip(input, dims);
       ForEachDevice([&](const torch::Device& device) {
         const torch::Tensor lazy_input = CopyToDevice(input, device);
         const torch::Tensor lazy_actual = torch::flip(lazy_input, dims);
         AllClose(expected, lazy_actual);
       });
     }
   }
 }

 // Compares torch::pixel_shuffle with an upscale factor of 3 on a
 // {5, 18, 4, 4} float tensor between the eager and lazy devices.
 TEST_F(LazyOpsTest, TestPixelShuffle) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({5, 18, 4, 4}, options);
   const int upscale_factor = 3;
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_input = CopyToDevice(input, device);
     const torch::Tensor expected = torch::pixel_shuffle(input, upscale_factor);
     const torch::Tensor lazy_actual =
         torch::pixel_shuffle(lazy_input, upscale_factor);
     AllClose(expected, lazy_actual);
   });
 }

 // Compares Tensor::sum_to_size with target size {4, 1, 1, 7} on a
 // {4, 6, 3, 7} float tensor between the eager and lazy devices.
 TEST_F(LazyOpsTest, TestSumToSize) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 6, 3, 7}, options);
   const std::vector<int64_t> target_size = {4, 1, 1, 7};
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_input = CopyToDevice(input, device);
     const torch::Tensor expected = input.sum_to_size(target_size);
     const torch::Tensor lazy_actual = lazy_input.sum_to_size(target_size);
     AllClose(expected, lazy_actual);
   });
 }

 // Compares torch::transpose over dims 0 and 2 of a {2, 3, 4} float tensor
 // between the eager and lazy devices.
 TEST_F(LazyOpsTest, TestTransposeDims) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 3, 4}, options);
   const int first_dim = 0;
   const int second_dim = 2;
   const torch::Tensor expected =
       torch::transpose(input, first_dim, second_dim);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_input = CopyToDevice(input, device);
     const torch::Tensor lazy_actual =
         torch::transpose(lazy_input, first_dim, second_dim);
     AllClose(expected, lazy_actual);
   });
 }

 // Transposes dims 0 and 2 of a {2, 3, 4} zero tensor, adds one in place to
 // the transposed tensor and then to the source tensor, and compares both
 // tensors between the eager and lazy sides.
 TEST_F(LazyOpsTest, TestTransposeDimsMod) {
   const std::vector<int64_t> input_sizes = {2, 3, 4};
   const int first_dim = 0;
   const int second_dim = 2;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager_input = torch::zeros(input_sizes, options);
   torch::Tensor one = torch::tensor(1.0, options);
   torch::Tensor eager_output =
       torch::transpose(eager_input, first_dim, second_dim);
   eager_output.add_(one, 1.0);
   eager_input.add_(one, 1.0);
   ForEachDevice([&](const torch::Device& device) {
     const auto lazy_source_options =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor zeros = torch::zeros(input_sizes, lazy_source_options);
     torch::Tensor lazy_input = CopyToDevice(zeros, device);
     torch::Tensor lazy_one = CopyToDevice(one, device);
     torch::Tensor lazy_output =
         torch::transpose(lazy_input, first_dim, second_dim);
     lazy_output.add_(lazy_one, 1.0);
     lazy_input.add_(lazy_one, 1.0);
     AllClose(eager_output, lazy_output);
     AllClose(eager_input, lazy_input);
   });
 }

 // Applies transpose_(0, 2) in place to a {2, 3, 4} float tensor and to its
 // lazy copy, then compares the returned tensors and the mutated inputs.
 TEST_F(LazyOpsTest, TestTransposeDimsInPlace) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager = torch::rand({2, 3, 4}, options);
   const int first_dim = 0;
   const int second_dim = 2;
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy = CopyToDevice(eager, device);
     torch::Tensor eager_result = eager.transpose_(first_dim, second_dim);
     torch::Tensor lazy_result = lazy.transpose_(first_dim, second_dim);
     AllClose(eager_result, lazy_result);
     AllClose(eager, lazy);
   });
 }

 // Splits a {7, 8, 9} float tensor with chunk sizes 2 and 3 along every dim in
 // [-rank, rank) and compares the number of pieces and each piece between the
 // eager and lazy devices.
 TEST_F(LazyOpsTest, TestSplit) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({7, 8, 9}, options);
   const int rank = input.dim();
   for (int split_size : {2, 3}) {
     for (int dim = -rank; dim < rank; ++dim) {
       const std::vector<torch::Tensor> expected =
           torch::split(input, split_size, dim);
       ForEachDevice([&](const torch::Device& device) {
         const torch::Tensor lazy_input = CopyToDevice(input, device);
         const std::vector<torch::Tensor> lazy_actual =
             torch::split(lazy_input, split_size, dim);
         ASSERT_EQ(expected.size(), lazy_actual.size());
         const size_t piece_count = expected.size();
         for (size_t piece = 0; piece < piece_count; ++piece) {
           AllClose(expected[piece], lazy_actual[piece]);
         }
       });
     }
   }
 }

 // Splits an empty {0} float tensor with split size 0 along dim 0 and compares
 // the number of pieces and each piece between the eager and lazy devices.
 TEST_F(LazyOpsTest, TestSplitEmpty) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({0}, options);
   int split_size = 0;
   int dim = 0;
   const std::vector<torch::Tensor> expected =
       torch::split(input, split_size, dim);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_input = CopyToDevice(input, device);
     const std::vector<torch::Tensor> lazy_actual =
         torch::split(lazy_input, split_size, dim);
     ASSERT_EQ(expected.size(), lazy_actual.size());
     const size_t piece_count = expected.size();
     for (size_t piece = 0; piece < piece_count; ++piece) {
       AllClose(expected[piece], lazy_actual[piece]);
     }
   });
 }

 // Splits a {15, 15, 15} float tensor into pieces of sizes {4, 5, 6} along
 // every dim in [-rank, rank) and compares the pieces between the eager and
 // lazy devices.
 TEST_F(LazyOpsTest, TestSplitWithSizes) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({15, 15, 15}, options);
   const int rank = input.dim();
   for (int dim = -rank; dim < rank; ++dim) {
     const std::vector<torch::Tensor> expected =
         torch::split_with_sizes(input, {4, 5, 6}, dim);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const std::vector<torch::Tensor> lazy_actual =
           torch::split_with_sizes(lazy_input, {4, 5, 6}, dim);
       ASSERT_EQ(expected.size(), lazy_actual.size());
       const size_t piece_count = expected.size();
       for (size_t piece = 0; piece < piece_count; ++piece) {
         AllClose(expected[piece], lazy_actual[piece]);
       }
     });
   }
 }

 // Calls torch::cross without an explicit dim on pairs of random float tensors
 // of shapes {4, 5, 3}, {4, 3, 5} and {3, 4, 5}, comparing eager and lazy
 // results.
 TEST_F(LazyOpsTest, TestCrossImplicitDim) {
   const std::vector<std::vector<int64_t>> shapes = {
       {4, 5, 3}, {4, 3, 5}, {3, 4, 5}};
   for (const auto& shape : shapes) {
     const auto options =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor input = torch::rand(shape, options);
     torch::Tensor other = torch::rand(shape, options);
     const torch::Tensor expected = torch::cross(input, other);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_other = CopyToDevice(other, device);
       const torch::Tensor lazy_actual = torch::cross(lazy_input, lazy_other);
       AllClose(expected, lazy_actual);
     });
   }
 }

 // Calls torch::cross with every explicit dim in [-rank, rank) on a pair of
 // random {3, 3} float tensors, comparing eager and lazy results.
 TEST_F(LazyOpsTest, TestCrossExplicitDim) {
   const std::vector<int64_t> shape = {3, 3};
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand(shape, options);
   torch::Tensor other = torch::rand(shape, options);
   const int rank = shape.size();
   for (int dim = -rank; dim < rank; ++dim) {
     const torch::Tensor expected = torch::cross(input, other, dim);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_other = CopyToDevice(other, device);
       const torch::Tensor lazy_actual =
           torch::cross(lazy_input, lazy_other, dim);
       AllClose(expected, lazy_actual);
     });
   }
 }

 // Calls torch::cross of a {0, 1, 3, 0} float tensor with itself and compares
 // the eager result with the lazy-device result.
 TEST_F(LazyOpsTest, TestCrossZeroDim) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({0, 1, 3, 0}, options);
   const torch::Tensor expected = torch::cross(input, input);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_input = CopyToDevice(input, device);
     const torch::Tensor lazy_actual = torch::cross(lazy_input, lazy_input);
     AllClose(expected, lazy_actual);
   });
 }

 // Compares torch::triu on a 5x5 float matrix between the eager and lazy
 // devices for every diagonal offset from -5 to 5 inclusive.
 TEST_F(LazyOpsTest, TestTriu) {
   const int size = 5;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({size, size}, options);
   // The sweep deliberately includes the extreme offsets -size and +size.
   for (int offset = -size; offset <= size; ++offset) {
     const torch::Tensor expected = torch::triu(input, offset);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_actual = torch::triu(lazy_input, offset);
       AllClose(expected, lazy_actual);
     });
   }
 }

 // Compares torch::triu on a 5x6 float matrix between the eager and lazy
 // devices for every diagonal offset from -5 to 5 inclusive.
 TEST_F(LazyOpsTest, TestTriuNonSquare) {
   const int size = 5;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({size, size + 1}, options);
   // The sweep deliberately includes the extreme offsets -size and +size.
   for (int offset = -size; offset <= size; ++offset) {
     const torch::Tensor expected = torch::triu(input, offset);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_actual = torch::triu(lazy_input, offset);
       AllClose(expected, lazy_actual);
     });
   }
 }

 // Compares torch::triu on a {3, 5, 5} float tensor between the eager and lazy
 // devices for every diagonal offset from -5 to 5 inclusive.
 TEST_F(LazyOpsTest, TestTriuBatch) {
   const int size = 5;
   const int batch_size = 3;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({batch_size, size, size}, options);
   // The sweep deliberately includes the extreme offsets -size and +size.
   for (int offset = -size; offset <= size; ++offset) {
     const torch::Tensor expected = torch::triu(input, offset);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_actual = torch::triu(lazy_input, offset);
       AllClose(expected, lazy_actual);
     });
   }
 }

 // Compares torch::tril on a 5x5 float matrix between the eager and lazy
 // devices for every diagonal offset from -5 to 5 inclusive.
 TEST_F(LazyOpsTest, TestTril) {
   const int size = 5;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({size, size}, options);
   // The sweep deliberately includes the extreme offsets -size and +size.
   for (int offset = -size; offset <= size; ++offset) {
     const torch::Tensor expected = torch::tril(input, offset);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_actual = torch::tril(lazy_input, offset);
       AllClose(expected, lazy_actual);
     });
   }
 }

 // Compares torch::tril on a 5x6 float matrix between the eager and lazy
 // devices for every diagonal offset from -5 to 5 inclusive.
 TEST_F(LazyOpsTest, TestTrilNonSquare) {
   const int size = 5;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({size, size + 1}, options);
   // The sweep deliberately includes the extreme offsets -size and +size.
   for (int offset = -size; offset <= size; ++offset) {
     const torch::Tensor expected = torch::tril(input, offset);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_actual = torch::tril(lazy_input, offset);
       AllClose(expected, lazy_actual);
     });
   }
 }

 // Compares torch::tril on a {3, 5, 5} float tensor between the eager and lazy
 // devices for every diagonal offset from -5 to 5 inclusive.
 TEST_F(LazyOpsTest, TestTrilBatch) {
   const int size = 5;
   const int batch_size = 3;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({batch_size, size, size}, options);
   // The sweep deliberately includes the extreme offsets -size and +size.
   for (int offset = -size; offset <= size; ++offset) {
     const torch::Tensor expected = torch::tril(input, offset);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_actual = torch::tril(lazy_input, offset);
       AllClose(expected, lazy_actual);
     });
   }
 }

 // For every diagonal offset from -5 to 5 inclusive, applies triu_ in place to
 // a fresh 5x5 float matrix and to its lazy copy, then compares the returned
 // tensors and the mutated inputs.
 TEST_F(LazyOpsTest, TestTriuInPlace) {
   const int size = 5;
   // The sweep deliberately includes the extreme offsets -size and +size.
   for (int offset = -size; offset <= size; ++offset) {
     ForEachDevice([&](const torch::Device& device) {
       const auto options =
           torch::TensorOptions(torch::kFloat).device(DefaultDevice());
       torch::Tensor eager = torch::rand({size, size}, options);
       torch::Tensor lazy = CopyToDevice(eager, device);
       torch::Tensor eager_result = eager.triu_(offset);
       torch::Tensor lazy_result = lazy.triu_(offset);
       AllClose(eager_result, lazy_result);
       AllClose(eager, lazy);
     });
   }
 }

 // For every diagonal offset from -5 to 5 inclusive, applies tril_ in place to
 // a fresh 5x5 float matrix and to its lazy copy, then compares the returned
 // tensors and the mutated inputs.
 TEST_F(LazyOpsTest, TestTrilInPlace) {
   const int size = 5;
   // The sweep deliberately includes the extreme offsets -size and +size.
   for (int offset = -size; offset <= size; ++offset) {
     ForEachDevice([&](const torch::Device& device) {
       const auto options =
           torch::TensorOptions(torch::kFloat).device(DefaultDevice());
       torch::Tensor eager = torch::rand({size, size}, options);
       torch::Tensor lazy = CopyToDevice(eager, device);
       torch::Tensor eager_result = eager.tril_(offset);
       torch::Tensor lazy_result = lazy.tril_(offset);
       AllClose(eager_result, lazy_result);
       AllClose(eager, lazy);
     });
   }
 }

 // Compares torch::trace of a 5x5 float matrix between the eager and lazy
 // devices.
 TEST_F(LazyOpsTest, TestTrace) {
   const int side = 5;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({side, side}, options);
   const torch::Tensor expected = torch::trace(input);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_input = CopyToDevice(input, device);
     const torch::Tensor lazy_actual = torch::trace(lazy_input);
     AllClose(expected, lazy_actual);
   });
 }

 // Compares torch::trace of a 3x5 float matrix (more columns than rows)
 // between the eager and lazy devices.
 TEST_F(LazyOpsTest, TestTraceWide) {
   const int row_count = 3;
   const int col_count = 5;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({row_count, col_count}, options);
   const torch::Tensor expected = torch::trace(input);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_input = CopyToDevice(input, device);
     const torch::Tensor lazy_actual = torch::trace(lazy_input);
     AllClose(expected, lazy_actual);
   });
 }

 // Compares torch::trace of a 5x3 float matrix (more rows than columns)
 // between the eager and lazy devices.
 TEST_F(LazyOpsTest, TestTraceNarrow) {
   const int row_count = 5;
   const int col_count = 3;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({row_count, col_count}, options);
   const torch::Tensor expected = torch::trace(input);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_input = CopyToDevice(input, device);
     const torch::Tensor lazy_actual = torch::trace(lazy_input);
     AllClose(expected, lazy_actual);
   });
 }

 // Compares torch::diag on a length-7 float vector between the eager and lazy
 // devices for every diagonal offset from -14 to 14 inclusive.
 TEST_F(LazyOpsTest, TestDiagRank1) {
   const int size = 7;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({size}, options);
   // Offsets range over twice the vector length in both directions.
   const int max_offset = 2 * size;
   for (int offset = -max_offset; offset <= max_offset; ++offset) {
     const torch::Tensor expected = torch::diag(input, offset);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_actual = torch::diag(lazy_input, offset);
       AllClose(expected, lazy_actual);
     });
   }
 }

 // Compares torch::diag on a 7x7 float matrix between the eager and lazy
 // devices for every diagonal offset from -7 to 7 inclusive.
 TEST_F(LazyOpsTest, TestDiagRank2) {
   const int size = 7;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({size, size}, options);
   // The sweep deliberately includes the extreme offsets -size and +size.
   for (int offset = -size; offset <= size; ++offset) {
     const torch::Tensor expected = torch::diag(input, offset);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_actual = torch::diag(lazy_input, offset);
       AllClose(expected, lazy_actual);
     });
   }
 }

 // Compares torch::diagflat on a {4, 3, 6, 7} float tensor between the eager
 // and lazy devices for every offset in [-10, 10).
 TEST_F(LazyOpsTest, TestDiagFlat) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 3, 6, 7}, options);
   for (int offset = -10; offset < 10; ++offset) {
     const torch::Tensor expected = torch::diagflat(input, offset);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_actual = torch::diagflat(lazy_input, offset);
       AllClose(expected, lazy_actual);
     });
   }
 }

 // Compares torch::diagonal on a 5x5 float matrix between the eager and lazy
 // devices for every diagonal offset from -5 to 5 inclusive.
 TEST_F(LazyOpsTest, TestDiagonal) {
   const int size = 5;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({size, size}, options);
   // The sweep deliberately includes the extreme offsets -size and +size.
   for (int offset = -size; offset <= size; ++offset) {
     const torch::Tensor expected = torch::diagonal(input, offset);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_actual = torch::diagonal(lazy_input, offset);
       AllClose(expected, lazy_actual);
     });
   }
 }

 // For every diagonal offset from -5 to 5 inclusive, takes torch::diagonal of
 // a 5x5 float matrix, adds 1 in place to that diagonal, and compares both the
 // diagonal and the source matrix between the eager and lazy sides. The lazy
 // side starts from a clone of the matrix taken before the eager update.
 TEST_F(LazyOpsTest, TestDiagonalUpdate) {
   const int size = 5;
   // The sweep deliberately includes the extreme offsets -size and +size.
   for (int offset = -size; offset <= size; ++offset) {
     const auto options =
         torch::TensorOptions(torch::kFloat).device(DefaultDevice());
     torch::Tensor eager_input = torch::rand({size, size}, options);
     torch::Tensor pristine_input = eager_input.clone();
     torch::Tensor eager_diagonal = torch::diagonal(eager_input, offset);
     eager_diagonal.add_(1);

     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_input = CopyToDevice(pristine_input, device);
       torch::Tensor lazy_diagonal = torch::diagonal(lazy_input, offset);
       lazy_diagonal.add_(1);

       AllClose(eager_diagonal, lazy_diagonal);
       AllClose(eager_input, lazy_input);
     });
   }
 }

 // Compares torch::diagonal on a 5x6 float matrix between the eager and lazy
 // devices for every diagonal offset from -5 to 5 inclusive.
 TEST_F(LazyOpsTest, TestDiagonalNonSquare) {
   const int size = 5;
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({size, size + 1}, options);
   // The sweep deliberately includes the extreme offsets -size and +size.
   for (int offset = -size; offset <= size; ++offset) {
     const torch::Tensor expected = torch::diagonal(input, offset);
     ForEachDevice([&](const torch::Device& device) {
       const torch::Tensor lazy_input = CopyToDevice(input, device);
       const torch::Tensor lazy_actual = torch::diagonal(lazy_input, offset);
       AllClose(expected, lazy_actual);
     });
   }
 }

 // Compares torch::diagonal taken over dims 1 and 2 of a {3, 5, 5} float
 // tensor between the eager and lazy devices for every diagonal offset from
 // -5 to 5 inclusive.
 TEST_F(LazyOpsTest, TestDiagonalBatch) {
   int size = 5;
   int batch_size = 3;
   int dim1 = 1;
   int dim2 = 2;
   torch::Tensor input = torch::rand(
       {batch_size, size, size},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   // Test all diagonals and out of bounds (must be no-op).
   for (int diagonal = -size; diagonal <= size; ++diagonal) {
     torch::Tensor output =
         torch::diagonal(input, diagonal, /*dim1=*/dim1, /*dim2=*/dim2);
     ForEachDevice([&](const torch::Device& device) {
       torch::Tensor lazy_input = CopyToDevice(input, device);
       torch::Tensor lazy_output =
           torch::diagonal(lazy_input, diagonal, /*dim1=*/dim1, /*dim2=*/dim2);
       AllClose(output, lazy_output);
     });
   }
 }

 // Compares torch::flatten on a {4, 7, 5, 3} tensor between the eager and lazy
 // devices for every (start_dim, end_dim) pair with start <= end, expressing
 // each bound both as a non-negative index and as the same index minus rank.
 TEST_F(LazyOpsTest, TestFlatten) {
   torch::Tensor input = torch::rand({4, 7, 5, 3});
   const int rank = input.dim();
   for (int first = 0; first < rank; ++first) {
     for (int last = first; last < rank; ++last) {
       for (const bool wrap_start : {false, true}) {
         for (const bool wrap_end : {false, true}) {
           int start_dim = first;
           if (wrap_start) {
             start_dim -= rank;
           }
           int end_dim = last;
           if (wrap_end) {
             end_dim -= rank;
           }
           const torch::Tensor expected =
               torch::flatten(input, start_dim, end_dim);
           ForEachDevice([&](const torch::Device& device) {
             const torch::Tensor lazy_input = CopyToDevice(input, device);
             const torch::Tensor lazy_actual =
                 torch::flatten(lazy_input, start_dim, end_dim);
             AllClose(expected, lazy_actual);
           });
         }
       }
     }
   }
 }

 // Compares torch::logical_and between the eager and lazy devices for every
 // pairing of the listed scalar types. Floating operands come from torch::rand
 // and integral ones from torch::randint, with a lower bound of 0 for the
 // left operand and 1 for the right operand.
 TEST_F(LazyOpsTest, TestLogicalAnd) {
   const torch::ScalarType operand_types[] = {
       torch::kFloat,
       torch::kByte,
       torch::kChar,
       torch::kShort,
       torch::kInt,
       torch::kLong};
   // Builds a random {3, 4} operand of the given type; `low` is only used for
   // integral types.
   const auto random_operand = [](torch::ScalarType type, int64_t low) {
     const torch::TensorOptions options(type);
     return isFloatingType(type)
         ? torch::rand({3, 4}, options)
         : torch::randint(low, 100, {3, 4}, options);
   };
   for (const torch::ScalarType lhs_type : operand_types) {
     torch::Tensor lhs = random_operand(lhs_type, 0);
     for (const torch::ScalarType rhs_type : operand_types) {
       torch::Tensor rhs = random_operand(rhs_type, 1);
       const torch::Tensor expected = torch::logical_and(lhs, rhs);
       ForEachDevice([&](const torch::Device& device) {
         const torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
         const torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
         const torch::Tensor lazy_actual =
             torch::logical_and(lazy_lhs, lazy_rhs);
         AllEqual(expected, lazy_actual);
       });
     }
   }

   ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
   ExpectCounterChanged("xla::logical_and_out", GetIgnoredCounters());
 }

 // Compares Tensor::__and__ of two random {4, 2} int tensors between the eager
 // and lazy devices.
 TEST_F(LazyOpsTest, TestBitwiseAnd) {
   const auto upper_bound = std::numeric_limits<int32_t>::max();
   torch::Tensor lhs = torch::randint(
       0, upper_bound, {4, 2}, torch::TensorOptions(torch::kInt));
   torch::Tensor rhs = torch::randint(
       0, upper_bound, {4, 2}, torch::TensorOptions(torch::kInt));
   const torch::Tensor expected = lhs.__and__(rhs);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     const torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     const torch::Tensor lazy_actual = lazy_lhs.__and__(lazy_rhs);
     AllEqual(expected, lazy_actual);
   });
 }

 // Applies Tensor::__iand__ in place to a random {4, 2} int tensor and to its
 // lazy copy, then compares the returned tensors and the mutated operands.
 TEST_F(LazyOpsTest, TestBitwiseAndInPlace) {
   const auto upper_bound = std::numeric_limits<int32_t>::max();
   torch::Tensor lhs = torch::randint(
       0, upper_bound, {4, 2}, torch::TensorOptions(torch::kInt));
   torch::Tensor rhs = torch::randint(
       0, upper_bound, {4, 2}, torch::TensorOptions(torch::kInt));
   ForEachDevice([&](const torch::Device& device) {
     // Copy lhs before the eager in-place update below changes it.
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor eager_result = lhs.__iand__(rhs);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor lazy_result = lazy_lhs.__iand__(lazy_rhs);
     AllEqual(eager_result, lazy_result);
     AllEqual(lhs, lazy_lhs);
   });
 }

 // Compares Tensor::__and__ of a random {4, 2} int tensor with the scalar
 // 123456789 between the eager and lazy devices.
 TEST_F(LazyOpsTest, TestBitwiseAndScalar) {
   const auto upper_bound = std::numeric_limits<int32_t>::max();
   torch::Tensor lhs = torch::randint(
       0, upper_bound, {4, 2}, torch::TensorOptions(torch::kInt));
   const torch::Scalar rhs(123456789);
   const torch::Tensor expected = lhs.__and__(rhs);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     const torch::Tensor lazy_actual = lazy_lhs.__and__(rhs);
     AllEqual(expected, lazy_actual);
   });
 }

 // Applies Tensor::__iand__ with the scalar 123456789 in place to a random
 // {4, 2} int tensor and to its lazy copy, then compares the returned tensors
 // and the mutated operands.
 TEST_F(LazyOpsTest, TestBitwiseAndScalarInPlace) {
   const auto upper_bound = std::numeric_limits<int32_t>::max();
   torch::Tensor lhs = torch::randint(
       0, upper_bound, {4, 2}, torch::TensorOptions(torch::kInt));
   const torch::Scalar rhs(123456789);
   ForEachDevice([&](const torch::Device& device) {
     // Copy lhs before the eager in-place update below changes it.
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor eager_result = lhs.__iand__(rhs);
     torch::Tensor lazy_result = lazy_lhs.__iand__(rhs);
     AllEqual(eager_result, lazy_result);
     AllEqual(lhs, lazy_lhs);
   });
 }

 // Reshapes a {4, 2} float tensor to one dimension, builds the gt(0) and ne(0)
 // comparison results, combines them with torch::__and__, and compares the
 // outcome between the eager and lazy devices.
 TEST_F(LazyOpsTest, TestBitwiseAndPromotion) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 2}, options);
   const torch::Tensor flat = input.reshape(-1);
   const torch::Tensor expected = torch::__and__(flat.gt(0), flat.ne(0));
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_input = CopyToDevice(input, device);
     const torch::Tensor lazy_flat = lazy_input.reshape(-1);
     const torch::Tensor lazy_actual =
         torch::__and__(lazy_flat.gt(0), lazy_flat.ne(0));
     AllEqual(expected, lazy_actual);
   });
 }

 // Compares Tensor::__or__ of two random {4, 2} int tensors between the eager
 // and lazy devices.
 TEST_F(LazyOpsTest, TestBitwiseOr) {
   const auto upper_bound = std::numeric_limits<int32_t>::max();
   torch::Tensor lhs = torch::randint(
       0, upper_bound, {4, 2}, torch::TensorOptions(torch::kInt));
   torch::Tensor rhs = torch::randint(
       0, upper_bound, {4, 2}, torch::TensorOptions(torch::kInt));
   const torch::Tensor expected = lhs.__or__(rhs);
   ForEachDevice([&](const torch::Device& device) {
     const torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     const torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     const torch::Tensor lazy_actual = lazy_lhs.__or__(lazy_rhs);
     AllEqual(expected, lazy_actual);
   });
 }

 // Applies Tensor::__ior__ in place to a random {4, 2} int tensor and to its
 // lazy copy, then compares the returned tensors and the mutated operands.
 TEST_F(LazyOpsTest, TestBitwiseOrInPlace) {
   const auto upper_bound = std::numeric_limits<int32_t>::max();
   torch::Tensor lhs = torch::randint(
       0, upper_bound, {4, 2}, torch::TensorOptions(torch::kInt));
   torch::Tensor rhs = torch::randint(
       0, upper_bound, {4, 2}, torch::TensorOptions(torch::kInt));
   ForEachDevice([&](const torch::Device& device) {
     // Copy lhs before the eager in-place update below changes it.
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor eager_result = lhs.__ior__(rhs);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor lazy_result = lazy_lhs.__ior__(lazy_rhs);
     AllEqual(eager_result, lazy_result);
     AllEqual(lhs, lazy_lhs);
   });
 }

 // Tensor | scalar on a random non-negative int tensor.
 TEST_F(LazyOpsTest, TestBitwiseOrScalar) {
   const auto int_opts = torch::TensorOptions(torch::kInt);
   torch::Tensor lhs = torch::randint(
       0, std::numeric_limits<int32_t>::max(), {4, 2}, int_opts);
   const torch::Scalar rhs(123456789);
   torch::Tensor expected = lhs.__or__(rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor actual = lazy_lhs.__or__(rhs);
     AllEqual(expected, actual);
   });
 }

 // In-place tensor |= scalar: checks the returned tensor and the updated
 // operand against the reference.
 TEST_F(LazyOpsTest, TestBitwiseOrScalarInPlace) {
   const auto int_opts = torch::TensorOptions(torch::kInt);
   torch::Tensor lhs = torch::randint(
       0, std::numeric_limits<int32_t>::max(), {4, 2}, int_opts);
   const torch::Scalar rhs(123456789);
   ForEachDevice([&](const torch::Device& device) {
     // lazy_lhs is taken before lhs is updated in place below.
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor expected = lhs.__ior__(rhs);
     torch::Tensor actual = lazy_lhs.__ior__(rhs);
     AllEqual(expected, actual);
     AllEqual(lhs, lazy_lhs);
   });
 }

 // Tensor ^ tensor on random non-negative int tensors: the lazy result must
 // equal the reference result.
 TEST_F(LazyOpsTest, TestBitwiseXor) {
   const auto int_opts = torch::TensorOptions(torch::kInt);
   const auto upper = std::numeric_limits<int32_t>::max();
   torch::Tensor lhs = torch::randint(0, upper, {4, 2}, int_opts);
   torch::Tensor rhs = torch::randint(0, upper, {4, 2}, int_opts);
   torch::Tensor expected = lhs.__xor__(rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual = lazy_lhs.__xor__(lazy_rhs);
     AllEqual(expected, actual);
   });
 }

 // In-place tensor ^= tensor: both the returned tensor and the updated operand
 // must match between the reference and the lazy device.
 TEST_F(LazyOpsTest, TestBitwiseXorInPlace) {
   const auto int_opts = torch::TensorOptions(torch::kInt);
   const auto upper = std::numeric_limits<int32_t>::max();
   torch::Tensor lhs = torch::randint(0, upper, {4, 2}, int_opts);
   torch::Tensor rhs = torch::randint(0, upper, {4, 2}, int_opts);
   ForEachDevice([&](const torch::Device& device) {
     // lazy_lhs is taken before lhs is updated in place below.
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor expected = lhs.__ixor__(rhs);
     torch::Tensor lazy_rhs = CopyToDevice(rhs, device);
     torch::Tensor actual = lazy_lhs.__ixor__(lazy_rhs);
     AllEqual(expected, actual);
     AllEqual(lhs, lazy_lhs);
   });
 }

 // Tensor ^ scalar on a random non-negative int tensor.
 TEST_F(LazyOpsTest, TestBitwiseXorScalar) {
   const auto int_opts = torch::TensorOptions(torch::kInt);
   torch::Tensor lhs = torch::randint(
       0, std::numeric_limits<int32_t>::max(), {4, 2}, int_opts);
   const torch::Scalar rhs(123456789);
   torch::Tensor expected = lhs.__xor__(rhs);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor actual = lazy_lhs.__xor__(rhs);
     AllEqual(expected, actual);
   });
 }

 // In-place tensor ^= scalar: checks the returned tensor and the updated
 // operand against the reference.
 TEST_F(LazyOpsTest, TestBitwiseXorScalarInPlace) {
   const auto int_opts = torch::TensorOptions(torch::kInt);
   torch::Tensor lhs = torch::randint(
       0, std::numeric_limits<int32_t>::max(), {4, 2}, int_opts);
   const torch::Scalar rhs(123456789);
   ForEachDevice([&](const torch::Device& device) {
     // lazy_lhs is taken before lhs is updated in place below.
     torch::Tensor lazy_lhs = CopyToDevice(lhs, device);
     torch::Tensor expected = lhs.__ixor__(rhs);
     torch::Tensor actual = lazy_lhs.__ixor__(rhs);
     AllEqual(expected, actual);
     AllEqual(lhs, lazy_lhs);
   });
 }

 // Tensor << tensor: shifts a tensor of ones by random per-element amounts
 // drawn with randint(16, ...).
 TEST_F(LazyOpsTest, TestLshift) {
   const auto int32_opts =
       torch::TensorOptions(torch::kInt32).device(DefaultDevice());
   torch::Tensor input = torch::ones({4, 2}, int32_opts);
   torch::Tensor shift_amount = torch::randint(16, input.sizes(), int32_opts);
   torch::Tensor expected = torch::__lshift__(input, shift_amount);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_shift_amount = CopyToDevice(shift_amount, device);
     torch::Tensor actual = torch::__lshift__(lazy_input, lazy_shift_amount);
     AllClose(expected, actual);
   });
 }

 // In-place tensor <<= tensor: checks the returned tensor and the updated
 // operand against the reference.
 TEST_F(LazyOpsTest, TestLshiftInPlace) {
   const auto int32_opts =
       torch::TensorOptions(torch::kInt32).device(DefaultDevice());
   torch::Tensor input = torch::ones({4, 2}, int32_opts);
   ForEachDevice([&](const torch::Device& device) {
     // lazy_input is taken before input is updated in place below.
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor shift_amount =
         torch::randint(16, input.sizes(), int32_opts);
     torch::Tensor expected = input.__ilshift__(shift_amount);
     torch::Tensor lazy_shift_amount = CopyToDevice(shift_amount, device);
     torch::Tensor actual = lazy_input.__ilshift__(lazy_shift_amount);
     AllClose(expected, actual);
     AllClose(input, lazy_input);
   });
 }

 // Tensor << scalar: shifts a tensor of ones left by 3.
 TEST_F(LazyOpsTest, TestLshiftScalar) {
   const auto int32_opts =
       torch::TensorOptions(torch::kInt32).device(DefaultDevice());
   torch::Tensor input = torch::ones({4, 2}, int32_opts);
   const torch::Scalar shift_amount(3);
   torch::Tensor expected = torch::__lshift__(input, shift_amount);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::__lshift__(lazy_input, shift_amount);
     AllClose(expected, actual);
   });
 }

 // In-place tensor <<= scalar: checks the returned tensor and the updated
 // operand against the reference.
 TEST_F(LazyOpsTest, TestLshiftScalarInPlace) {
   const auto int32_opts =
       torch::TensorOptions(torch::kInt32).device(DefaultDevice());
   torch::Tensor input = torch::ones({4, 2}, int32_opts);
   const torch::Scalar shift_amount(3);
   ForEachDevice([&](const torch::Device& device) {
     // lazy_input is taken before input is updated in place below.
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor expected = input.__ilshift__(shift_amount);
     torch::Tensor actual = lazy_input.__ilshift__(shift_amount);
     AllClose(expected, actual);
     AllClose(input, lazy_input);
   });
 }

 // Tensor >> tensor: shifts a tensor of ones by random per-element amounts
 // drawn with randint(16, ...).
 TEST_F(LazyOpsTest, TestRshift) {
   const auto int32_opts =
       torch::TensorOptions(torch::kInt32).device(DefaultDevice());
   torch::Tensor input = torch::ones({4, 2}, int32_opts);
   torch::Tensor shift_amount = torch::randint(16, input.sizes(), int32_opts);
   torch::Tensor expected = torch::__rshift__(input, shift_amount);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor lazy_shift_amount = CopyToDevice(shift_amount, device);
     torch::Tensor actual = torch::__rshift__(lazy_input, lazy_shift_amount);
     AllClose(expected, actual);
   });
 }

 // In-place tensor >>= tensor: checks the returned tensor and the updated
 // operand against the reference.
 TEST_F(LazyOpsTest, TestRshiftInPlace) {
   const auto int32_opts =
       torch::TensorOptions(torch::kInt32).device(DefaultDevice());
   torch::Tensor input = torch::ones({4, 2}, int32_opts);
   ForEachDevice([&](const torch::Device& device) {
     // lazy_input is taken before input is updated in place below.
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor shift_amount =
         torch::randint(16, input.sizes(), int32_opts);
     torch::Tensor expected = input.__irshift__(shift_amount);
     torch::Tensor lazy_shift_amount = CopyToDevice(shift_amount, device);
     torch::Tensor actual = lazy_input.__irshift__(lazy_shift_amount);
     AllClose(expected, actual);
     AllClose(input, lazy_input);
   });
 }

 // Tensor >> scalar: shifts a tensor of ones right by 3.
 TEST_F(LazyOpsTest, TestRshiftScalar) {
   const auto int32_opts =
       torch::TensorOptions(torch::kInt32).device(DefaultDevice());
   torch::Tensor input = torch::ones({4, 2}, int32_opts);
   const torch::Scalar shift_amount(3);
   torch::Tensor expected = torch::__rshift__(input, shift_amount);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::__rshift__(lazy_input, shift_amount);
     AllClose(expected, actual);
   });
 }

 // In-place tensor >>= scalar: checks the returned tensor and the updated
 // operand against the reference.
 TEST_F(LazyOpsTest, TestRshiftScalarInPlace) {
   const auto int32_opts =
       torch::TensorOptions(torch::kInt32).device(DefaultDevice());
   torch::Tensor input = torch::ones({4, 2}, int32_opts);
   const torch::Scalar shift_amount(3);
   ForEachDevice([&](const torch::Device& device) {
     // lazy_input is taken before input is updated in place below.
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor expected = input.__irshift__(shift_amount);
     torch::Tensor actual = lazy_input.__irshift__(shift_amount);
     AllClose(expected, actual);
     AllClose(input, lazy_input);
   });
 }

 // Compares torch::meshgrid over three 1-D float tensors (lengths 3, 2 and 4)
 // between the reference tensors and their lazy-device copies.
 TEST_F(LazyOpsTest, TestMeshgrid) {
   torch::Tensor a = torch::rand(
       {3}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor b = torch::rand(
       {2}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor c = torch::rand(
       {4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   auto d = torch::meshgrid({a, b, c});
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_a = CopyToDevice(a, device);
     torch::Tensor lazy_b = CopyToDevice(b, device);
     torch::Tensor lazy_c = CopyToDevice(c, device);
     auto lazy_d = torch::meshgrid({lazy_a, lazy_b, lazy_c});
     // Fatal assertion: on a size mismatch, leave this device's check so the
     // loop below never indexes past the end of lazy_d.
     ASSERT_EQ(d.size(), lazy_d.size());
     for (size_t i = 0; i < d.size(); ++i) {
       AllClose(d[i], lazy_d[i]);
     }
   });
 }

 // constant_pad_nd with a pad pair for every dimension of a rank-3 input.
 TEST_F(LazyOpsTest, TestConstantPad) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 2, 5}, float_opts);
   const std::vector<int64_t> pad{1, 2, 3, 4, 5, 6};
   const float pad_value = 5;
   torch::Tensor expected = torch::constant_pad_nd(input, pad, pad_value);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::constant_pad_nd(lazy_input, pad, pad_value);
     AllClose(expected, actual);
   });
 }

 // constant_pad_nd with a single pad pair on a rank-3 input, i.e. fewer pad
 // entries than the input has dimensions.
 TEST_F(LazyOpsTest, TestConstantPadIncomplete) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 2, 5}, float_opts);
   const std::vector<int64_t> pad{1, 2};
   const float pad_value = 5;
   torch::Tensor expected = torch::constant_pad_nd(input, pad, pad_value);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::constant_pad_nd(lazy_input, pad, pad_value);
     AllClose(expected, actual);
   });
 }

 // reflection_pad2d on a rank-3 input.
 TEST_F(LazyOpsTest, TestReflectionPad2dRank3) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 3, 4}, float_opts);
   const std::vector<int64_t> pad{2, 2, 2, 2};
   torch::Tensor expected = torch::reflection_pad2d(input, pad);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::reflection_pad2d(lazy_input, pad);
     AllClose(expected, actual);
   });
 }

 // reflection_pad2d on a rank-4 input.
 TEST_F(LazyOpsTest, TestReflectionPad2dRank4) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({2, 2, 3, 4}, float_opts);
   const std::vector<int64_t> pad{2, 2, 2, 2};
   torch::Tensor expected = torch::reflection_pad2d(input, pad);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::reflection_pad2d(lazy_input, pad);
     AllClose(expected, actual);
   });
 }

 // Runs TestBackward on reflection_pad2d with a rank-4 input that requires
 // grad.
 TEST_F(LazyOpsTest, TestReflectionPad2dBackward) {
   const std::vector<int64_t> pad{2, 3, 1, 2};
   auto pad_fn = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
     return torch::reflection_pad2d(inputs[0], pad);
   };
   ForEachDevice([&](const torch::Device& device) {
     const auto grad_opts = torch::TensorOptions(torch::kFloat)
                                .device(DefaultDevice())
                                .requires_grad(true);
     torch::Tensor input = torch::rand({1, 2, 4, 4}, grad_opts);
     TestBackward({input}, device, pad_fn);
   });
 }

 // replication_pad1d with non-zero padding on both sides.
 TEST_F(LazyOpsTest, TestReplicationPad1d) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({1, 4}, float_opts);
   const std::vector<int64_t> pad{1, 2};
   torch::Tensor expected = torch::replication_pad1d(input, pad);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::replication_pad1d(lazy_input, pad);
     AllClose(expected, actual);
   });
 }

 // replication_pad1d where one of the two pad amounts is zero.
 TEST_F(LazyOpsTest, TestReplicationPad1dZeroPad) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({1, 4}, float_opts);
   const std::vector<int64_t> pad{1, 0};
   torch::Tensor expected = torch::replication_pad1d(input, pad);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::replication_pad1d(lazy_input, pad);
     AllClose(expected, actual);
   });
 }

 // Runs TestBackward on replication_pad1d with a rank-2 input that requires
 // grad.
 TEST_F(LazyOpsTest, TestReplicationPad1dBackward) {
   const std::vector<int64_t> pad{2, 3};
   auto pad_fn = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
     return torch::replication_pad1d(inputs[0], pad);
   };
   ForEachDevice([&](const torch::Device& device) {
     const auto grad_opts = torch::TensorOptions(torch::kFloat)
                                .device(DefaultDevice())
                                .requires_grad(true);
     torch::Tensor input = torch::rand({2, 4}, grad_opts);
     TestBackward({input}, device, pad_fn);
   });
 }

 // replication_pad2d with non-zero padding on all four sides.
 TEST_F(LazyOpsTest, TestReplicationPad2d) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({1, 3, 4}, float_opts);
   const std::vector<int64_t> pad{1, 2, 2, 1};
   torch::Tensor expected = torch::replication_pad2d(input, pad);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::replication_pad2d(lazy_input, pad);
     AllClose(expected, actual);
   });
 }

 // replication_pad2d where two of the four pad amounts are zero.
 TEST_F(LazyOpsTest, TestReplicationPad2dZeroPad) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({1, 3, 4}, float_opts);
   const std::vector<int64_t> pad{1, 0, 0, 1};
   torch::Tensor expected = torch::replication_pad2d(input, pad);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::replication_pad2d(lazy_input, pad);
     AllClose(expected, actual);
   });
 }

 // Runs TestBackward on replication_pad2d with a rank-3 input that requires
 // grad.
 TEST_F(LazyOpsTest, TestReplicationPad2dBackward) {
   const std::vector<int64_t> pad{2, 3, 1, 1};
   auto pad_fn = [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
     return torch::replication_pad2d(inputs[0], pad);
   };
   ForEachDevice([&](const torch::Device& device) {
     const auto grad_opts = torch::TensorOptions(torch::kFloat)
                                .device(DefaultDevice())
                                .requires_grad(true);
     torch::Tensor input = torch::rand({2, 3, 4}, grad_opts);
     TestBackward({input}, device, pad_fn);
   });
 }

 // as_strided view of a {128, 320} tensor as {128, 20, 4, 4}.
 TEST_F(LazyOpsTest, TestAsStrided) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({128, 320}, float_opts);
   const std::vector<int64_t> size{128, 20, 4, 4};
   const std::vector<int64_t> stride{320, 16, 4, 1};
   torch::Tensor expected =
       torch::as_strided(input, /*size=*/size, /*stride=*/stride);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor actual =
         torch::as_strided(lazy_input, /*size=*/size, /*stride=*/stride);
     AllClose(expected, actual);
   });
 }

 // In-place as_strided_: checks both the returned tensor and the modified
 // input against the reference.
 TEST_F(LazyOpsTest, TestAsStridedInPlace) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({128, 320}, float_opts);
   const std::vector<int64_t> size{128, 20, 4, 4};
   const std::vector<int64_t> stride{320, 16, 4, 1};
   ForEachDevice([&](const torch::Device& device) {
     // lazy_input is taken before input is restrided in place below.
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor expected =
         torch::as_strided_(input, /*size=*/size, /*stride=*/stride);
     torch::Tensor actual =
         torch::as_strided_(lazy_input, /*size=*/size, /*stride=*/stride);
     AllClose(expected, actual);
     AllClose(input, lazy_input);
   });
 }

 // as_strided with an explicit storage offset of 4.
 TEST_F(LazyOpsTest, TestAsStridedWithOffset) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor input = torch::rand({4, 8, 2}, float_opts);
   const std::vector<int64_t> size{4, 4, 2};
   const std::vector<int64_t> stride{8, 2, 1};
   const int64_t storage_offset = 4;
   torch::Tensor expected = torch::as_strided(
       input,
       /*size=*/size,
       /*stride=*/stride,
       /*storage_offset=*/storage_offset);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_input = CopyToDevice(input, device);
     torch::Tensor actual = torch::as_strided(
         lazy_input,
         /*size=*/size,
         /*stride=*/stride,
         /*storage_offset=*/storage_offset);
     AllClose(expected, actual);
   });
 }

 // Writes a tensor of ones into a zero tensor through an as_strided view via
 // copy_ and compares the written-to tensors.
 TEST_F(LazyOpsTest, TestAsStridedWithInplaceCopy) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor grad = torch::ones({4}, float_opts);
   const std::vector<int64_t> size{4};
   const std::vector<int64_t> stride{1};
   torch::Tensor expected = torch::zeros({4}, grad.options());
   expected.as_strided(size, stride).copy_(grad);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_grad = CopyToDevice(grad, device);
     torch::Tensor actual = torch::zeros({4}, lazy_grad.options());
     actual.as_strided(size, stride).copy_(lazy_grad);
     AllClose(expected, actual);
   });
 }

 // Checks that empty_strided reports the same sizes and strides on each tested
 // device as on the reference tensor created with default options. Contents
 // are uninitialized, so only the metadata is compared.
 TEST_F(LazyOpsTest, TestEmptyStrided) {
   std::vector<int64_t> size = {4, 4, 2};
   std::vector<int64_t> stride = {8, 2, 1};
   torch::Tensor output = torch::empty_strided(/*size=*/size, /*stride=*/stride);
   ForEachDevice([&](const torch::Device& device) {
     // Create the tensor on the device under test; without an explicit device
     // the comparison would be between two default-device tensors.
     torch::Tensor lazy_output = torch::empty_strided(
         /*size=*/size,
         /*stride=*/stride,
         torch::TensorOptions().device(device));
     EXPECT_EQ(output.sizes(), lazy_output.sizes());
     EXPECT_EQ(output.strides(), lazy_output.strides());
   });
 }

 // Runs TestBackward on avg_pool2d with a batched {1, 1, 7, 7} input across
 // stride, padding, count_include_pad and ceil_mode combinations.
 TEST_F(LazyOpsTest, TestAvgPool2DBackward) {
   const int kernel_size = 2;
   for (int stride = 1; stride <= 2; ++stride) {
     for (int padding = 0; padding <= 1; ++padding) {
       for (bool count_include_pad : {true, false}) {
         // Test ceil_mode=true through the CPU interop.
         for (bool ceil_mode : {false, true}) {
           auto pool_fn =
               [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
             return torch::avg_pool2d(
                 inputs[0],
                 /*kernel_size=*/{kernel_size, kernel_size},
                 /*stride=*/{stride, stride},
                 /*padding=*/{padding, padding},
                 /*ceil_mode=*/ceil_mode,
                 /*count_include_pad=*/count_include_pad);
           };

           ForEachDevice([&](const torch::Device& device) {
             const auto grad_opts = torch::TensorOptions(torch::kFloat)
                                        .device(DefaultDevice())
                                        .requires_grad(true);
             torch::Tensor input = torch::rand({1, 1, 7, 7}, grad_opts);
             TestBackward({input}, device, pool_fn);
           });
         }
       }
     }
   }
 }

 // Runs TestBackward on avg_pool3d with a batched {1, 1, 7, 7, 7} input across
 // stride, padding, count_include_pad and ceil_mode combinations.
 TEST_F(LazyOpsTest, TestAvgPool3DBackward) {
   const int kernel_size = 2;
   for (int stride = 1; stride <= 2; ++stride) {
     for (int padding = 0; padding <= 1; ++padding) {
       for (bool count_include_pad : {true, false}) {
         // Test ceil_mode=true through the CPU interop.
         for (bool ceil_mode : {false, true}) {
           auto pool_fn =
               [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
             return torch::avg_pool3d(
                 inputs[0],
                 /*kernel_size=*/{kernel_size, kernel_size, kernel_size},
                 /*stride=*/{stride, stride, stride},
                 /*padding=*/{padding, padding, padding},
                 /*ceil_mode=*/ceil_mode,
                 /*count_include_pad=*/count_include_pad);
           };

           ForEachDevice([&](const torch::Device& device) {
             const auto grad_opts = torch::TensorOptions(torch::kFloat)
                                        .device(DefaultDevice())
                                        .requires_grad(true);
             torch::Tensor input = torch::rand({1, 1, 7, 7, 7}, grad_opts);
             TestBackward({input}, device, pool_fn);
           });
         }
       }
     }
   }
 }

 // Runs TestBackward on avg_pool2d with an unbatched {1, 7, 7} input across
 // stride, padding, count_include_pad and ceil_mode combinations.
 TEST_F(LazyOpsTest, TestAvgPool2DNoBatchBackward) {
   const int kernel_size = 2;
   for (int stride = 1; stride <= 2; ++stride) {
     for (int padding = 0; padding <= 1; ++padding) {
       for (bool count_include_pad : {true, false}) {
         // Test ceil_mode=true through the CPU interop.
         for (bool ceil_mode : {false, true}) {
           auto pool_fn =
               [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
             return torch::avg_pool2d(
                 inputs[0],
                 /*kernel_size=*/{kernel_size, kernel_size},
                 /*stride=*/{stride, stride},
                 /*padding=*/{padding, padding},
                 /*ceil_mode=*/ceil_mode,
                 /*count_include_pad=*/count_include_pad);
           };

           ForEachDevice([&](const torch::Device& device) {
             const auto grad_opts = torch::TensorOptions(torch::kFloat)
                                        .device(DefaultDevice())
                                        .requires_grad(true);
             torch::Tensor input = torch::rand({1, 7, 7}, grad_opts);
             TestBackward({input}, device, pool_fn);
           });
         }
       }
     }
   }
 }

 // Runs TestBackward on avg_pool3d with an unbatched {1, 7, 7, 7} input across
 // stride, padding, count_include_pad and ceil_mode combinations.
 TEST_F(LazyOpsTest, TestAvgPool3DNoBatchBackward) {
   const int kernel_size = 2;
   for (int stride = 1; stride <= 2; ++stride) {
     for (int padding = 0; padding <= 1; ++padding) {
       for (bool count_include_pad : {true, false}) {
         // Test ceil_mode=true through the CPU interop.
         for (bool ceil_mode : {false, true}) {
           auto pool_fn =
               [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
             return torch::avg_pool3d(
                 inputs[0],
                 /*kernel_size=*/{kernel_size, kernel_size, kernel_size},
                 /*stride=*/{stride, stride, stride},
                 /*padding=*/{padding, padding, padding},
                 /*ceil_mode=*/ceil_mode,
                 /*count_include_pad=*/count_include_pad);
           };

           ForEachDevice([&](const torch::Device& device) {
             const auto grad_opts = torch::TensorOptions(torch::kFloat)
                                        .device(DefaultDevice())
                                        .requires_grad(true);
             torch::Tensor input = torch::rand({1, 7, 7, 7}, grad_opts);
             TestBackward({input}, device, pool_fn);
           });
         }
       }
     }
   }
 }

 // Runs TestBackward on adaptive_avg_pool3d with an unbatched {1, 56, 28, 28}
 // input for output sizes 7 and 4. Skipped when IsCuda() is true.
 TEST_F(LazyOpsTest, TestAdaptiveAvgPool3DNoBatchBackward) {
   if (IsCuda()) {
     GTEST_SKIP();
   }
   for (int64_t output_size : {7, 4}) {
     auto pool_fn =
         [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
       return torch::adaptive_avg_pool3d(
           inputs[0], {output_size, output_size, output_size});
     };
     ForEachDevice([&](const torch::Device& device) {
       const auto grad_opts = torch::TensorOptions(torch::kFloat)
                                  .device(DefaultDevice())
                                  .requires_grad(true);
       torch::Tensor input = torch::rand({1, 56, 28, 28}, grad_opts);
       TestBackward({input}, device, pool_fn);
     });
   }
 }

 // Runs TestBackward on adaptive_avg_pool3d with a batched {4, 1, 56, 28, 28}
 // input for output sizes 7 and 4. Skipped when IsCuda() is true.
 TEST_F(LazyOpsTest, TestAdaptiveAvgPool3DBackward) {
   if (IsCuda()) {
     GTEST_SKIP();
   }
   for (int64_t output_size : {7, 4}) {
     auto pool_fn =
         [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
       return torch::adaptive_avg_pool3d(
           inputs[0], {output_size, output_size, output_size});
     };
     ForEachDevice([&](const torch::Device& device) {
       const auto grad_opts = torch::TensorOptions(torch::kFloat)
                                  .device(DefaultDevice())
                                  .requires_grad(true);
       torch::Tensor input = torch::rand({4, 1, 56, 28, 28}, grad_opts);
       TestBackward({input}, device, pool_fn);
     });
   }
 }

 // Runs TestBackward on adaptive_avg_pool2d with a batched {4, 1, 56, 56}
 // input for output sizes 7 and 8.
 TEST_F(LazyOpsTest, TestAdaptiveAvgPool2DBackward) {
   for (int64_t output_size : {7, 8}) {
     auto pool_fn =
         [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
       return torch::adaptive_avg_pool2d(inputs[0], {output_size, output_size});
     };
     ForEachDevice([&](const torch::Device& device) {
       const auto grad_opts = torch::TensorOptions(torch::kFloat)
                                  .device(DefaultDevice())
                                  .requires_grad(true);
       torch::Tensor input = torch::rand({4, 1, 56, 56}, grad_opts);
       TestBackward({input}, device, pool_fn);
     });
   }
 }

 // Runs TestBackward on adaptive_avg_pool2d with an unbatched {1, 56, 56}
 // input for output sizes 7 and 8.
 TEST_F(LazyOpsTest, TestAdaptiveAvgPool2DNoBatchBackward) {
   for (int64_t output_size : {7, 8}) {
     auto testfn =
         [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
       return torch::adaptive_avg_pool2d(inputs[0], {output_size, output_size});
     };
     ForEachDevice([&](const torch::Device& device) {
       // Place the reference input on DefaultDevice() like the other backward
       // tests in this file, instead of relying on the implicit default.
       TestBackward(
           {torch::rand(
               {1, 56, 56},
               torch::TensorOptions(torch::kFloat)
                   .device(DefaultDevice())
                   .requires_grad(true))},
           device,
           testfn);
     });
   }
 }

 // Forward conv2d on double tensors across stride, padding, bias, dilation and
 // groups combinations; the lazy output must be close to the reference.
 TEST_F(LazyOpsTest, TestConv2D) {
   const int in_channels = 4;
   const int out_channels = 4;
   const int kernel_size = 3;
   for (int stride = 1; stride <= 3; ++stride) {
     for (int padding = 0; padding <= 2; ++padding) {
       for (bool with_bias : {true, false}) {
         for (int dilation = 1; dilation <= 3; ++dilation) {
           // covers normal, grouped, depthwise conv.
           for (int groups : {1, 2, 4}) {
             // Same convolution configuration for reference and lazy inputs.
             auto run_conv = [&](const torch::Tensor& x,
                                 const torch::Tensor& w,
                                 const torch::Tensor& b) -> torch::Tensor {
               return torch::conv2d(
                   x,
                   w,
                   b,
                   /*stride=*/{stride, stride},
                   /*padding=*/{padding, padding},
                   /*dilation=*/{dilation, dilation},
                   groups);
             };
             ForEachDevice([&](const torch::Device& device) {
               const auto double_opts =
                   torch::TensorOptions(torch::kDouble).device(DefaultDevice());
               torch::Tensor input =
                   torch::rand({1, in_channels, 7, 7}, double_opts);
               torch::Tensor weight = torch::rand(
                   {out_channels,
                    in_channels / groups,
                    kernel_size,
                    kernel_size},
                   double_opts);
               // An undefined tensor stands in for "no bias".
               torch::Tensor bias;
               if (with_bias) {
                 bias = torch::rand({out_channels}, double_opts);
               }

               torch::Tensor lazy_input = CopyToDevice(input, device);
               torch::Tensor lazy_weight = CopyToDevice(weight, device);
               torch::Tensor lazy_bias;
               if (with_bias) {
                 lazy_bias = CopyToDevice(bias, device);
               }

               torch::Tensor expected = run_conv(input, weight, bias);
               torch::Tensor actual =
                   run_conv(lazy_input, lazy_weight, lazy_bias);
               AllClose(expected, actual);
             });
           }
         }
       }
     }
   }
 }

 // Runs TestBackward on conv2d with double inputs across stride, padding,
 // bias, dilation and groups combinations. Input and weight require grad; the
 // bias is created without requires_grad.
 TEST_F(LazyOpsTest, TestConv2DBackward) {
   const int in_channels = 4;
   const int out_channels = 4;
   const int kernel_size = 3;
   for (int stride = 1; stride <= 3; ++stride) {
     for (int padding = 0; padding <= 2; ++padding) {
       for (bool with_bias : {true, false}) {
         for (int dilation = 1; dilation <= 3; ++dilation) {
           // covers normal, grouped, depthwise conv.
           for (int groups : {1, 2, 4}) {
             auto conv_fn =
                 [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
               return torch::conv2d(
                   inputs[0],
                   inputs[1],
                   inputs[2],
                   /*stride=*/{stride, stride},
                   /*padding=*/{padding, padding},
                   /*dilation=*/{dilation, dilation},
                   groups);
             };

             ForEachDevice([&](const torch::Device& device) {
               const auto double_opts =
                   torch::TensorOptions(torch::kDouble).device(DefaultDevice());
               // An undefined tensor stands in for "no bias".
               torch::Tensor bias;
               if (with_bias) {
                 bias = torch::rand({out_channels}, double_opts);
               }
               torch::Tensor input = torch::rand(
                   {1, in_channels, 7, 7}, double_opts.requires_grad(true));
               torch::Tensor weight = torch::rand(
                   {out_channels,
                    in_channels / groups,
                    kernel_size,
                    kernel_size},
                   double_opts.requires_grad(true));
               TestBackward({input, weight, bias}, device, conv_fn);
             });
           }
         }
       }
     }
   }
 }

 // Runs TestBackward on conv_transpose2d with float inputs across stride,
 // padding, dilation, output_padding, bias and groups combinations, using
 // rtol/atol of 1e-5. The second spatial dimension uses stride + 1,
 // padding + 1 and dilation + 1.
 TEST_F(LazyOpsTest, TestTransposedConv2DBackward) {
   const int in_channels = 4;
   const int out_channels = 4;
   const int kernel_size = 3;
   for (int stride = 1; stride <= 2; ++stride) {
     for (int padding = 0; padding <= 1; ++padding) {
       for (int dilation = 1; dilation <= 2; ++dilation) {
         for (int output_padding = 0;
              output_padding < std::max(stride, dilation);
              ++output_padding) {
           for (bool with_bias : {true, false}) {
             // covers normal, grouped, depthwise conv.
             for (int groups : {1, 2, 4}) {
               auto conv_fn = [&](const std::vector<torch::Tensor>& inputs)
                   -> torch::Tensor {
                 return torch::conv_transpose2d(
                     inputs[0],
                     inputs[1],
                     inputs[2],
                     /*stride=*/{stride, stride + 1},
                     /*padding=*/{padding, padding + 1},
                     /*output_padding=*/output_padding,
                     /*groups=*/groups,
                     /*dilation=*/{dilation, dilation + 1});
               };
               ForEachDevice([&](const torch::Device& device) {
                 const auto grad_opts = torch::TensorOptions(torch::kFloat)
                                            .device(DefaultDevice())
                                            .requires_grad(true);
                 torch::Tensor input =
                     torch::rand({4, out_channels, 7, 7}, grad_opts);
                 torch::Tensor weight = torch::rand(
                     {out_channels,
                      in_channels / groups,
                      kernel_size,
                      kernel_size},
                     grad_opts);
                 // An undefined tensor stands in for "no bias".
                 torch::Tensor bias;
                 if (with_bias) {
                   bias = torch::rand({in_channels}, grad_opts);
                 }
                 TestBackward(
                     {input, weight, bias},
                     device,
                     conv_fn,
                     /*rtol=*/1e-5,
                     /*atol=*/1e-5);
               });
             }
           }
         }
       }
     }
   }
 }

 // Exercises the backward pass of torch::conv3d through TestBackward on each
 // device, sweeping stride, padding, optional bias, dilation and group count.
 TEST_F(LazyOpsTest, TestConv3DBackward) {
   int in_channels = 4;
   int out_channels = 4;
   int kernel_size = 3;
   for (int stride : {1, 2, 3}) {
     for (int padding : {1, 2}) {
       for (bool with_bias : {true, false}) {
         for (int dilation : {1, 2}) {
           // covers normal, grouped, depthwise conv.
           for (int groups : {1, 2, 4}) {
             auto conv_fn = [&](const std::vector<torch::Tensor>& args)
                 -> torch::Tensor {
               return torch::conv3d(
                   args[0],
                   args[1],
                   args[2],
                   /*stride=*/{stride, stride, stride},
                   /*padding=*/{padding, padding, padding},
                   /*dilation=*/{dilation, dilation, dilation},
                   groups);
             };

             ForEachDevice([&](const torch::Device& dev) {
               const auto options =
                   torch::TensorOptions(torch::kDouble).device(DefaultDevice());
               // Random tensors are drawn in the order bias, input, weight.
               // The bias stays undefined when it is not under test and does
               // not request a gradient.
               torch::Tensor bias;
               if (with_bias) {
                 bias = torch::rand({out_channels}, options);
               }
               torch::Tensor input = torch::rand(
                   {4, in_channels, 7, 7, 7}, options.requires_grad(true));
               torch::Tensor weight = torch::rand(
                   {out_channels,
                    in_channels / groups,
                    kernel_size,
                    kernel_size,
                    kernel_size},
                   options.requires_grad(true));
               TestBackward({input, weight, bias}, dev, conv_fn);
             });
           }
         }
       }
     }
   }
 }

 // Exercises the backward pass of torch::conv_transpose3d through TestBackward
 // on each device, sweeping stride, padding, dilation, output padding,
 // optional bias and group count. Stride, padding and dilation are made
 // asymmetric by bumping the middle spatial dimension by one.
 TEST_F(LazyOpsTest, TestTransposedConv3DBackward) {
   int in_channels = 4;
   int out_channels = 4;
   int kernel_size = 3;
   for (int stride = 1; stride <= 2; ++stride) {
     for (int padding = 0; padding <= 1; ++padding) {
       for (int dilation = 1; dilation <= 2; ++dilation) {
         // The loop bound keeps output_padding below max(stride, dilation).
         for (int output_padding = 0;
              output_padding < std::max(stride, dilation);
              ++output_padding) {
           for (bool with_bias : {true, false}) {
             for (int groups :
                  {1, 2, 4}) { // covers normal, grouped, depthwise conv.
               auto testfn = [&](const std::vector<torch::Tensor>& inputs)
                   -> torch::Tensor {
                 return torch::conv_transpose3d(
                     inputs[0],
                     inputs[1],
                     inputs[2],
                     /*stride=*/{stride, stride + 1, stride},
                     // The third component previously passed `stride`; it now
                     // follows the same {x, x + 1, x} pattern as stride and
                     // dilation so the padding sweep covers all three dims.
                     /*padding=*/{padding, padding + 1, padding},
                     /*output_padding=*/output_padding,
                     /*groups=*/groups,
                     /*dilation=*/{dilation, dilation + 1, dilation});
               };
               ForEachDevice([&](const torch::Device& device) {
                 // The leading weight dimension matches the input's channel
                 // dimension (both use out_channels); the bias has
                 // in_channels elements.
                 torch::Tensor input = torch::rand(
                     {4, out_channels, 7, 7, 7},
                     torch::TensorOptions(torch::kDouble)
                         .device(DefaultDevice())
                         .requires_grad(true));
                 torch::Tensor weight = torch::rand(
                     {out_channels,
                      in_channels / groups,
                      kernel_size,
                      kernel_size,
                      kernel_size},
                     torch::TensorOptions(torch::kDouble)
                         .device(DefaultDevice())
                         .requires_grad(true));
                 // An undefined tensor stands in for "no bias".
                 torch::Tensor bias = with_bias
                     ? torch::rand(
                           {in_channels},
                           torch::TensorOptions(torch::kDouble)
                               .device(DefaultDevice())
                               .requires_grad(true))
                     : torch::Tensor();
                 TestBackward({input, weight, bias}, device, testfn);
               });
             }
           }
         }
       }
     }
   }
 }

 // Exercises the backward pass of torch::max_pool2d on a 4-D input through
 // TestBackward, sweeping stride, padding and ceil_mode.
 TEST_F(LazyOpsTest, TestMaxPool2DBackward) {
   int kernel_size = 3;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         auto pool_fn = [&](const std::vector<torch::Tensor>& args)
             -> torch::Tensor {
           return torch::max_pool2d(
               args[0],
               /*kernel_size=*/{kernel_size, kernel_size},
               /*stride=*/{stride, stride},
               /*padding=*/{padding, padding},
               /*dilation=*/{1, 1},
               /*ceil_mode=*/ceil_mode);
         };

         ForEachDevice([&](const torch::Device& dev) {
           const auto options = torch::TensorOptions(torch::kFloat)
                                    .device(DefaultDevice())
                                    .requires_grad(true);
           torch::Tensor input = torch::rand({1, 2, 8, 8}, options);
           TestBackward({input}, dev, pool_fn);
         });
       }
     }
   }
 }

 // Exercises the backward pass of torch::max_pool3d on a 5-D input through
 // TestBackward, sweeping stride, padding and ceil_mode.
 TEST_F(LazyOpsTest, TestMaxPool3DBackward) {
   int kernel_size = 3;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         auto pool_fn = [&](const std::vector<torch::Tensor>& args)
             -> torch::Tensor {
           return torch::max_pool3d(
               args[0],
               /*kernel_size=*/{kernel_size, kernel_size, kernel_size},
               /*stride=*/{stride, stride, stride},
               /*padding=*/{padding, padding, padding},
               /*dilation=*/{1, 1, 1},
               /*ceil_mode=*/ceil_mode);
         };

         ForEachDevice([&](const torch::Device& dev) {
           const auto options = torch::TensorOptions(torch::kFloat)
                                    .device(DefaultDevice())
                                    .requires_grad(true);
           torch::Tensor input = torch::rand({1, 2, 4, 4, 4}, options);
           TestBackward({input}, dev, pool_fn);
         });
       }
     }
   }
 }

 // Same sweep as the batched max_pool2d backward test, but the input is 3-D
 // ({2, 8, 8}), i.e. it has no leading batch dimension.
 TEST_F(LazyOpsTest, TestMaxPool2DNoBatchBackward) {
   int kernel_size = 3;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         auto pool_fn = [&](const std::vector<torch::Tensor>& args)
             -> torch::Tensor {
           return torch::max_pool2d(
               args[0],
               /*kernel_size=*/{kernel_size, kernel_size},
               /*stride=*/{stride, stride},
               /*padding=*/{padding, padding},
               /*dilation=*/{1, 1},
               /*ceil_mode=*/ceil_mode);
         };

         ForEachDevice([&](const torch::Device& dev) {
           const auto options = torch::TensorOptions(torch::kFloat)
                                    .device(DefaultDevice())
                                    .requires_grad(true);
           torch::Tensor input = torch::rand({2, 8, 8}, options);
           TestBackward({input}, dev, pool_fn);
         });
       }
     }
   }
 }

 // Same sweep as the batched max_pool3d backward test, but the input is 4-D
 // ({2, 4, 4, 4}), i.e. it has no leading batch dimension.
 TEST_F(LazyOpsTest, TestMaxPool3DNoBatchBackward) {
   int kernel_size = 3;
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         auto pool_fn = [&](const std::vector<torch::Tensor>& args)
             -> torch::Tensor {
           return torch::max_pool3d(
               args[0],
               /*kernel_size=*/{kernel_size, kernel_size, kernel_size},
               /*stride=*/{stride, stride, stride},
               /*padding=*/{padding, padding, padding},
               /*dilation=*/{1, 1, 1},
               /*ceil_mode=*/ceil_mode);
         };

         ForEachDevice([&](const torch::Device& dev) {
           const auto options = torch::TensorOptions(torch::kFloat)
                                    .device(DefaultDevice())
                                    .requires_grad(true);
           torch::Tensor input = torch::rand({2, 4, 4, 4}, options);
           TestBackward({input}, dev, pool_fn);
         });
       }
     }
   }
 }

 // Exercises the backward pass of torch::max_unpool2d through TestBackward.
 // The values and indices fed to the unpool come from
 // max_pool2d_with_indices applied to one fixed random input.
 TEST_F(LazyOpsTest, TestMaxUnpool2DBackward) {
   int kernel_size = 2;
   torch::Tensor input = torch::rand(
       {2, 2, 8, 8},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   // Unpool back to the spatial extent (dims 2 and 3) of the pooling input;
   // this does not depend on any loop variable.
   const std::vector<int64_t> output_size{input.size(2), input.size(3)};
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         for (int dilation : {1, 2}) {
           auto pooled = torch::max_pool2d_with_indices(
               input,
               /*kernel_size=*/{kernel_size, kernel_size},
               /*stride=*/{stride, stride},
               /*padding=*/{padding, padding},
               /*dilation=*/{dilation, dilation},
               /*ceil_mode=*/ceil_mode);
           torch::Tensor output = std::get<0>(pooled);
           torch::Tensor indices = std::get<1>(pooled);

           auto unpool_fn = [&](const std::vector<torch::Tensor>& args)
               -> torch::Tensor {
             return torch::max_unpool2d(args[0], args[1], output_size);
           };

           ForEachDevice([&](const torch::Device& dev) {
             // Only the pooled values request a gradient.
             TestBackward(
                 {output.requires_grad_(true), indices}, dev, unpool_fn);
           });
         }
       }
     }
   }
 }

 // Exercises the backward pass of torch::max_unpool3d through TestBackward.
 // The values and indices fed to the unpool come from
 // max_pool3d_with_indices applied to one fixed random input.
 TEST_F(LazyOpsTest, TestMaxUnpool3DBackward) {
   int kernel_size = 2;
   torch::Tensor input = torch::rand(
       {1, 1, 4, 4, 4},
       torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   // Unpool back to the spatial extent (dims 2, 3 and 4) of the pooling
   // input; this does not depend on any loop variable.
   const std::vector<int64_t> output_size{
       input.size(2), input.size(3), input.size(4)};
   for (int stride : {1, 2}) {
     for (int padding : {0, 1}) {
       // Test ceil_mode=true through the CPU interop.
       for (bool ceil_mode : {false, true}) {
         for (int dilation : {1, 2}) {
           auto pooled = torch::max_pool3d_with_indices(
               input,
               /*kernel_size=*/{kernel_size, kernel_size, kernel_size},
               /*stride=*/{stride, stride, stride},
               /*padding=*/{padding, padding, padding},
               /*dilation=*/{dilation, dilation, dilation},
               /*ceil_mode=*/ceil_mode);
           torch::Tensor output = std::get<0>(pooled);
           torch::Tensor indices = std::get<1>(pooled);

           auto unpool_fn = [&](const std::vector<torch::Tensor>& args)
               -> torch::Tensor {
             return torch::max_unpool3d(
                 args[0],
                 args[1],
                 output_size,
                 /*stride=*/{stride, stride, stride},
                 /*padding=*/{padding, padding, padding});
           };

           ForEachDevice([&](const torch::Device& dev) {
             // Only the pooled values request a gradient.
             TestBackward(
                 {output.requires_grad_(true), indices}, dev, unpool_fn);
           });
         }
       }
     }
   }
 }

 // Exercises the backward pass of torch::tanh through TestBackward on a
 // random 2x2 float input.
 TEST_F(LazyOpsTest, TestTanhBackward) {
   auto tanh_fn = [](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::tanh(args[0]);
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::rand({2, 2}, options);
     TestBackward({input}, dev, tanh_fn, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // Exercises the backward pass of torch::sigmoid through TestBackward on a
 // random 2x2 float input.
 TEST_F(LazyOpsTest, TestSigmoidBackward) {
   auto sigmoid_fn =
       [](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::sigmoid(args[0]);
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::rand({2, 2}, options);
     TestBackward({input}, dev, sigmoid_fn);
   });
 }

 // Exercises the backward pass of torch::log_sigmoid through TestBackward on
 // a random 2x2 float input.
 TEST_F(LazyOpsTest, TestLogSigmoidBackward) {
   auto log_sigmoid_fn =
       [](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::log_sigmoid(args[0]);
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::rand({2, 2}, options);
     TestBackward({input}, dev, log_sigmoid_fn, /*rtol=*/1e-3, /*atol=*/1e-5);
   });
 }

 // Exercises the backward pass of torch::log_softmax through TestBackward for
 // every dim in [-4, 3] on a 4-D input.
 TEST_F(LazyOpsTest, TestLogSoftmaxBackward) {
   for (int axis = -4; axis < 4; ++axis) {
     auto log_softmax_fn =
         [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
       return torch::log_softmax(args[0], axis);
     };

     ForEachDevice([&](const torch::Device& dev) {
       const auto options = torch::TensorOptions(torch::kFloat)
                                .device(DefaultDevice())
                                .requires_grad(true);
       torch::Tensor input = torch::rand({5, 3, 4, 2}, options);
       TestBackward(
           {input}, dev, log_softmax_fn, /*rtol=*/1e-3, /*atol=*/1e-4);
     });
   }
 }

 // Exercises the backward pass of torch::softmax through TestBackward for
 // every dim in [-4, 3] on a 4-D input.
 TEST_F(LazyOpsTest, TestSoftmaxBackward) {
   for (int axis = -4; axis < 4; ++axis) {
     auto softmax_fn =
         [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
       return torch::softmax(args[0], axis);
     };

     ForEachDevice([&](const torch::Device& dev) {
       const auto options = torch::TensorOptions(torch::kFloat)
                                .device(DefaultDevice())
                                .requires_grad(true);
       torch::Tensor input = torch::rand({5, 3, 4, 2}, options);
       TestBackward({input}, dev, softmax_fn, /*rtol=*/1e-3, /*atol=*/1e-4);
     });
   }
 }

 // Exercises the backward pass of torch::softplus through TestBackward on a
 // random {2, 1, 4, 6} float input; only rtol is overridden.
 TEST_F(LazyOpsTest, TestSoftplusBackward) {
   auto softplus_fn =
       [](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::softplus(args[0]);
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::rand({2, 1, 4, 6}, options);
     TestBackward({input}, dev, softplus_fn, /*rtol=*/1e-4);
   });
 }

 // Exercises the backward pass of torch::relu through TestBackward on a
 // random {2, 1, 4, 6} float input.
 TEST_F(LazyOpsTest, TestReluBackward) {
   auto relu_fn = [](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::relu(args[0]);
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::rand({2, 1, 4, 6}, options);
     TestBackward({input}, dev, relu_fn);
   });
 }

 // Exercises the backward pass of torch::rrelu (called with its default
 // arguments) through TestBackward on a random {2, 1, 4, 6} float input.
 TEST_F(LazyOpsTest, TestRreluBackward) {
   auto rrelu_fn = [](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::rrelu(args[0]);
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::rand({2, 1, 4, 6}, options);
     TestBackward({input}, dev, rrelu_fn);
   });
 }

 // Exercises the backward pass of torch::hardshrink through TestBackward on a
 // 100-element input drawn with torch::randn.
 TEST_F(LazyOpsTest, TestHardshrinkBackward) {
   auto hardshrink_fn =
       [](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::hardshrink(args[0]);
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::randn({100}, options);
     TestBackward({input}, dev, hardshrink_fn);
   });
 }

 // Exercises the backward pass of torch::softshrink through TestBackward on a
 // 100-element input drawn with torch::randn.
 TEST_F(LazyOpsTest, TestSoftshrinkBackward) {
   auto softshrink_fn =
       [](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::softshrink(args[0]);
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::randn({100}, options);
     TestBackward({input}, dev, softshrink_fn);
   });
 }

 // Exercises the backward pass of torch::hardtanh through TestBackward on a
 // 100-element input drawn with torch::randn.
 TEST_F(LazyOpsTest, TestHardtanhBackward) {
   auto hardtanh_fn =
       [](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::hardtanh(args[0]);
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::randn({100}, options);
     TestBackward({input}, dev, hardtanh_fn);
   });
 }

 // Exercises the backward pass of torch::elu through TestBackward with
 // explicit alpha, scale and input_scale values.
 TEST_F(LazyOpsTest, TestEluBackward) {
   torch::Scalar alpha = 0.5;
   torch::Scalar scale = 2.5;
   torch::Scalar input_scale = 1.5;
   auto elu_fn = [&alpha, &scale, &input_scale](
                     const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::elu(args[0], alpha, scale, input_scale);
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::rand({2, 1, 4, 6}, options);
     TestBackward({input}, dev, elu_fn);
   });
 }

 // Exercises the backward pass of torch::gelu through TestBackward, then
 // checks the "lazy::gelu_backward" counter via ExpectCounterChanged.
 TEST_F(LazyOpsTest, TestGeluBackward) {
   auto gelu_fn = [](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::gelu(args[0]);
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::rand({2, 3}, options);
     TestBackward({input}, dev, gelu_fn);
   });
   ExpectCounterChanged("lazy::gelu_backward", GetIgnoredCounters());
 }

 // Exercises the backward pass of torch::leaky_relu through TestBackward with
 // a negative slope of 0.01.
 TEST_F(LazyOpsTest, TestLeakyReluBackward) {
   double negative_slope = 0.01;
   // The slope is captured by value.
   auto leaky_relu_fn = [negative_slope](
                            const std::vector<torch::Tensor>& args)
       -> torch::Tensor {
     return torch::leaky_relu(args[0], negative_slope);
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::rand({2, 1, 4, 6}, options);
     TestBackward({input}, dev, leaky_relu_fn);
   });
 }

 // Exercises the backward pass of torch::t through TestBackward on a random
 // 2x3 float input.
 TEST_F(LazyOpsTest, TestTransposeBackward) {
   auto transpose_fn =
       [](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::t(args[0]);
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::rand({2, 3}, options);
     TestBackward({input}, dev, transpose_fn);
   });
 }

 // Exercises the backward pass of torch::addmm through TestBackward for
 // beta = 1 and beta = 2, with all three operands requesting gradients.
 TEST_F(LazyOpsTest, TestAddMatMulBackward) {
   int in_channels = 32;
   int out_channels = 320;
   int labels = 50;
   // Test beta != 1. through the CPU interop.
   for (double beta : {1., 2.}) {
     auto addmm_fn =
         [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
       return torch::addmm(args[0], args[1], args[2], /*beta=*/beta);
     };
     ForEachDevice([&](const torch::Device& dev) {
       const auto options = torch::TensorOptions(torch::kFloat)
                                .device(DefaultDevice())
                                .requires_grad(true);
       // Drawn in argument order: the 1-D addend, then the two matrices.
       torch::Tensor addend = torch::rand({labels}, options);
       torch::Tensor lhs = torch::rand({in_channels, out_channels}, options);
       torch::Tensor rhs = torch::rand({out_channels, labels}, options);
       TestBackward({addend, lhs, rhs}, dev, addmm_fn);
     });
   }
 }

 // Exercises the backward pass of torch::binary_cross_entropy through
 // TestBackward, with and without an explicit weight tensor, for every
 // reduction mode.
 TEST_F(LazyOpsTest, TestBinaryCrossEntropyBackward) {
   int batch = 6;
   int classes = 2;
   // TODO(asuhan): Fix the torch::kDouble case.
   for (auto dtype : {torch::kFloat}) {
     for (bool def_weight : {false, true}) {
       // Build the source tensors on DefaultDevice(), as the other backward
       // tests in this file do, rather than on the implicit default device.
       torch::Tensor input = torch::rand(
           {batch, classes},
           torch::TensorOptions(dtype)
               .device(DefaultDevice())
               .requires_grad(true));
       torch::Tensor target = torch::rand(
           {batch, classes},
           torch::TensorOptions(dtype).device(DefaultDevice()));
       // Left undefined when no weight is under test.
       torch::Tensor weight;
       if (def_weight) {
         weight = torch::rand(
             {batch, classes},
             torch::TensorOptions(dtype).device(DefaultDevice()));
       }
       for (torch::Reduction::Reduction reduction :
            {torch::Reduction::Mean,
             torch::Reduction::Sum,
             torch::Reduction::None}) {
         auto testfn =
             [&](const std::vector<torch::Tensor>& inputs) -> torch::Tensor {
           return torch::binary_cross_entropy(
               /*self=*/inputs[0],
               /*target=*/inputs[1],
               /*weight=*/inputs[2],
               /*reduction=*/reduction);
         };
         ForEachDevice([&](const torch::Device& device) {
           TestBackward(
               {input, target, weight},
               device,
               testfn,
               /*rtol=*/1e-4,
               /*atol=*/1e-7);
         });
       }
     }
   }
 }

 // Backward test for torch::nll_loss over ignore_index, optional class
 // weights and reduction modes. Currently skipped up front via GTEST_SKIP().
 TEST_F(LazyOpsTest, TestNllLossBackward) {
   // TODO(whc) debug divide-by-zero failure under ASAN
   GTEST_SKIP();

   int batch = 6;
   int classes = 2;
   // TODO(asuhan): Fix the torch::kDouble case.
   for (auto dtype : {torch::kFloat}) {
     for (int ignore_index : {-1, 0, 1, 5}) {
       for (bool def_weight : {false, true}) {
         const auto float_options =
             torch::TensorOptions(dtype).device(DefaultDevice());
         torch::Tensor input =
             torch::rand({batch, classes}, float_options.requires_grad(true));
         // Targets are drawn from [min(ignore_index, 0), classes).
         torch::Tensor target = torch::randint(
             std::min(ignore_index, 0),
             classes,
             {batch},
             torch::TensorOptions(torch::kLong).device(DefaultDevice()));
         // Left undefined when no class weight is under test.
         torch::Tensor weight =
             def_weight ? torch::rand({classes}, float_options) : torch::Tensor();
         for (torch::Reduction::Reduction reduction :
              {torch::Reduction::Mean,
               torch::Reduction::Sum,
               torch::Reduction::None}) {
           auto nll_fn =
               [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
             return torch::nll_loss(
                 /*self=*/args[0],
                 /*target=*/args[1],
                 /*weight=*/args[2],
                 /*reduction=*/reduction,
                 /*ignore_index=*/ignore_index);
           };
           ForEachDevice([&](const torch::Device& dev) {
             TestBackward(
                 {input, target, weight},
                 dev,
                 nll_fn,
                 /*rtol=*/1e-5,
                 /*atol=*/1e-8);
           });
         }
       }
     }
   }
 }

 // Exercises the backward pass of torch::nll_loss2d through TestBackward over
 // ignore_index, optional class weights and reduction modes.
 TEST_F(LazyOpsTest, TestNllLoss2dBackward) {
   int batch = 6;
   int classes = 2;
   int height = 3;
   int width = 3;
   // TODO(asuhan): Fix the torch::kDouble case.
   for (auto dtype : {torch::kFloat}) {
     for (int ignore_index : {-1, 0, 1, 5}) {
       for (bool def_weight : {false, true}) {
         const auto float_options =
             torch::TensorOptions(dtype).device(DefaultDevice());
         torch::Tensor input = torch::rand(
             {batch, classes, height, width},
             float_options.requires_grad(true));
         // Targets are drawn from [min(ignore_index, 0), classes).
         torch::Tensor target = torch::randint(
             std::min(ignore_index, 0),
             classes,
             {batch, height, width},
             torch::TensorOptions(torch::kLong).device(DefaultDevice()));
         // Left undefined when no class weight is under test.
         torch::Tensor weight =
             def_weight ? torch::rand({classes}, float_options) : torch::Tensor();
         for (torch::Reduction::Reduction reduction :
              {torch::Reduction::Mean,
               torch::Reduction::Sum,
               torch::Reduction::None}) {
           auto nll2d_fn =
               [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
             return torch::nll_loss2d(
                 /*self=*/args[0],
                 /*target=*/args[1],
                 /*weight=*/args[2],
                 /*reduction=*/reduction,
                 /*ignore_index=*/ignore_index);
           };
           ForEachDevice([&](const torch::Device& dev) {
             TestBackward(
                 {input, target, weight},
                 dev,
                 nll2d_fn,
                 /*rtol=*/1e-5,
                 /*atol=*/1e-8);
           });
         }
       }
     }
   }
 }

 // Exercises the backward pass of torch::smooth_l1_loss through TestBackward
 // for every reduction mode and two beta values, reusing one input/target
 // pair throughout.
 TEST_F(LazyOpsTest, TestSmoothL1LossBackward) {
   const auto options =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   // Only the input requests a gradient.
   torch::Tensor input = torch::randn({2, 4}, options.requires_grad(true));
   torch::Tensor target = torch::randn({2, 4}, options);
   for (torch::Reduction::Reduction reduction :
        {torch::Reduction::None,
         torch::Reduction::Mean,
         torch::Reduction::Sum}) {
     for (double beta : {0.25, 1.}) {
       auto loss_fn =
           [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
         return torch::smooth_l1_loss(
             /*input=*/args[0],
             /*target=*/args[1],
             /*reduction=*/reduction,
             /*beta=*/beta);
       };
       ForEachDevice([&](const torch::Device& dev) {
         TestBackward(
             {input, target}, dev, loss_fn, /*rtol=*/1e-5, /*atol=*/1e-8);
       });
     }
   }
 }

 // Exercises the backward pass of Tensor::view through TestBackward by
 // viewing a {32, 20, 4, 4} input as {-1, 320}.
 TEST_F(LazyOpsTest, TestViewBackward) {
   auto view_fn = [](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return args[0].view({-1, 320});
   };
   ForEachDevice([&](const torch::Device& dev) {
     const auto options = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
     torch::Tensor input = torch::rand({32, 20, 4, 4}, options);
     TestBackward({input}, dev, view_fn);
   });
 }

 // Exercises the backward pass of torch::batch_norm (training=true) through
 // TestBackward on a 4-D input, with weight/bias both defined or both
 // undefined.
 TEST_F(LazyOpsTest, TestBatchNorm2DBackward) {
   double momentum = 0.1;
   double eps = 0.5;
   auto batch_norm_fn =
       [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::batch_norm(
         /*input=*/args[0],
         /*weight=*/args[1],
         /*bias=*/args[2],
         /*running_mean=*/args[3],
         /*running_var=*/args[4],
         /*training=*/true,
         /*momentum=*/momentum,
         /*eps=*/eps,
         /*cudnn_enabled=*/false);
   };
   int num_features = 3;
   for (bool undef_weight_bias : {false, true}) {
     ForEachDevice([&](const torch::Device& dev) {
       const auto plain_options =
           torch::TensorOptions(torch::kFloat).device(DefaultDevice());
       const auto grad_options = plain_options.requires_grad(true);
       torch::Tensor input =
           torch::rand({2, num_features, 4, 4}, grad_options);
       // Default-constructed tensors are undefined; they are only filled in
       // when the affine parameters are under test.
       torch::Tensor weight;
       torch::Tensor bias;
       if (!undef_weight_bias) {
         weight = torch::rand({num_features}, grad_options);
         bias = torch::rand({num_features}, grad_options);
       }
       torch::Tensor running_mean = torch::zeros({num_features}, plain_options);
       torch::Tensor running_var = torch::ones({num_features}, plain_options);
       TestBackward(
           {input, weight, bias, running_mean, running_var},
           dev,
           batch_norm_fn,
           /*rtol=*/1e-3,
           /*atol=*/1e-4);
     });
   }
 }

 // Exercises the backward pass of torch::batch_norm (training=true) through
 // TestBackward on a 5-D input, with weight/bias both defined or both
 // undefined.
 TEST_F(LazyOpsTest, TestBatchNorm3DBackward) {
   double momentum = 0.1;
   double eps = 0.5;
   auto batch_norm_fn =
       [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
     return torch::batch_norm(
         /*input=*/args[0],
         /*weight=*/args[1],
         /*bias=*/args[2],
         /*running_mean=*/args[3],
         /*running_var=*/args[4],
         /*training=*/true,
         /*momentum=*/momentum,
         /*eps=*/eps,
         /*cudnn_enabled=*/false);
   };
   int num_features = 3;
   for (bool undef_weight_bias : {false, true}) {
     ForEachDevice([&](const torch::Device& dev) {
       const auto plain_options =
           torch::TensorOptions(torch::kFloat).device(DefaultDevice());
       const auto grad_options = plain_options.requires_grad(true);
       torch::Tensor input =
           torch::rand({2, num_features, 4, 4, 2}, grad_options);
       // Default-constructed tensors are undefined; they are only filled in
       // when the affine parameters are under test.
       torch::Tensor weight;
       torch::Tensor bias;
       if (!undef_weight_bias) {
         weight = torch::rand({num_features}, grad_options);
         bias = torch::rand({num_features}, grad_options);
       }
       torch::Tensor running_mean = torch::zeros({num_features}, plain_options);
       torch::Tensor running_var = torch::ones({num_features}, plain_options);
       TestBackward(
           {input, weight, bias, running_mean, running_var},
           dev,
           batch_norm_fn,
           /*rtol=*/1e-3,
           /*atol=*/1e-3);
     });
   }
 }

 // Exercises the backward pass of torch::binary_cross_entropy_with_logits
 // through TestBackward for every reduction mode and every combination of
 // defined/undefined weight and pos_weight.
 TEST_F(LazyOpsTest, TestBCEWithLogitsBackward) {
   int batch = 10;
   int classes = 5;
   for (torch::Reduction::Reduction reduction :
        {torch::Reduction::None,
         torch::Reduction::Mean,
         torch::Reduction::Sum}) {
     auto bce_fn =
         [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
       return torch::binary_cross_entropy_with_logits(
           /*input=*/args[0],
           /*target=*/args[1],
           /*weight=*/args[2],
           /*pos_weight=*/args[3],
           /*reduction=*/reduction);
     };
     for (bool undef_weight : {false, true}) {
       for (bool undef_pos_weight : {false, true}) {
         const auto plain_options =
             torch::TensorOptions(torch::kFloat).device(DefaultDevice());
         const auto grad_options = plain_options.requires_grad(true);
         // Both input and target request gradients.
         torch::Tensor input = torch::rand({batch, classes}, grad_options);
         torch::Tensor target = torch::rand({batch, classes}, grad_options);
         // Default-constructed tensors are undefined; each is filled in only
         // when its flag asks for a defined tensor.
         torch::Tensor weight;
         if (!undef_weight) {
           weight = torch::rand({classes}, plain_options);
         }
         torch::Tensor pos_weight;
         if (!undef_pos_weight) {
           pos_weight = torch::rand({classes}, plain_options);
         }
         ForEachDevice([&](const torch::Device& dev) {
           TestBackward(
               {input, target, weight, pos_weight},
               dev,
               bce_fn,
               /*rtol=*/1e-3,
               /*atol=*/1e-5);
         });
       }
     }
   }
 }

 // Runs torch::kl_div through TestBackward once per reduction mode, reusing
 // the same pair of random 4x3 float tensors for every mode.
 TEST_F(LazyOpsTest, TestKlDivBackward) {
   const auto grad_opts = torch::TensorOptions(torch::kFloat)
                              .device(DefaultDevice())
                              .requires_grad(true);
   torch::Tensor input = torch::rand({4, 3}, grad_opts);
   torch::Tensor target = torch::rand({4, 3}, grad_opts);
   for (torch::Reduction::Reduction mode :
        {torch::Reduction::Mean,
         torch::Reduction::Sum,
         torch::Reduction::None}) {
     auto kl_fn =
         [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
       return torch::kl_div(/*self=*/args[0], /*target=*/args[1], mode);
     };
     ForEachDevice([&](const torch::Device& device) {
       TestBackward(
           {input, target},
           device,
           kl_fn,
           /*rtol=*/1e-4,
           /*atol=*/1e-5);
     });
   }
 }

 // Runs torch::embedding (dense, sparse=false) through TestBackward for every
 // padding index from -1 up to num_weights - 1, with gradient scaling by
 // frequency both off and on.
 TEST_F(LazyOpsTest, TestEmbeddingBackward) {
   const int num_weights = 32;
   const auto weight_opts = torch::TensorOptions(torch::kFloat)
                                .device(DefaultDevice())
                                .requires_grad(true);
   const auto index_opts =
       torch::TensorOptions(torch::kLong).device(DefaultDevice());
   for (int pad = -1; pad < num_weights; ++pad) {
     for (bool scale_by_freq : {false, true}) {
       auto embed_fn =
           [&](const std::vector<torch::Tensor>& args) -> torch::Tensor {
         return torch::embedding(
             args[0],
             args[1],
             /*padding_idx=*/pad,
             /*scale_grad_by_freq=*/scale_by_freq,
             /*sparse=*/false);
       };
       ForEachDevice([&](const torch::Device& device) {
         // Fresh random table and indices are drawn inside the per-device
         // callback; the table is drawn first.
         torch::Tensor table = torch::rand({num_weights, 7}, weight_opts);
         torch::Tensor lookup =
             torch::randint(num_weights, {3, 9, 4}, index_opts);
         TestBackward(
             {table, lookup},
             device,
             embed_fn,
             /*rtol=*/1e-5,
             /*atol=*/1e-8);
       });
     }
   }
 }

 // Calls torch::_amp_foreach_non_finite_check_and_unscale_ twice on lazy
 // tensors: first with all-finite gradients (found_inf expected to read 0 and
 // the gradients to equal grads * inv_scale), then with gradients containing a
 // NaN (found_inf expected to read 1).
 TEST_F(LazyOpsTest, TestAmpForeachNonFiniteCheckAndUnscale) {
   if (IsCuda()) {
     // TODO(whc) debug failure on cuda
     GTEST_SKIP();
   }

   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor finite_grads = torch::tensor({1, 2, 3, 4}, float_opts);
   torch::Tensor nan_grads =
       torch::tensor({1.0, 2.0, std::nan("1"), 4.0}, float_opts);
   torch::Tensor inv_scale = torch::scalar_tensor(0.2, float_opts);
   torch::Tensor found_inf = torch::scalar_tensor(0, float_opts);
   // Reference results computed on the default device.
   torch::Tensor expected_unscaled = finite_grads * inv_scale;
   torch::Tensor expected_flag_clear = torch::scalar_tensor(0, float_opts);
   torch::Tensor expected_flag_set = torch::scalar_tensor(1, float_opts);
   ForEachDevice([&](const torch::Device& device) {
     // The op is not exercised when the reference tensors live on CPU.
     if (finite_grads.device() == at::kCPU) {
       GTEST_SKIP();
     }
     torch::Tensor lazy_finite = CopyToDevice(finite_grads, device);
     torch::Tensor lazy_scale = CopyToDevice(inv_scale, device);
     torch::Tensor lazy_flag = CopyToDevice(found_inf, device);
     torch::_amp_foreach_non_finite_check_and_unscale_(
         lazy_finite, lazy_flag, lazy_scale);
     AllClose(expected_unscaled, lazy_finite, /*rtol=*/1e-2, /*atol=*/1e-4);
     AllEqual(expected_flag_clear, lazy_flag);

     // Second pass reuses the same flag tensor with the NaN-bearing gradients.
     torch::Tensor lazy_nan = CopyToDevice(nan_grads, device);
     torch::_amp_foreach_non_finite_check_and_unscale_(
         lazy_nan, lazy_flag, lazy_scale);
     AllEqual(expected_flag_set, lazy_flag);
   });
 }

 // Calls torch::_amp_update_scale_ four times in a row on the same lazy
 // scale / growth-tracker tensors and, after each call, compares both against
 // hard-coded expectations. The first three calls pass a found_inf of 0, the
 // fourth passes a found_inf of 1.
 TEST_F(LazyOpsTest, TestAmpUpdateScale) {
   // Initial state: tracker 0, scale 4.
   torch::Tensor growth_tracker = torch::scalar_tensor(
       0, torch::TensorOptions(torch::kInt32).device(DefaultDevice()));
   torch::Tensor current_scale = torch::scalar_tensor(
       4, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   // The two possible values of the found_inf flag input.
   torch::Tensor found_inf = torch::scalar_tensor(
       1, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor not_found_inf = torch::scalar_tensor(
       0, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   float scale_growth_factor = 2.0;
   float scale_backoff_factor = 0.5;
   int growth_interval = 3;

   // Expected (tracker, scale) after each of the four calls:
   //   call 1: (1, 4)   call 2: (2, 4)   call 3: (0, 8)   call 4: (0, 4)
   torch::Tensor growth_tracker_result0 = torch::scalar_tensor(
       1, torch::TensorOptions(torch::kInt32).device(DefaultDevice()));
   torch::Tensor current_scale_result0 = torch::scalar_tensor(
       4, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor growth_tracker_result1 = torch::scalar_tensor(
       2, torch::TensorOptions(torch::kInt32).device(DefaultDevice()));
   torch::Tensor current_scale_result1 = torch::scalar_tensor(
       4, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor growth_tracker_result2 = torch::scalar_tensor(
       0, torch::TensorOptions(torch::kInt32).device(DefaultDevice()));
   torch::Tensor current_scale_result2 = torch::scalar_tensor(
       8, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor growth_tracker_result3 = torch::scalar_tensor(
       0, torch::TensorOptions(torch::kInt32).device(DefaultDevice()));
   torch::Tensor current_scale_result3 = torch::scalar_tensor(
       4, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));

   ForEachDevice([&](const torch::Device& device) {
     // The op is not exercised when the reference tensors live on CPU.
     if (growth_tracker.device() == at::kCPU) {
       GTEST_SKIP();
     }
     torch::Tensor lazy_growth_tracker = CopyToDevice(growth_tracker, device);
     torch::Tensor lazy_current_scale = CopyToDevice(current_scale, device);
     torch::Tensor lazy_found_inf = CopyToDevice(found_inf, device);
     torch::Tensor lazy_not_found_inf = CopyToDevice(not_found_inf, device);

     // Call 1 (found_inf == 0): return value is discarded, the tensors are
     // checked as updated in place.
     torch::_amp_update_scale_(
         lazy_current_scale,
         lazy_growth_tracker,
         lazy_not_found_inf,
         scale_growth_factor,
         scale_backoff_factor,
         growth_interval);
     AllClose(
         current_scale_result0,
         lazy_current_scale,
         /*rtol=*/1e-2,
         /*atol=*/1e-4);
     AllEqual(growth_tracker_result0, lazy_growth_tracker);

     // Call 2 (found_inf == 0): again checked through the in-place tensors.
     torch::_amp_update_scale_(
         lazy_current_scale,
         lazy_growth_tracker,
         lazy_not_found_inf,
         scale_growth_factor,
         scale_backoff_factor,
         growth_interval);
     AllClose(
         current_scale_result1,
         lazy_current_scale,
         /*rtol=*/1e-2,
         /*atol=*/1e-4);
     AllEqual(growth_tracker_result1, lazy_growth_tracker);

     // Call 3 (found_inf == 0): this time the returned tensor is assigned
     // back, so the check below goes through the return value.
     // torch::_amp_update_scale_ returns the reference of current_scale
     lazy_current_scale = torch::_amp_update_scale_(
         lazy_current_scale,
         lazy_growth_tracker,
         lazy_not_found_inf,
         scale_growth_factor,
         scale_backoff_factor,
         growth_interval);
     AllClose(
         current_scale_result2,
         lazy_current_scale,
         /*rtol=*/1e-2,
         /*atol=*/1e-4);
     AllEqual(growth_tracker_result2, lazy_growth_tracker);

     // Call 4 (found_inf == 1): the only call that passes the set flag.
     lazy_current_scale = torch::_amp_update_scale_(
         lazy_current_scale,
         lazy_growth_tracker,
         lazy_found_inf,
         scale_growth_factor,
         scale_backoff_factor,
         growth_interval);
     AllClose(
         current_scale_result3,
         lazy_current_scale,
         /*rtol=*/1e-2,
         /*atol=*/1e-4);
     AllEqual(growth_tracker_result3, lazy_growth_tracker);
   });
   // No aten:: counter is expected to have moved, while the lazy counter for
   // this op is.
   ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
   ExpectCounterChanged("lazy::_amp_update_scale_", GetIgnoredCounters());
 }

 // Reads a scalar back with item() from both the reference tensor and its
 // per-device copy, asserts the two values match, and then checks the
 // expected metric counters.
 TEST_F(LazyOpsTest, TestEarlySyncLiveTensors) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor eager_scalar = torch::scalar_tensor(1., float_opts);
   torch::Scalar eager_value = eager_scalar.item();
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_scalar = CopyToDevice(eager_scalar, device);
     torch::Scalar lazy_value = lazy_scalar.item();
     ASSERT_EQ(eager_value.to<float>(), lazy_value.to<float>());
   });
   // The early-sync counter is only expected to move when the "early_sync"
   // experiment is enabled.
   const bool early_sync_on = DebugUtil::ExperimentEnabled("early_sync");
   if (!early_sync_on) {
     ExpectCounterNotChanged("EarlySyncLiveTensorsCount", GetIgnoredCounters());
   } else {
     ExpectCounterChanged("EarlySyncLiveTensorsCount", GetIgnoredCounters());
   }
   ExpectCounterChanged("aten::_local_scalar_dense", GetIgnoredCounters());
 }

 // Compares torch::lerp with a tensor weight on per-device copies against the
 // result computed on the original tensors.
 TEST_F(LazyOpsTest, TestLerp) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor from = torch::rand({3, 4}, float_opts);
   torch::Tensor to = torch::rand({3, 4}, float_opts);
   torch::Tensor blend = torch::rand({3, 4}, float_opts);
   torch::Tensor expected = torch::lerp(from, to, blend);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_from = CopyToDevice(from, device);
     torch::Tensor lazy_to = CopyToDevice(to, device);
     torch::Tensor lazy_blend = CopyToDevice(blend, device);
     torch::Tensor actual = torch::lerp(lazy_from, lazy_to, lazy_blend);
     AllClose(expected, actual);
   });
   // The op must be counted under lazy::lerp and not under any aten:: counter.
   ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
   ExpectCounterChanged("lazy::lerp", GetIgnoredCounters());
 }

 // Compares torch::lerp with a scalar weight (3.0) on per-device copies
 // against the result computed on the original tensors.
 TEST_F(LazyOpsTest, TestLerpScalar) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor from = torch::rand({3, 4}, float_opts);
   torch::Tensor to = torch::rand({3, 4}, float_opts);
   torch::Scalar blend = torch::Scalar(3.0);
   torch::Tensor expected = torch::lerp(from, to, blend);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_from = CopyToDevice(from, device);
     torch::Tensor lazy_to = CopyToDevice(to, device);
     torch::Tensor actual = torch::lerp(lazy_from, lazy_to, blend);
     AllClose(expected, actual);
   });
   // The op must be counted under lazy::lerp and not under any aten:: counter.
   ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
   ExpectCounterChanged("lazy::lerp", GetIgnoredCounters());
 }

 // Compares in-place lerp_ with a tensor weight on per-device copies against
 // the same in-place update applied to the original tensor.
 TEST_F(LazyOpsTest, TestLerpInplace) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor expected = torch::rand({3, 4}, float_opts);
   torch::Tensor to = torch::rand({3, 4}, float_opts);
   torch::Tensor blend = torch::rand({3, 4}, float_opts);
   // Snapshot taken before the reference tensor is modified in place; the
   // per-device copies start from this untouched value.
   torch::Tensor pristine = expected.clone();
   expected.lerp_(to, blend);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_self = CopyToDevice(pristine, device);
     torch::Tensor lazy_to = CopyToDevice(to, device);
     torch::Tensor lazy_blend = CopyToDevice(blend, device);
     lazy_self.lerp_(lazy_to, lazy_blend);
     AllClose(lazy_self, expected);
   });
   // The op must be counted under lazy::lerp and not under any aten:: counter.
   ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
   ExpectCounterChanged("lazy::lerp", GetIgnoredCounters());
 }

 // Compares in-place lerp_ with a scalar weight (3.0) on per-device copies
 // against the same in-place update applied to the original tensor.
 TEST_F(LazyOpsTest, TestLerpScalarInplace) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor expected = torch::rand({3, 4}, float_opts);
   torch::Tensor to = torch::rand({3, 4}, float_opts);
   torch::Scalar blend = torch::Scalar(3.0);
   // Snapshot taken before the reference tensor is modified in place; the
   // per-device copies start from this untouched value.
   torch::Tensor pristine = expected.clone();
   expected.lerp_(to, blend);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_self = CopyToDevice(pristine, device);
     torch::Tensor lazy_to = CopyToDevice(to, device);
     lazy_self.lerp_(lazy_to, blend);
     AllClose(lazy_self, expected);
   });
   // The op must be counted under lazy::lerp and not under any aten:: counter.
   ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
   ExpectCounterChanged("lazy::lerp", GetIgnoredCounters());
 }

 // Compares torch::lerp_out with a tensor weight, writing into a preallocated
 // output on each device, against the result of the same call on the original
 // tensors.
 TEST_F(LazyOpsTest, TestLerpOut) {
   torch::Tensor start = torch::rand(
       {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor end = torch::rand(
       {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::Tensor weight = torch::rand(
       {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   // Reference output buffer; lerp_out fills it before any comparison.
   torch::Tensor res = torch::empty(
       {3, 4}, torch::TensorOptions(torch::kFloat).device(DefaultDevice()));
   torch::lerp_out(res, start, end, weight);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_start = CopyToDevice(start, device);
     torch::Tensor lazy_end = CopyToDevice(end, device);
     torch::Tensor lazy_weight = CopyToDevice(weight, device);
     // Output buffer allocated with the same options as the device-side input.
     torch::Tensor lazy_res = torch::empty({3, 4}, lazy_start.options());
     torch::lerp_out(lazy_res, lazy_start, lazy_end, lazy_weight);
     AllClose(res, lazy_res);
   });
   // The op must be counted under lazy::lerp and not under any aten:: counter.
   ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
   ExpectCounterChanged("lazy::lerp", GetIgnoredCounters());
 }

 // Compares torch::lerp_out with a scalar weight (3.0), writing into a
 // preallocated output on each device, against the result of the same call on
 // the original tensors.
 TEST_F(LazyOpsTest, TestLerpScalarOut) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   torch::Tensor from = torch::rand({3, 4}, float_opts);
   torch::Tensor to = torch::rand({3, 4}, float_opts);
   torch::Scalar blend = torch::Scalar(3.0);
   // Reference output buffer; lerp_out fills it before any comparison.
   torch::Tensor expected = torch::empty({3, 4}, float_opts);
   torch::lerp_out(expected, from, to, blend);
   ForEachDevice([&](const torch::Device& device) {
     torch::Tensor lazy_from = CopyToDevice(from, device);
     torch::Tensor lazy_to = CopyToDevice(to, device);
     // Output buffer allocated with the same options as the device-side input.
     torch::Tensor actual = torch::empty({3, 4}, lazy_from.options());
     torch::lerp_out(actual, lazy_from, lazy_to, blend);
     AllClose(expected, actual);
   });
   // The op must be counted under lazy::lerp and not under any aten:: counter.
   ExpectCounterNotChanged("aten::.*", GetIgnoredCounters());
   ExpectCounterChanged("lazy::lerp", GetIgnoredCounters());
 }

 // Checks that is_alias_of gives the same answer on per-device copies as on
 // the original tensors, for two separately created tensors and for views
 // derived from one of them.
 TEST_F(LazyOpsTest, IsAliasOf) {
   const auto float_opts =
       torch::TensorOptions(torch::kFloat).device(DefaultDevice());
   auto a = torch::empty(4, float_opts);
   auto b = torch::empty(4, float_opts);

   ForEachDevice([&](const torch::Device& device) {
     auto lazy_a = CopyToDevice(a, device);
     auto lazy_b = CopyToDevice(b, device);
     // Two separately created tensors.
     const bool ref_distinct = !a.is_alias_of(b);
     const bool lazy_distinct = !lazy_a.is_alias_of(lazy_b);
     EXPECT_EQ(ref_distinct, lazy_distinct);

     // A view of `a`, compared against `a` itself.
     auto c = a.view({2, 2});
     auto lazy_c = lazy_a.view({2, 2});
     const bool ref_a_c = a.is_alias_of(c);
     const bool lazy_a_c = lazy_a.is_alias_of(lazy_c);
     EXPECT_EQ(ref_a_c, lazy_a_c);

     // A view of the view, compared against both the intermediate view and
     // the original tensor.
     auto d = c.view({1, 4});
     auto lazy_d = lazy_c.view({1, 4});
     const bool ref_d_c = d.is_alias_of(c);
     const bool lazy_d_c = lazy_d.is_alias_of(lazy_c);
     EXPECT_EQ(ref_d_c, lazy_d_c);
     const bool ref_d_a = d.is_alias_of(a);
     const bool lazy_d_a = lazy_d.is_alias_of(lazy_a);
     EXPECT_EQ(ref_d_a, lazy_d_a);
   });
 }

 #endif // FBCODE_CAFFE2

 } // namespace lazy
 } // namespace torch