| #include <gtest/gtest.h> |
| |
| #include <limits> |
| #include <memory> |
| #include <sstream> |
| #include <stdexcept> |
| #include <unordered_map> |
| |
| #include <test/cpp/tensorexpr/test_base.h> |
| |
| #include <c10/util/irange.h> |
| #include <test/cpp/tensorexpr/padded_buffer.h> |
| #include <torch/csrc/jit/tensorexpr/analysis.h> |
| #include <torch/csrc/jit/tensorexpr/eval.h> |
| #include <torch/csrc/jit/tensorexpr/ir.h> |
| #include <torch/csrc/jit/tensorexpr/ir_printer.h> |
| #include <torch/csrc/jit/tensorexpr/ir_simplifier.h> |
| #include <torch/csrc/jit/tensorexpr/loopnest.h> |
| #include <torch/csrc/jit/tensorexpr/tensor.h> |
| #include <torch/csrc/jit/testing/file_check.h> |
| |
| namespace torch { |
| namespace jit { |
| |
| using namespace torch::jit::tensorexpr; |
| |
// Sum with no reduction axes: the output equals the input.
TEST(Reductions, ReduceSum0D_1) {
| const int M = 10; |
| |
| BufHandle b("b", {M}, kFloat); |
| std::vector<float> in(M); |
| for (const auto j : c10::irange(M)) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(M, -1.f); |
| |
| Tensor c = Reduce("sum", {M}, Sum(), b, {}); |
| LoopNest loop({c}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| for (const auto i : c10::irange(M)) { |
| ASSERT_EQ(out[i], in[i]); |
| } |
| } |
| |
// Same, but over a rank-0 (scalar) buffer.
TEST(Reductions, ReduceSum0D_2) {
| BufHandle b("b", {}, kFloat); |
| std::vector<float> in(1); |
  in[0] = 77.7f;
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor c = Reduce("sum", {}, Sum(), b, {}); |
| LoopNest loop({c}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], in[0]); |
| } |
| |
| // Sum an array to a single value. |
| TEST(Reductions, ReduceSum1D) { |
| BufHandle b("b", {10}, kFloat); |
| std::vector<float> in(10); |
| for (const auto j : c10::irange(10)) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor c = Reduce("sum", {}, Sum(), b, {10}); |
| LoopNest loop({c}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 45); |
| } |

// Sum a 2D tensor to a 1D tensor with dynamic shapes.
| TEST(Reductions, ReduceSum2D) { |
| const int M = 3; |
| const int N = 7; |
| |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| |
| BufHandle b("b", {m, n}, kFloat); |
| std::vector<float> in(M * N); |
| for (const auto i : c10::irange(M)) { |
| for (const auto j : c10::irange(N)) { |
| in[i * N + j] = j; |
| } |
| } |
| |
| std::vector<float> out(M, -1.f); |
| |
| Tensor c = Reduce("sum", {M}, Sum(), b, {N}); |
| LoopNest loop({c}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, n, m}); |
| |
| cg.call({in, out, 5, 7}); |
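  // Note: the loop extents were fixed when building the Reduce (M=3, N=7); the
  // runtime n and m only affect the flattened load strides. Since in[k] = k % 7,
  // any 7 consecutive loads still sum to 0+1+...+6, so the check below holds.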
| |
| float expected = 0; |
| for (const auto i : c10::irange(N)) { |
| // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) |
| expected += i; |
| } |
| |
| for (const auto i : c10::irange(M)) { |
| ASSERT_EQ(out[i], expected); |
| } |
| } |
| |
// Sum a 3D tensor to a 2D tensor and to a 1D tensor, then reduce the 2D result
// across its remaining axis to cross-check the 1D result.
| TEST(Reductions, ReduceSum3D) { |
| const int M = 10; |
| VarHandle m("m", kInt); |
| |
| BufHandle b("b", {2, 3, m}, kFloat); |
| |
| Tensor c = Reduce("sum", {2, 3}, Sum(), b, {m}); |
| LoopNest loop({c}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, m}); |
| |
| std::vector<float> bData(2 * 3 * M, 0); |
| std::vector<float> cData(2 * 3, 6.0f); |
| std::vector<float> dData(2, 1.0f); |
| std::vector<float> eData(2, 1.0f); |
| |
| for (int i = 0; i < 2 * 3; ++i) { |
| for (const auto j : c10::irange(M)) { |
| bData[i * M + j] = j; |
| } |
| } |
| |
| cg.call({bData, cData, M}); |
| float expected = 0; |
| for (const auto i : c10::irange(M)) { |
| // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) |
| expected += i; |
| } |
| |
| for (int i = 0; i < 2 * 3; ++i) { |
| ASSERT_EQ(cData[i], expected); |
| } |
| |
| Tensor d = Reduce("sum2", {2}, Sum(), b, {3, m}); |
| LoopNest loop2({d}); |
| loop2.prepareForCodegen(); |
| StmtPtr s2 = loop2.root_stmt(); |
| s2 = IRSimplifier::simplify(s2); |
| |
| SimpleIREvaluator cg2(s2, {b, d, m}); |
| cg2.call({bData, dData, M}); |
| |
| // We're combining an additional dimension of 3, so the sum is 3x. |
| expected = expected * 3; |
| |
| for (const auto i : c10::irange(2)) { |
| ASSERT_EQ(dData[i], expected); |
| } |
| |
| // This is the same as just reducing the original result across that axis. |
| BufHandle c_buf(c.buf()); |
| Tensor e = Reduce("sum3", {2}, Sum(), c_buf, {3}); |
| LoopNest loop3({e}); |
| loop3.prepareForCodegen(); |
| StmtPtr s3 = loop3.root_stmt(); |
| s3 = IRSimplifier::simplify(s3); |
| |
| SimpleIREvaluator cg3(s3, {c, e}); |
| cg3.call({cData, eData}); |
| |
| for (const auto i : c10::irange(2)) { |
| ASSERT_EQ(eData[i], expected); |
| } |
| } |
| |
// Sum a large (10-D) tensor over its five innermost dimensions.
| TEST(Reductions, ReduceSum10D) { |
| BufHandle in_("in_", {2, 3, 2, 3, 2, 3, 2, 3, 2, 3}, kFloat); |
| const int InputSize = 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3 * 2 * 3; |
| BufHandle out_("out_", {2, 3, 2, 3, 2}, kFloat); |
| const int OutputSize = 2 * 3 * 2 * 3 * 2; |
| |
| std::vector<float> in(InputSize, 1.f); |
| std::vector<float> out(OutputSize, -1.f); |
| |
| Tensor c = Reduce("sum", {2, 3, 2, 3, 2}, Sum(), in_, {3, 2, 3, 2, 3}); |
| LoopNest loop({c}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {in_, c}); |
| |
| cg.call({in, out}); |
| |
| // NOLINTNEXTLINE(bugprone-integer-division) |
| float expected = InputSize / OutputSize; |
| for (const auto i : c10::irange(OutputSize)) { |
| ASSERT_EQ(out[i], expected); |
| } |
| } |
| |
| // Reduce via Mul rather than Add using a custom Reducer. |
| TEST(Reductions, ReduceProduct) { |
| const int M = 4; |
| const int N = 4; |
| |
| BufHandle b("b", {M, N}, kFloat); |
| std::vector<float> in(M * N); |
| for (const auto i : c10::irange(M)) { |
| for (const auto j : c10::irange(N)) { |
| in[i * N + j] = 2 + j; |
| } |
| } |
| |
| std::vector<float> out(M, -1.f); |
| |
| Reducer product( |
| ExprHandle(1.f), [](ExprHandle a, ExprHandle b) { return a * b; }); |
| |
| Tensor c = Reduce("product", {M}, product, b, {N}); |
| LoopNest loop({c}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| |
| float expected = 1; |
| for (const auto i : c10::irange(N)) { |
| // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) |
| expected *= 2 + i; |
| } |
| |
| for (const auto i : c10::irange(M)) { |
| ASSERT_EQ(out[i], expected); |
| } |
| } |
| |
| // Maximum reductions. |
| TEST(Reductions, ReduceMax) { |
| BufHandle in_("b", {10}, kFloat); |
| |
| std::vector<float> in(10); |
| std::vector<float> out(1, -1.f); |
| for (const auto j : c10::irange(10)) { |
| in[j] = j; |
| } |
| |
| Tensor dm1 = Reduce("max", {}, Maximum(kFloat), in_, {10}); |
| |
| LoopNest loop({dm1}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| SimpleIREvaluator cg(s, {in_, dm1}); |
| |
| cg.call({in, out}); |
| |
| ASSERT_EQ(out[0], 9); |
| |
| BufHandle in2_("b", {2, 5}, kFloat); |
| std::vector<float> out2(2, -1.f); |
| |
| Tensor m2d = Reduce("max", {2}, Maximum(kFloat), in2_, {5}); |
| |
| LoopNest loop2({m2d}); |
| loop2.prepareForCodegen(); |
| s = loop2.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg2(s, {in2_, m2d}); |
| cg2.call({in, out2}); |
| |
| ASSERT_EQ(out2[0], 4); |
| ASSERT_EQ(out2[1], 9); |
| } |
| |
| // Minimum reduction, with custom initialization. |
| TEST(Reductions, ReduceMinCustomInitializer) { |
| VarHandle minInit("minInit", kFloat); |
| BufHandle in_("b", {10}, kFloat); |
| |
| std::vector<float> in(10); |
| std::vector<float> out(1, -1.f); |
| for (const auto j : c10::irange(10)) { |
| in[j] = 10 + j; |
| } |
| |
| Tensor min = Reduce( |
| "min", |
| {}, |
| Minimum(ExprHandle(minInit)), |
| [&](ParameterList& v) { return in_.load(v); }, |
| {10}); |
| |
| LoopNest loop({min}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {in_, min, minInit}); |
| |
  // With a max-value initializer the result is the true minimum of the data
  // (note that the output buffer starts below that minimum).
| cg.call({in, out, std::numeric_limits<float>::max()}); |
| ASSERT_EQ(out[0], 10); |
| |
  // With an initializer lower than the data's minimum, the initializer wins.
| cg.call({in, out, 5.f}); |
| ASSERT_EQ(out[0], 5); |
| } |
| |
| // Example implementation of Any/All. |
| // TODO: this is very awkward without logical And/Or operators. |
| TEST(Reductions, ReduceAnyAll) { |
| VarHandle searchValue("searchValue", kInt); |
| BufHandle b("b", {4, 10}, kInt); |
| |
  // (a == 1) ? 1 : b -- once any element has matched, the accumulator stays 1.
  Reducer anyEqSV(ExprHandle(0), [](ExprHandle a, ExprHandle b) {
    return CompareSelect::make(a, 1, 1, b, kEQ);
  });
| |
| Tensor any = Reduce( |
| "anyEqual", |
| {4}, |
| anyEqSV, |
| [&](const auto& i, const auto& j) { |
| return CompareSelect::make(b.load(i, j), searchValue, kEQ); |
| }, |
| {10}); |
| |
| LoopNest loop({any}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, any, searchValue}); |
| |
| std::vector<int> in(40, 0); |
| std::vector<int> out(4, 0); |
| |
  // The input holds 0..39 laid out as 4 rows of 10.
| for (const auto i : c10::irange(40)) { |
| in[i] = i; |
| } |
| cg.call({in, out, 1}); |
| |
  // Only the first row (0..9) contains 1.
| ASSERT_EQ(out[0], 1); |
| ASSERT_EQ(out[1], 0); |
| ASSERT_EQ(out[2], 0); |
| ASSERT_EQ(out[3], 0); |
| |
| cg.call({in, out, 15}); |
| |
  // 15 is in the second row (10..19).
| ASSERT_EQ(out[0], 0); |
| ASSERT_EQ(out[1], 1); |
| ASSERT_EQ(out[2], 0); |
| ASSERT_EQ(out[3], 0); |
| |
  // (a == 0) ? 0 : b -- once any element has failed, the accumulator stays 0.
  Reducer allGTSV(ExprHandle(1), [](ExprHandle a, ExprHandle b) {
    return CompareSelect::make(a, 0, 0, b, kEQ);
  });
| |
| Tensor allGreaterThan = Reduce( |
| "allGreaterThan", |
| {4}, |
| allGTSV, |
| [&](const auto& i, const auto& j) { |
| return CompareSelect::make(b.load(i, j), searchValue, kGT); |
| }, |
| {10}); |
| |
| LoopNest loop2({allGreaterThan}); |
| loop2.prepareForCodegen(); |
| s = loop2.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg2(s, {b, allGreaterThan, searchValue}); |
| |
| cg2.call({in, out, 11}); |
| |
  // Only rows 2 and 3 (20..39) are entirely greater than 11.
| ASSERT_EQ(out[0], 0); |
| ASSERT_EQ(out[1], 0); |
| ASSERT_EQ(out[2], 1); |
| ASSERT_EQ(out[3], 1); |
| |
| cg2.call({in, out, -3}); |
| |
  // Every element is greater than -3.
| ASSERT_EQ(out[0], 1); |
| ASSERT_EQ(out[1], 1); |
| ASSERT_EQ(out[2], 1); |
| ASSERT_EQ(out[3], 1); |
| } |
| |
| TEST(Reductions, ReduceMatmul2D) { |
| BufHandle tA("tA", {3, 2}, kFloat); |
| BufHandle tB("tB", {2, 3}, kFloat); |
| |
| std::vector<float> tA_(6); |
| std::vector<float> tB_(6); |
| |
| std::vector<float> out(9, -1.f); |
| for (const auto i : c10::irange(3)) { |
| for (const auto j : c10::irange(2)) { |
| tA_[i * 2 + j] = i * 2 + j; |
| tB_[j * 3 + i] = i * 2 + j; |
| } |
| } |
| |
| Tensor mm = Reduce( |
| "mm", |
| {3, 3}, |
| Sum(), |
| [&](const ExprHandle& m, const ExprHandle& n, const ExprHandle& k) { |
| return tA.load(m, k) * tB.load(k, n); |
| }, |
| {2}); |
| |
| LoopNest loop({mm}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {tA, tB, mm}); |
| cg.call({tA_, tB_, out}); |
| |
| std::vector<float> expected( |
| {1.f, 3.f, 5.f, 3.f, 13.f, 23.f, 5.f, 23.f, 41.f}); |
| |
| for (const auto i : c10::irange(9)) { |
| ASSERT_EQ(out[i], expected[i]); |
| } |
| } |
| |
// Emulate an rfactor by hand: reduce 2D -> 1D into an intermediate buffer,
// then reduce the intermediate to a scalar.
TEST(Reductions, ReduceRfactorLike) {
| BufHandle in("in", {10, 10}, kFloat); |
| std::vector<float> in_(100); |
| for (const auto i : c10::irange(100)) { |
| in_[i] = i; |
| } |
| std::vector<float> in_rf_(10, -2.f); |
| std::vector<float> out(1, -1.f); |
| |
| Tensor l1 = Reduce("l1", {10}, Sum(), in, {10}); |
| BufHandle in_rf(l1.buf()); |
| |
| Tensor l2 = Reduce("l2", {}, Sum(), in_rf, {10}); |
| |
| LoopNest loop({l1, l2}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {in, l1, l2}); |
| cg.call({in_, in_rf_, out}); |
| |
| ASSERT_EQ(out[0], 99 * 50); |
| } |
| |
| TEST(Reductions, ReduceAsProducer) { |
| const int M = 10; |
| VarHandle m("m", kInt); |
| |
| BufHandle a("a", {2, 3}, kFloat); |
| BufHandle b("b", {2, 3, m}, kFloat); |
| |
| Tensor c = Reduce("sum", {2, 3}, Sum(), b, {m}); |
| Tensor d = |
| Compute("scale", {2, 3}, [&](const VarHandle& l, const VarHandle& n) { |
| return c.load(l, n) * a.load(l, n); |
| }); |
| LoopNest loop({d}, {c, d}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {a, b, d, m}); |
| |
| std::vector<float> aData(2 * 3, 0); |
| std::vector<float> bData(2 * 3 * M, 0); |
| std::vector<float> dData(2 * 3, 6.0f); |
| |
| for (int i = 0; i < 2 * 3; ++i) { |
| aData[i] = 6 - i; |
| for (const auto j : c10::irange(M)) { |
| bData[i * M + j] = j; |
| } |
| } |
| |
| cg.call({aData, bData, dData, M}); |
| float expected = 0; |
| for (const auto i : c10::irange(M)) { |
| // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) |
| expected += i; |
| } |
| for (int i = 0; i < 2 * 3; ++i) { |
| ASSERT_EQ(dData[i], expected * (6 - i)); |
| } |
| } |
| |
| TEST(Reductions, ReduceAsConsumer) { |
| const int M = 10; |
| VarHandle m("m", kInt); |
| |
| BufHandle a("a", {2, 3, m}, kFloat); |
| BufHandle b("b", {2, 3, m}, kFloat); |
| |
| Tensor c = Compute( |
| "scale", |
| {2, 3, m}, |
| [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { |
| return b.load(l, n, m) * a.load(l, n, m); |
| }); |
| Tensor d = Reduce("sum", {2}, Sum(), c, {3, m}); |
| LoopNest loop({d}, {c, d}); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {a, b, d, m}); |
| |
| std::vector<float> aData(2 * 3 * M, 0); |
| std::vector<float> bData(2 * 3 * M, 0); |
| std::vector<float> dData(2, 6.0f); |
| |
| for (int i = 0; i < 2 * 3; ++i) { |
| for (const auto j : c10::irange(M)) { |
| bData[i * M + j] = j + 1; |
| aData[i * M + j] = 6 - i; |
| } |
| } |
| |
| cg.call({aData, bData, dData, M}); |
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) |
| float expected[2] = {0, 0}; |
| for (const auto i : c10::irange(2)) { |
| for (const auto j : c10::irange(3)) { |
| for (const auto k : c10::irange(M)) { |
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) |
| expected[i] += (k + 1) * (6 - (i * 3 + j)); |
| } |
| } |
| } |
| |
| for (const auto i : c10::irange(2)) { |
| ASSERT_EQ(dData[i], expected[i]); |
| } |
| } |
| |
| TEST(Reductions, SplitReduceAxis) { |
| BufHandle in("in", {16, 8}, kFloat); |
| |
| std::vector<float> in_(16 * 8); |
| for (const auto i : c10::irange(16)) { |
| for (const auto j : c10::irange(8)) { |
| in_[i * 8 + j] = i; |
| } |
| } |
| std::vector<float> out(16, -1.f); |
| |
| Tensor tensor = Reduce("sum", {16}, Sum(), in, {8}); |
| LoopNest l({tensor}); |
| std::vector<ForPtr> loops = l.getLoopStmtsFor(tensor); |
| LoopNest::splitWithTail(loops[1], 2); |
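  // Schematically (loop names illustrative), the reduce loop becomes:
  //   for (j_outer = 0; j_outer < 4; j_outer++)
  //     for (j_inner = 0; j_inner < 2; j_inner++)
  //       sum[i] += in[i, 2 * j_outer + j_inner];
  // 8 divides evenly by 2, so no tail loop is generated.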
| |
| l.prepareForCodegen(); |
| |
| StmtPtr s = l.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {in, tensor}); |
| cg.call({in_, out}); |
| |
| for (const auto i : c10::irange(16)) { |
| ASSERT_EQ(out[i], i * 8); |
| } |
| } |
| |
| TEST(Reductions, SplitNonReduceAxis) { |
| BufHandle in("in", {16, 8}, kFloat); |
| |
| std::vector<float> in_(16 * 8); |
| for (const auto i : c10::irange(16)) { |
| for (const auto j : c10::irange(8)) { |
| in_[i * 8 + j] = i; |
| } |
| } |
| std::vector<float> out(16, -1.f); |
| Tensor tensor = Reduce("sum", {16}, Sum(), in, {8}); |
| LoopNest l({tensor}); |
| std::vector<ForPtr> loops = l.getLoopStmtsFor(tensor); |
| LoopNest::splitWithTail(loops[0], 2); |
| LoopNest::splitWithTail(loops[0], 2); |
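  // Both splits apply to the output axis; the reduce loop itself is untouched,
  // so the per-row sums must be unchanged.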
| |
| l.prepareForCodegen(); |
| |
| StmtPtr s = l.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {in, tensor}); |
| cg.call({in_, out}); |
| |
| for (const auto i : c10::irange(16)) { |
| ASSERT_EQ(out[i], i * 8); |
| } |
| } |
| |
| TEST(Reductions, ReorderedReductionInitializer) { |
  /* Motivating pattern (from the original design note), scaled down below:
    for k in 0..1: // blockIdx
      for m in 0..128:
        for n in 0..64: // threadIdx
          SumOp(c(k, n), 0, a(k, m, n), {m})
  */
| |
| BufHandle in("in", {1, 12, 6}, kFloat); |
| std::vector<float> in_(12 * 6, 1.f); |
| |
| Tensor tensor_ = Reduce("sum", {1, 12}, Sum(), in, {6}); |
| LoopNest l_({tensor_}); |
| |
| l_.prepareForCodegen(); |
| StmtPtr s_ = Stmt::clone(l_.root_stmt()); |
| s_ = IRSimplifier::simplify(s_); |
| |
| Tensor tensor = Reduce("sum", {1, 12}, Sum(), in, {6}); |
| LoopNest l({tensor}); |
| |
| auto loops = l.getLoopStmtsFor(tensor); |
| loops[0]->set_gpu_block_index(0); |
| loops[1]->set_gpu_thread_index(0); |
| |
| LoopNest::reorderAxis(loops[1], loops[2]); |
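  // After the reorder, the reduce axis sits outside an output axis, so the
  // reduction's initializer can no longer live in the innermost loop; the
  // result must still match the unscheduled version.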
| |
| StmtPtr s = l.root_stmt(); |
| // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores) |
| s = IRSimplifier::simplify(s); |
| |
| l.prepareForCodegen(); |
| |
| s = l.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| std::vector<float> out1(16, -1.f); |
| SimpleIREvaluator cg(s_, {in, tensor_}); |
| cg.call({in_, out1}); |
| |
| std::vector<float> out2(16, -1.f); |
| SimpleIREvaluator cg2(s, {in, tensor}); |
| cg2.call({in_, out2}); |
| |
| for (const auto i : c10::irange(16)) { |
| ASSERT_EQ(out1[i], out2[i]); |
| } |
| } |
| |
| TEST(Reductions, ReduceRfactor) { |
| const int M = 10; |
| const int N = 10; |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| |
| BufHandle b("b", {m, n}, kFloat); |
| std::vector<float> in(M * N); |
| for (int j = 0; j < M * N; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor c = Reduce("sum", {}, Sum(), b, {m, n}); |
| LoopNest loop({c}); |
| std::vector<ForPtr> loops = loop.getLoopStmtsFor(c); |
| auto c_body = loop.getAllWritesToBuf(c.buf())[1]; |
| ASSERT_TRUE(loop.rfactor(c_body, loops.at(0))); |
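  // rfactor splits the reduction into a partial reduction into a temporary
  // buffer plus a final reduction over it, so there should now be two
  // ReduceOps.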
| auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| ASSERT_EQ(rc.size(), 2); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, m, n}); |
| |
| cg.call({in, out, M, N}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| |
| TEST(Reductions, Reduce3DRfactorInner) { |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| VarHandle k("k", kInt); |
| |
| BufHandle b("b", {m, n, k}, kFloat); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor c = Reduce("sum", {}, Sum(), b, {m, n, k}); |
| LoopNest loop({c}); |
| std::vector<ForPtr> loops = loop.getLoopStmtsFor(c); |
| auto c_body = loop.getAllWritesToBuf(c.buf())[1]; |
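  // rfactor over the innermost reduction loop is rejected, leaving the single
  // original ReduceOp in place.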
| ASSERT_FALSE(loop.rfactor(c_body, loops.at(2))); |
| auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| ASSERT_EQ(rc.size(), 1); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, m, n, k}); |
| |
| cg.call({in, out, M, N, K}); |
| ASSERT_EQ(out[0], 499500); |
| } |
| |
| TEST(Reductions, Reduce3DRfactorOuter) { |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| VarHandle k("k", kInt); |
| |
| BufHandle b("b", {m, n, k}, kFloat); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor c = Reduce("sum", {}, Sum(), b, {m, n, k}); |
| LoopNest loop({c}); |
| std::vector<ForPtr> loops = loop.getLoopStmtsFor(c); |
| auto c_body = loop.getAllWritesToBuf(c.buf())[1]; |
| ASSERT_TRUE(loop.rfactor(c_body, loops.at(0))); |
| auto rc = NodeFinder<ReduceOp>::find(loop.root_stmt()); |
| ASSERT_EQ(rc.size(), 2); |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c, m, n, k}); |
| cg.call({in, out, M, N, K}); |
| ASSERT_EQ(out[0], 499500); |
| } |
| |
| TEST(Reductions, ReduceRepeatedInternalRfactor) { |
| BufHandle in_("in_", {2, 3, 4, 5, 6}, kFloat); |
| const int InputSize = 2 * 3 * 4 * 5 * 6; |
| |
| std::vector<float> in(InputSize, 1.f); |
| std::vector<float> out(1, -1.f); |
| std::vector<float> ref(1, -1.f); |
| |
| Tensor c = Reduce("sum", {}, Sum(), in_, {2, 3, 4, 5, 6}); |
| LoopNest orig_loop({c}); |
| |
  // Try rfactoring the first rfac_number reduction loops in turn.
| for (const auto rfac_number : c10::irange(1, 5)) { |
| LoopNest refloop(orig_loop); |
| LoopNest loop(orig_loop); |
| refloop.prepareForCodegen(); |
| SimpleIREvaluator ref_cg( |
| IRSimplifier::simplify(refloop.root_stmt()), {in_, c}); |
| ref_cg.call({in, ref}); |
| |
| BufPtr tmp_buf = c.buf(); |
| |
| for (const auto idx : c10::irange(rfac_number)) { |
| auto reduce = loop.getAllWritesToBuf(tmp_buf)[1]; |
| ASSERT_TRUE(loop.rfactor( |
| reduce, loop.getLoopStmtsFor(tmp_buf).at(idx), &tmp_buf)); |
| } |
| |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {in_, c}); |
| cg.call({in, out}); |
| |
| ASSERT_EQ(ref[0], out[0]); |
| } |
| } |
| |
| // Split a reduction axis with a tail loop. |
| TEST(Reductions, ReduceSplitTail) { |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| |
| BufHandle b("b", {M, N, K}, kFloat); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| for (const auto i : c10::irange(3)) { |
| std::vector<float> out(M, -1.f); |
| |
| Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); |
| LoopNest loop({c}); |
| std::vector<ForPtr> loops = loop.getLoopStmtsFor(c); |
| LoopNest::splitWithTail(loops[i], 8); |
| |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| } |
| |
| // Split a reduction axis cleanly so there is no tail loop. |
| TEST(Reductions, ReduceSplitNoTail) { |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| BufHandle b("b", {M, N, K}, kFloat); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| for (const auto i : c10::irange(3)) { |
| std::vector<float> out(M, -1.f); |
| |
| Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); |
| LoopNest loop({c}); |
| std::vector<ForPtr> loops = loop.getLoopStmtsFor(c); |
| LoopNest::splitWithTail(loops[i], 5); |
| |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| } |
| |
// Split a reduction axis with only a tail loop (the split's main loop has size
// 0 and is eliminated).
| TEST(Reductions, ReduceOverSplitTail) { |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| |
| BufHandle b("b", {M, N, K}, kFloat); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| for (const auto i : c10::irange(3)) { |
| std::vector<float> out(M, -1.f); |
| |
| Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); |
| LoopNest loop({c}); |
| std::vector<ForPtr> loops = loop.getLoopStmtsFor(c); |
| LoopNest::splitWithTail(loops[i], 16); |
| |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| } |
| |
| // Split a reduction axis with a mask. |
| TEST(Reductions, ReduceSplitMask) { |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| |
| BufHandle b("b", {M, N, K}, kFloat); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| for (const auto i : c10::irange(3)) { |
| std::vector<float> out(M, -1.f); |
| |
| Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); |
| LoopNest loop({c}); |
| std::vector<ForPtr> loops = loop.getLoopStmtsFor(c); |
| LoopNest::splitWithMask(loops[i], 8); |
| |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| } |
| |
| // Split a reduction axis cleanly not requiring a mask. |
| TEST(Reductions, ReduceSplitNoMask) { |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| BufHandle b("b", {M, N, K}, kFloat); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| for (const auto i : c10::irange(3)) { |
| std::vector<float> out(M, -1.f); |
| |
| Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); |
| LoopNest loop({c}); |
| std::vector<ForPtr> loops = loop.getLoopStmtsFor(c); |
| LoopNest::splitWithMask(loops[i], 5); |
| |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| } |
| |
| // Split a reduction axis with all logic in the mask. |
| TEST(Reductions, ReduceOverSplitMask) { |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| |
| BufHandle b("b", {M, N, K}, kFloat); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| for (const auto i : c10::irange(3)) { |
| std::vector<float> out(M, -1.f); |
| |
| Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); |
| LoopNest loop({c}); |
| std::vector<ForPtr> loops = loop.getLoopStmtsFor(c); |
| LoopNest::splitWithMask(loops[i], 16); |
| |
| loop.prepareForCodegen(); |
| StmtPtr s = loop.root_stmt(); |
| s = IRSimplifier::simplify(s); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| } |
| } |
| |
| // Test an rfactor when there are two ReduceOps in the graph due to a |
| // splitWithTail. |
| TEST(Reductions, ReduceSplitRfactor) { |
| const int M = 2; |
| const int N = 10; |
| const int K = 10; |
| const int SPLIT_FACTOR = 4; |
| |
| BufHandle b("b", {M, N, K}, kFloat); |
| std::vector<float> in(M * N * K); |
| for (const auto m : c10::irange(M)) { |
| for (int j = 0; j < N * K; ++j) { |
| in[m * N * K + j] = j; |
| } |
| } |
| |
| std::vector<float> out(M, -1.f); |
| |
| Tensor c = Reduce("sum", {M}, Sum(), b, {N, K}); |
| LoopNest loop({c}); |
| std::vector<ForPtr> loops = loop.getLoopStmtsFor(c); |
| LoopNest::splitWithTail(loops[2], SPLIT_FACTOR); |
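  // Splitting K=10 by 4 produces a main loop (2 x 4) plus a tail of 2, so c
  // now has three writes: the initializer, the main ReduceOp, and the tail
  // ReduceOp.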
| |
| auto c_body = loop.getAllWritesToBuf(c.buf())[2]; |
| auto all_loops = loop.getAllLoopNestsWritingToBuf(c.buf()); |
| ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(2).size() == 3); |
| LoopNest::reorderAxis(all_loops[2][1], all_loops[2][2]); |
| all_loops = loop.getAllLoopNestsWritingToBuf(c.buf()); |
| ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(2).size() == 3); |
| ASSERT_TRUE(loop.rfactor(c_body, all_loops[2][1])); |
| loop.prepareForCodegen(); |
| loop.simplify(); |
| StmtPtr s = loop.root_stmt(); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
  for (const auto i : c10::irange(M)) {
    ASSERT_EQ(out[i], 4950);
  }
| } |
| |
| // Test an rfactor which ends up being eliminated since the total loop size is |
| // smaller than the split factor. |
| TEST(Reductions, ReduceOverSplitRfactor) { |
| const int N = 10; |
| const int K = 10; |
| const int SPLIT_FACTOR = 16; |
| |
| BufHandle b("b", {N, K}, kFloat); |
| std::vector<float> in(N * K); |
| for (int j = 0; j < N * K; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor c = Reduce("sum", {}, Sum(), b, {N, K}); |
| LoopNest loop({c}); |
| std::vector<ForPtr> loops = loop.getLoopStmtsFor(c); |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| ForPtr i, t; |
| LoopNest::splitWithTail(loops[1], SPLIT_FACTOR, &i, &t); |
| LoopNest::reorderAxis(loops[0], i); |
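  // K=10 is smaller than SPLIT_FACTOR=16, so the split's main loop has extent
  // zero and every iteration lands in the tail loop.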
| |
| auto all_loops = loop.getAllLoopNestsWritingToBuf(c.buf()); |
| ASSERT_TRUE(all_loops.size() == 3 && all_loops.at(1).size() == 3); |
| auto c_body = loop.getAllWritesToBuf(c.buf())[1]; |
| ASSERT_TRUE(loop.rfactor(c_body, all_loops[1][0])); |
| LoopNest::reorderAxis(all_loops[1][0], all_loops[1][2]); |
| |
| loop.prepareForCodegen(); |
| loop.simplify(); |
| StmtPtr s = loop.root_stmt(); |
| |
| SimpleIREvaluator cg(s, {b, c}); |
| |
| cg.call({in, out}); |
| ASSERT_EQ(out[0], 4950); |
| |
| std::ostringstream oss; |
| oss << *cg.stmt(); |
| |
| // Check the IR to verify the rfactored reduce is eliminated. |
| // TODO: The alloc free should be eliminated here since it is size 0. |
| /* |
| const std::string& verification_pattern = |
| R"IR( |
| # CHECK: Allocate(tmp_buf); // dtype=float, dims=[0] |
| # CHECK: sum[0] = 0.f; |
| # CHECK: for (int n = 0; n < 10; n++) { |
| # CHECK: for (int k_tail = 0; k_tail < 10; k_tail++) { |
| # CHECK: sum[0] = (sum[0]) + (b[k_tail + 10 * n]); |
| # CHECK: } |
| # CHECK: } |
| # CHECK: Free(tmp_buf);)IR"; |
| */ |
| // TODO: rfactor output is not consistent yet, will fix (@nickg). |
| // torch::jit::testing::FileCheck().run(verification_pattern, oss.str()); |
| } |
| |
| TEST(Reductions, ReduceInlineReduction) { |
| const int M = 4; |
| const int N = 5; |
| const int K = 6; |
| |
| BufHandle a_buf("a", {M}, kFloat); |
| BufHandle b_buf("b", {M, N, K}, kFloat); |
| |
| Tensor x = Reduce("x", {M}, Sum(), b_buf, {N, K}); |
| Tensor y = Compute( |
| "y", {M}, [&](const VarHandle& m) { return a_buf.load(m) + x.load(m); }); |
| |
| PaddedBuffer<float> a_v(M); |
| PaddedBuffer<float> b_v(M, N, K); |
| |
| for (const auto i : c10::irange(M)) { |
| a_v(i) = i * i; |
| } |
| for (const auto i : c10::irange(M)) { |
| for (const auto j : c10::irange(N)) { |
| for (const auto k : c10::irange(K)) { |
| b_v(i, j, k) = j * j * k; |
| } |
| } |
| } |
| |
| LoopNest l1({y}, {x, y}); |
| // Cannot inline a reduction computation |
| ASSERT_FALSE(l1.computeInline(x.buf())); |
| } |
| |
| TEST(Reductions, ReduceInlineConsumer) { |
| const int M = 4; |
| const int N = 5; |
| const int K = 6; |
| |
| BufHandle a_buf("a", {M, N, K}, kFloat); |
| BufHandle b_buf("b", {M, N, K}, kFloat); |
| |
| Tensor x = Compute( |
| "x", |
| {M, N, K}, |
| [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { |
| return a_buf.load(m, n, k) + b_buf.load(m, n, k); |
| }); |
| Tensor y = Reduce("y", {M}, Sum(), x, {N, K}); |
| |
| PaddedBuffer<float> a_v(M, N, K); |
| PaddedBuffer<float> b_v(M, N, K); |
| |
| for (const auto i : c10::irange(M)) { |
| for (const auto j : c10::irange(N)) { |
| for (const auto k : c10::irange(K)) { |
| a_v(i, j, k) = i * i + k; |
| b_v(i, j, k) = j * j + k; |
| } |
| } |
| } |
| |
| LoopNest l1({y}, {x, y}); |
| LoopNest l2(l1); |
| l2.computeInline(x.buf()); |
| |
| l1.prepareForCodegen(); |
| l2.prepareForCodegen(); |
| |
| StmtPtr stmt1 = IRSimplifier::simplify(l1.root_stmt()); |
| StmtPtr stmt2 = IRSimplifier::simplify(l2.root_stmt()); |
| |
| SimpleIREvaluator eval1(stmt1, {a_buf, b_buf, y}); |
| SimpleIREvaluator eval2(stmt2, {a_buf, b_buf, y}); |
| |
| PaddedBuffer<float> y_1(M); |
| PaddedBuffer<float> y_2(M); |
| |
| eval1(a_v, b_v, y_1); |
| eval2(a_v, b_v, y_2); |
| ExpectAllNear(y_1, y_2, 1e-5); |
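  // The inlined program should print strictly shorter than the original.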
| std::ostringstream oss1, oss2; |
| oss1 << *stmt1; |
| oss2 << *stmt2; |
| ASSERT_GT(oss1.str().size(), oss2.str().size()); |
| } |
| |
| TEST(Reductions, ReduceInlineReducerInternal) { |
| const int M = 4; |
| const int N = 5; |
| const int K = 6; |
| |
| BufHandle a_buf("a", {M, N, K}, kFloat); |
| BufHandle b_buf("b", {M, N, K}, kFloat); |
| |
| Tensor x = Compute( |
| "x", |
| {M, N, K}, |
| [&](const VarHandle& m, const VarHandle& n, const VarHandle& k) { |
| return a_buf.load(m, n, k) + b_buf.load(m, n, k); |
| }); |
| |
| Reducer minimum(ExprHandle(0.f), [&](ExprHandle a, ExprHandle b) { |
| return Add::make(ExprHandle(1.f), Min::make(a, b, false)); |
| }); |
| Tensor y = Reduce("y", {M}, minimum, x, {N, K}); |
| |
| PaddedBuffer<float> a_v(M, N, K); |
| PaddedBuffer<float> b_v(M, N, K); |
| |
| for (const auto i : c10::irange(M)) { |
| for (const auto j : c10::irange(N)) { |
| for (const auto k : c10::irange(K)) { |
| a_v(i, j, k) = i * i + k; |
| b_v(i, j, k) = j * j + k; |
| } |
| } |
| } |
| |
| LoopNest l1({y}, {x, y}); |
| LoopNest l2(l1); |
| l2.computeInline(x.buf()); |
| |
| l1.prepareForCodegen(); |
| l2.prepareForCodegen(); |
| |
| StmtPtr stmt1 = IRSimplifier::simplify(l1.root_stmt()); |
| StmtPtr stmt2 = IRSimplifier::simplify(l2.root_stmt()); |
| |
| SimpleIREvaluator eval1(stmt1, {a_buf, b_buf, y}); |
| SimpleIREvaluator eval2(stmt2, {a_buf, b_buf, y}); |
| |
| PaddedBuffer<float> y_1(M); |
| PaddedBuffer<float> y_2(M); |
| |
| eval1(a_v, b_v, y_1); |
| eval2(a_v, b_v, y_2); |
| ExpectAllNear(y_1, y_2, 1e-5); |
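  // The inlined program should print strictly shorter than the original.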
| std::ostringstream oss1, oss2; |
| oss1 << *stmt1; |
| oss2 << *stmt2; |
| ASSERT_GT(oss1.str().size(), oss2.str().size()); |
| } |
| |
| TEST(Reductions, ReductionCacheAccessesOperatorAxis) { |
| int L = 4; |
| int N = 3; |
| int M = 2; |
| |
| BufHandle a("a", {L, N, M}, kFloat); |
| BufHandle b("b", {L, N, M}, kFloat); |
| |
| Tensor c = Compute( |
| "scale", |
| {L, N, M}, |
| [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { |
| return b.load(l, n, m) * a.load(l, n, m); |
| }); |
| Tensor d = Reduce("sum", {L}, Sum(), c, {N, M}); |
| |
| Tensor e = Compute("scale", {L}, [&](const VarHandle& l) { |
| return b.load(0, 0, l) * d.load(l); |
| }); |
| |
| LoopNest l({e}, {c, d, e}); |
| LoopNest l_before(l); |
| l_before.prepareForCodegen(); |
| SimpleIREvaluator cg_before( |
| LoopNest::sanitizeNames(l_before.root_stmt()), {a, b, e}); |
| |
| StmtPtr d_loop = l.getLoopStmtsFor(d)[0]; |
| l.cacheAccesses(d.buf(), "d_local", d_loop); |
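  // Caching at the output ("operator") axis stages the whole sum buffer: the
  // cache below is allocated with one element per output index (dims=[4]).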
| l.prepareForCodegen(); |
| |
| StmtPtr result = |
| LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); |
| SimpleIREvaluator cg_after(result, {a, b, e}); |
| |
| std::ostringstream oss; |
| oss << *cg_after.stmt(); |
| const std::string& expected_ir = |
| R"IR( |
| #CHECK: Allocate(d_local); // dtype=float, dims=[4] |
| #CHECK: for (int i_2 |
| #CHECK: d_local[i_2] = 0.f |
| #CHECK: for (int |
| #CHECK: for (int |
| #CHECK: d_local[i_2] = (d_local[i_2]) + (scale[ |
| #CHECK: } |
| #CHECK: } |
| #CHECK: } |
| #CHECK: for (int i_3 |
| #CHECK: sum[i_3] = d_local[i_3] |
| #CHECK: Free(d_local); |
| #CHECK-NOT: d_local |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| |
| PaddedBuffer<float> a_v(L, M, N, "a"); |
| PaddedBuffer<float> b_v(L, M, N, "b"); |
| PaddedBuffer<float> c_v(L, M, N, "c"); |
| PaddedBuffer<float> d_v(L, "d"); |
| PaddedBuffer<float> e_before(L, "e_before"); |
| PaddedBuffer<float> e_after(L, "e_after"); |
| |
| for (const auto l : c10::irange(L)) { |
| for (const auto m : c10::irange(M)) { |
| for (const auto n : c10::irange(N)) { |
| a_v(l, m, n) = at::randn({1}).item().to<float>(); |
| b_v(l, m, n) = at::randn({1}).item().to<float>(); |
| } |
| } |
| } |
| |
| cg_before.call({a_v, b_v, e_before}); |
| cg_after.call({a_v, b_v, e_after}); |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) |
| ExpectAllNear(e_before, e_after, 1e-5); |
| } |
| |
| TEST(Reductions, ReductionCacheAccessesOuterReduceAxis) { |
| int L = 4; |
| int N = 3; |
| int M = 2; |
| |
| BufHandle a("a", {L, N, M}, kFloat); |
| BufHandle b("b", {L, N, M}, kFloat); |
| |
| Tensor c = Compute( |
| "scale", |
| {L, N, M}, |
| [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { |
| return b.load(l, n, m) * a.load(l, n, m); |
| }); |
| Tensor d = Reduce("sum", {L}, Sum(), c, {N, M}); |
| |
| Tensor e = Compute("scale", {L}, [&](const VarHandle& l) { |
| return b.load(0, 0, l) * d.load(l); |
| }); |
| |
| LoopNest l({e}, {c, d, e}); |
| LoopNest l_before(l); |
| l_before.prepareForCodegen(); |
| SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e}); |
| |
| StmtPtr d_loop = l.getLoopStmtsFor(d)[1]; |
| l.cacheAccesses(d.buf(), "d_local", d_loop); |
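  // Caching inside the outer reduce axis yields a one-element cache that is
  // loaded from and written back to sum[i_1] around the reduction loops.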
| l.prepareForCodegen(); |
| |
| StmtPtr result = |
| LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); |
| SimpleIREvaluator cg_after(result, {a, b, e}); |
| |
| std::ostringstream oss; |
| oss << *cg_after.stmt(); |
| const std::string& expected_ir = |
| R"IR( |
| #CHECK: Allocate(d_local); // dtype=float, dims=[1] |
| #CHECK: sum[i_1] = 0 |
| #CHECK: d_local[0] = sum[i_1] |
| #CHECK: for (int j_1 |
| #CHECK: for (int k_1 |
| #CHECK: d_local[0] = (d_local[0]) + (scale[ |
| #CHECK: } |
| #CHECK: } |
| #CHECK: sum[i_1] = d_local[0] |
| #CHECK: Free(d_local); |
| #CHECK-NOT: d_local |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| |
| PaddedBuffer<float> a_v(L, M, N, "a"); |
| PaddedBuffer<float> b_v(L, M, N, "b"); |
| PaddedBuffer<float> c_v(L, M, N, "c"); |
| PaddedBuffer<float> d_v(L, "d"); |
| PaddedBuffer<float> e_before(L, "e_before"); |
| PaddedBuffer<float> e_after(L, "e_after"); |
| |
| for (const auto l : c10::irange(L)) { |
| for (const auto m : c10::irange(M)) { |
| for (const auto n : c10::irange(N)) { |
| a_v(l, m, n) = at::randn({1}).item().to<float>(); |
| b_v(l, m, n) = at::randn({1}).item().to<float>(); |
| } |
| } |
| } |
| |
| cg_before.call({a_v, b_v, e_before}); |
| cg_after.call({a_v, b_v, e_after}); |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) |
| ExpectAllNear(e_before, e_after, 1e-5); |
| } |
| |
| TEST(Reductions, ReductionCacheAccessesInnerReduceAxis) { |
| int L = 4; |
| int N = 3; |
| int M = 2; |
| |
| BufHandle a("a", {L, N, M}, kFloat); |
| BufHandle b("b", {L, N, M}, kFloat); |
| |
| Tensor c = Compute( |
| "scale", |
| {L, N, M}, |
| [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { |
| return b.load(l, n, m) * a.load(l, n, m); |
| }); |
| Tensor d = Reduce("sum", {L}, Sum(), c, {N, M}); |
| |
| Tensor e = Compute("scale", {L}, [&](const VarHandle& l) { |
| return b.load(0, 0, l) * d.load(l); |
| }); |
| |
| LoopNest l({e}, {c, d, e}); |
| LoopNest l_before(l); |
| l_before.prepareForCodegen(); |
| SimpleIREvaluator cg_before(l_before.root_stmt(), {a, b, e}); |
| |
| StmtPtr d_loop = l.getLoopStmtsFor(d)[2]; |
| l.cacheAccesses(d.buf(), "d_local", d_loop); |
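  // Caching at the innermost reduce axis accumulates partial sums in
  // d_local[0] and folds each one into sum[i_1] per outer reduce iteration.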
| l.prepareForCodegen(); |
| |
| StmtPtr result = |
| LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); |
| SimpleIREvaluator cg_after(result, {a, b, e}); |
| |
| std::ostringstream oss; |
| oss << *cg_after.stmt(); |
| const std::string& expected_ir = |
| R"IR( |
| #CHECK: Allocate(d_local); // dtype=float, dims=[1] |
| #CHECK: sum[i_1] = 0 |
| #CHECK: for (int |
| #CHECK: d_local[0] = 0 |
| #CHECK: for (int |
| #CHECK: d_local[0] = (d_local[0]) + (scale[ |
| #CHECK: } |
| #CHECK: sum[i_1] = (sum[i_1]) + (d_local[0]) |
| #CHECK: } |
| #CHECK: Free(d_local); |
| #CHECK-NOT: d_local |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| |
| PaddedBuffer<float> a_v(L, M, N, "a"); |
| PaddedBuffer<float> b_v(L, M, N, "b"); |
| PaddedBuffer<float> c_v(L, M, N, "c"); |
| PaddedBuffer<float> d_v(L, "d"); |
| PaddedBuffer<float> e_before(L, "e_before"); |
| PaddedBuffer<float> e_after(L, "e_after"); |
| |
| for (const auto l : c10::irange(L)) { |
| for (const auto m : c10::irange(M)) { |
| for (const auto n : c10::irange(N)) { |
| a_v(l, m, n) = at::randn({1}).item().to<float>(); |
| b_v(l, m, n) = at::randn({1}).item().to<float>(); |
| } |
| } |
| } |
| |
| cg_before.call({a_v, b_v, e_before}); |
| cg_after.call({a_v, b_v, e_after}); |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) |
| ExpectAllNear(e_before, e_after, 1e-5); |
| } |
| |
| TEST(Reductions, ReductionCacheBodyAccess) { |
| BufHandle a("a", {24, 32, 12}, kFloat); |
| BufHandle b("b", {24, 32, 12}, kFloat); |
| |
| Tensor c = Compute( |
| "scale", |
| {24, 32, 12}, |
| [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { |
| return b.load(l, n, m) * a.load(l, n, m); |
| }); |
| Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12}); |
| |
| Tensor e = Compute("scale", {24}, [&](const VarHandle& l) { |
| return b.load(0, 0, l) * d.load(l); |
| }); |
| |
| LoopNest l({e}, {c, d, e}); |
| |
| StmtPtr d_loop = l.getLoopStmtsFor(d)[1]; |
| l.cacheAccesses(c.buf(), "scale_local", d_loop); |
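  // This time the producer (scale) is cached rather than the reduction output:
  // a full 32x12 tile is staged locally before the sum consumes it.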
| |
| l.prepareForCodegen(); |
| StmtPtr result = |
| LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); |
| SimpleIREvaluator cg(result, {a, b, e}); |
| |
| std::ostringstream oss; |
| oss << *cg.stmt(); |
| const std::string& expected_ir = |
| R"IR( |
| #CHECK: Allocate(scale_local); // dtype=float, dims=[1, 32, 12] |
| #CHECK: for (int j_1 = 0; j_1 < 32; j_1++) { |
| #CHECK: for (int k_1 = 0; k_1 < 12; k_1++) { |
| #CHECK: scale_local[k_1 + 12 * j_1] = scale[(k_1 + 12 * j_1) + 384 * i_1]; |
| #CHECK: sum[i_1] = (sum[i_1]) + (scale_local[k_2 + 12 * j_2]); |
| #CHECK: scale_1[i_2] = (b[i_2]) * (sum[i_2]); |
| #CHECK: Free(scale_local); |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| } |
| |
| TEST(Reductions, ReductionCacheConsumerAccess) { |
| BufHandle a("a", {24, 32, 12}, kFloat); |
| BufHandle b("b", {24, 32, 12}, kFloat); |
| |
| Tensor c = Compute( |
| "scale", |
| {24, 32, 12}, |
| [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { |
| return b.load(l, n, m) * a.load(l, n, m); |
| }); |
| Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12}); |
| |
| Tensor e = Compute("scale", {24}, [&](const VarHandle& l) { |
| return b.load(0, 0, l) * d.load(l); |
| }); |
| |
| LoopNest l({e}, {c, d, e}); |
| |
| LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4); |
| |
| StmtPtr e_loop = l.getLoopStmtsFor(e)[1]; |
| l.cacheAccesses(d.buf(), "sum_local", e_loop); |
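  // Each 4-wide chunk of the reduction output is staged into sum_local before
  // the consumer reads it.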
| l.prepareForCodegen(); |
| |
| StmtPtr result = |
| LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); |
| SimpleIREvaluator cg(result, {a, b, e}); |
| |
| std::ostringstream oss; |
| oss << *cg.stmt(); |
| const std::string& expected_ir = |
| R"IR( |
| #CHECK: Alias(sum_local,scale); |
| #CHECK: sum[i_1] = (sum[i_1]) + (scale[ |
| #CHECK: for (int j_2 = 0; j_2 < 4 |
| #CHECK: sum_local[j_2] = sum[j_2 + 4 * i_2]; |
| #CHECK: scale_1[j_3 + 4 * i_2] = (b[j_3 + 4 * i_2]) * (sum_local[j_3]); |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| } |
| |
| TEST(Reductions, ReductionSplitCacheConsumerAccess) { |
| BufHandle a("a", {24, 32, 12}, kFloat); |
| BufHandle b("b", {24, 32, 12}, kFloat); |
| |
| Tensor c = Compute( |
| "scale", |
| {24, 32, 12}, |
| [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { |
| return b.load(l, n, m) * a.load(l, n, m); |
| }); |
| Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12}); |
| |
| Tensor e = Compute("scale", {24}, [&](const VarHandle& l) { |
| return b.load(0, 0, l) * d.load(l); |
| }); |
| |
| LoopNest l({e}, {c, d, e}); |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| ForPtr inner; |
| |
| // Split outer reduction axis. |
| LoopNest::splitWithMask(l.getLoopStmtsFor(d)[0], 4, &inner); |
| |
| // Split reduction consumer. |
| LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4, &inner); |
| |
| l.cacheAccesses(d.buf(), "sum_local", inner); |
| l.prepareForCodegen(); |
| |
| StmtPtr result = |
| LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); |
| SimpleIREvaluator cg(result, {a, b, e}); |
| |
  // The reduction changes but the cached access does not.
| std::ostringstream oss; |
| oss << *cg.stmt(); |
| const std::string& expected_ir = |
| R"IR( |
| #CHECK: Alias(sum_local,scale); |
| #CHECK: sum[j_1 + 4 * i_1] = (sum[j_1 + 4 * i_1]) + (scale[((l + 12 * k_1) + 1536 * i_1) + 384 * j_1]); |
| #CHECK: for (int i_2 = 0; i_2 < 6 |
| #CHECK: for (int j_2 = 0; j_2 < 4 |
| #CHECK: sum_local[j_2] = sum[j_2 + 4 * i_2]; |
| #CHECK: for (int j_3 = 0; j_3 < 4 |
| #CHECK: scale_1[j_3 + 4 * i_2] = (b[j_3 + 4 * i_2]) * (sum_local[j_3]); |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| } |
| |
| TEST(Reductions, ReductionReorderCacheConsumerAccess) { |
| BufHandle a("a", {24, 32, 12}, kFloat); |
| BufHandle b("b", {24, 32, 12}, kFloat); |
| |
| Tensor c = Compute( |
| "scale", |
| {24, 32, 12}, |
| [&](const VarHandle& l, const VarHandle& n, const VarHandle& m) { |
| return b.load(l, n, m) * a.load(l, n, m); |
| }); |
| Tensor d = Reduce("sum", {24}, Sum(), c, {32, 12}); |
| |
| Tensor e = Compute("scale", {24}, [&](const VarHandle& l) { |
| return b.load(0, 0, l) * d.load(l); |
| }); |
| |
| LoopNest l({e}, {c, d, e}); |
| |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| ForPtr inner; |
| |
| // reorder outer reduction axes. |
| auto loops = l.getLoopStmtsFor(d); |
| LoopNest::reorderAxis(loops[0], loops[1]); |
| |
| // Split reduction consumer. |
| LoopNest::splitWithMask(l.getLoopStmtsFor(e)[0], 4, &inner); |
| |
| l.cacheAccesses(d.buf(), "sum_local", inner); |
| l.prepareForCodegen(); |
| |
| StmtPtr result = |
| LoopNest::sanitizeNames(IRSimplifier::simplify(l.root_stmt())); |
| SimpleIREvaluator cg(result, {a, b, e}); |
| |
  // Neither the reduction body nor the cache changes.
| std::ostringstream oss; |
| oss << *cg.stmt(); |
| const std::string& expected_ir = |
| R"IR( |
| #CHECK: sum[j_1] = (sum[j_1]) + (scale[(k_1 + 12 * i_2) + 384 * j_1]); |
| #CHECK: for (int i_3 = 0; i_3 < 6; |
| #CHECK: for (int j_2 = 0; j_2 < 4; |
| #CHECK: sum_local[j_2] = sum[j_2 + 4 * i_3]; |
| #CHECK: for (int j_3 = 0; j_3 < 4; |
| #CHECK: scale_1[j_3 + 4 * i_3] = (b[j_3 + 4 * i_3]) * (sum_local[j_3]); |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| } |
| |
| TEST(Reductions, ReductionRfactorCacheTempOuter) { |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| VarHandle k("k", kInt); |
| |
| BufHandle b("B", {m, n, k}, kFloat); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor c = Reduce("sum", {}, Sum(), b, {m, n, k}); |
| LoopNest loop({c}); |
| |
| std::vector<ForPtr> loops = loop.getLoopStmtsFor(c); |
| LoopNest::reorderAxis(loops.at(0), loops.at(1)); |
| loops = loop.getLoopStmtsFor(c); |
| auto c_body = loop.getAllWritesToBuf(c.buf())[1]; |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| BufPtr rfac_buf; |
| ASSERT_TRUE(loop.rfactor(c_body, loops.at(0), &rfac_buf)); |
| loop.distributeLoop(loops.at(0)); |
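  // Distribute the outer loop so the rfactor buffer's initialization,
  // accumulation, and write-back land in separate loop nests.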
| |
| auto all_loops = loop.getAllLoopNestsWritingToBuf(rfac_buf); |
| ASSERT_TRUE(all_loops.size() == 2 && all_loops.at(1).size() == 3); |
| LoopNest::reorderAxis(all_loops[1][0], all_loops[1][1]); |
| |
| all_loops = loop.getAllLoopNestsWritingToBuf(rfac_buf); |
| LoopNest::cacheAccesses(rfac_buf, "tmp", all_loops[1][1]); |
| loop.simplify(); |
| loop.prepareForCodegen(); |
| StmtPtr s = LoopNest::sanitizeNames(loop.root_stmt()); |
| SimpleIREvaluator cg(s, {b, c, m, n, k}); |
| |
| std::ostringstream oss; |
| oss << *cg.stmt(); |
| const std::string& expected_ir = |
| R"IR( |
| #CHECK: Allocate(sum_rfac); // dtype=float, dims=[n] |
| #CHECK: Allocate(tmp); // dtype=float, dims=[n] |
| #CHECK: for (int i_1 = 0; i_1 < m |
| #CHECK: for (int j = 0; j < n |
| #CHECK: tmp[j] = 0 |
| #CHECK: } |
| #CHECK: for (int j_1 = 0; j_1 < n |
| #CHECK: for (int k |
| #CHECK: tmp[j_1] = (tmp[j_1]) + (B[ |
| #CHECK: } |
| #CHECK: } |
| #CHECK: for (int j_2 = 0; j_2 < n |
| #CHECK: sum_rfac[j_2] = (sum_rfac[j_2]) + (tmp[j_2]); |
| #CHECK: } |
| #CHECK: Free(tmp); |
| #CHECK-NOT: tmp |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| |
| cg.call({in, out, M, N, K}); |
| ASSERT_EQ(out[0], 499500); |
| } |
| |
| TEST(Reductions, ReductionRfactorCacheTempInner) { |
| const int M = 10; |
| const int N = 10; |
| const int K = 10; |
| VarHandle m("m", kInt); |
| VarHandle n("n", kInt); |
| VarHandle k("k", kInt); |
| |
| BufHandle b("B", {m, n, k}, kFloat); |
| std::vector<float> in(M * N * K); |
| for (int j = 0; j < M * N * K; ++j) { |
| in[j] = j; |
| } |
| |
| std::vector<float> out(1, -1.f); |
| |
| Tensor c = Reduce("sum", {}, Sum(), b, {m, n, k}); |
| LoopNest loop({c}); |
| std::vector<ForPtr> loops = loop.getLoopStmtsFor(c); |
| auto c_body = loop.getAllWritesToBuf(c.buf())[1]; |
| |
| LoopNest::reorderAxis(loops.at(0), loops.at(1)); |
| loops = loop.getLoopStmtsFor(c); |
| // NOLINTNEXTLINE(cppcoreguidelines-init-variables) |
| BufPtr rfac_buf; |
| ASSERT_TRUE(loop.rfactor(c_body, loops.at(0), &rfac_buf)); |
| loop.distributeLoop(loops.at(0)); |
| auto all_loops = loop.getAllLoopNestsWritingToBuf(rfac_buf); |
| ASSERT_TRUE(all_loops.size() == 2 && all_loops.at(1).size() == 3); |
| LoopNest::reorderAxis(all_loops[1][0], all_loops[1][1]); |
| |
| all_loops = loop.getAllLoopNestsWritingToBuf(rfac_buf); |
| ASSERT_TRUE(all_loops.size() == 2 && all_loops.at(1).size() == 3); |
| LoopNest::cacheAccesses(rfac_buf, "tmp", all_loops[1][2]); |
| loop.prepareForCodegen(); |
| loop.simplify(); |
| StmtPtr s = LoopNest::sanitizeNames(loop.root_stmt()); |
| SimpleIREvaluator cg(s, {b, c, m, n, k}); |
| |
| std::ostringstream oss; |
| oss << *cg.stmt(); |
| const std::string& expected_ir = |
| R"IR( |
| #CHECK: Allocate(sum_rfac); // dtype=float, dims=[n] |
| #CHECK: Allocate(tmp); // dtype=float, dims=[1] |
| #CHECK: for (int i_1 = 0; i_1 < m |
| #CHECK: for (int j = 0; j < n |
| #CHECK: tmp[0] = 0 |
| #CHECK: for (int k |
| #CHECK: tmp[0] = (tmp[0]) + (B[ |
| #CHECK: } |
| #CHECK: sum_rfac[j] = (sum_rfac[j]) + (tmp[0]); |
| #CHECK: Free(tmp); |
| #CHECK-NOT: tmp |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| |
| cg.call({in, out, M, N, K}); |
| ASSERT_EQ(out[0], 499500); |
| } |
| |
| TEST(Reductions, ReductionVectorize) { |
| std::vector<float> in_(8 * 8); |
| for (const auto i : c10::irange(8)) { |
| for (const auto j : c10::irange(8)) { |
| in_[i * 8 + j] = i; |
| } |
| } |
| std::vector<float> out_before(8, -1.f); |
| std::vector<float> out_after(8, -1.f); |
| |
| BufHandle in("in", {8, 8}, kFloat); |
| |
| Tensor tensor = Reduce("sum", {8}, Sum(), in, {8}); |
| LoopNest l_before({tensor}); |
| LoopNest l(l_before); |
| l_before.prepareForCodegen(); |
| SimpleIREvaluator cg_before(l_before.root_stmt(), {in, tensor}); |
| cg_before.call({in_, out_before}); |
| |
| ASSERT_TRUE(LoopNest::vectorize(l.getLoopStmtsFor(tensor)[0])); |
| |
| StmtPtr s = l.root_stmt(); |
| s = LoopNest::sanitizeNames(IRSimplifier::simplify(s)); |
| |
| std::ostringstream oss; |
| oss << *s; |
| const std::string& expected_ir = |
| R"IR( |
| #CHECK: sum[Ramp(0, 1, 8)] = Broadcast(0.f, 8); |
| #CHECK: for (int i = 0; i < 8; i++) { |
| #CHECK: sum[Ramp(0, 1, 8)] = ReduceOp((sum[Ramp(0, 1, 8)]) + (in[Ramp(i, 8, 8)]), reduce_args={i}); |
| #CHECK: } |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| |
| // Vectorizing should not change result. |
| l.prepareForCodegen(); |
| s = IRSimplifier::simplify(l.root_stmt()); |
| SimpleIREvaluator cg_after(s, {in, tensor}); |
| cg_after.call({in_, out_after}); |
| for (const auto i : c10::irange(8)) { |
| ASSERT_EQ(out_before[i], out_after[i]); |
| } |
| } |
| |
| TEST(Reductions, ReductionVectorizeInner) { |
| BufHandle in("in", {8, 8}, kFloat); |
| |
| Tensor tensor = Reduce("sum", {8}, Sum(), in, {8}); |
| LoopNest l({tensor}); |
| |
  // A reduce axis cannot be vectorized directly: every lane would accumulate
  // into the same output element.
  ASSERT_FALSE(LoopNest::vectorize(l.getLoopStmtsFor(tensor)[1]));
| } |
| |
| TEST(Reductions, ReductionVectorizeRfactor) { |
| std::vector<float> in_(8 * 8); |
| for (const auto i : c10::irange(8)) { |
| for (const auto j : c10::irange(8)) { |
| in_[i * 8 + j] = i; |
| } |
| } |
| std::vector<float> out_before(1, -1.f); |
| std::vector<float> out_after(1, -1.f); |
| |
| BufHandle in("in", {8, 8}, kFloat); |
| |
| Tensor tensor = Reduce("sum", {}, Sum(), in, {8, 8}); |
| |
| LoopNest l_before({tensor}); |
| LoopNest l(l_before); |
| l_before.prepareForCodegen(); |
| SimpleIREvaluator cg_before(l_before.root_stmt(), {in, tensor}); |
| cg_before.call({in_, out_before}); |
| |
| ASSERT_FALSE(LoopNest::vectorize(l.getLoopStmtsFor(tensor)[1])); |
| |
  // But if we rfactor so that the loop is no longer a reduce axis of the final
  // sum, we can vectorize it.
| std::vector<ForPtr> loops = l.getLoopStmtsFor(tensor); |
| LoopNest::reorderAxis(loops[0], loops[1]); |
| loops = l.getLoopStmtsFor(tensor); |
| auto tensor_body = l.getAllWritesToBuf(tensor.buf())[1]; |
| BufPtr rfac_buf = nullptr; |
| ASSERT_TRUE(LoopNest::rfactor(tensor_body, loops.at(0), &rfac_buf)); |
| |
| LoopNest::distributeLoop(loops.at(0)); |
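  // After rfactor, the axis indexing sum_rfac is an output axis of the partial
  // reduction, so vectorizing it below is legal.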
| auto rfac_loops = l.getAllLoopNestsWritingToBuf(rfac_buf); |
| |
| ASSERT_TRUE(LoopNest::vectorize(rfac_loops[1][0])); |
| l.simplify(); |
| |
| StmtPtr s = LoopNest::sanitizeNames(l.root_stmt()); |
| |
| std::ostringstream oss; |
| oss << *s; |
| const std::string& expected_ir = |
| R"IR( |
| #CHECK: sum = 0.f; |
| #CHECK: for (int i = 0; i < 8; i++) { |
| #CHECK: sum_rfac[i] = 0.f; |
| #CHECK: } |
| #CHECK: for (int i_1 = 0; i_1 < 8; i_1++) { |
| #CHECK: sum_rfac[Ramp(0, 1, 8)] = ReduceOp((sum_rfac[Ramp(0, 1, 8)]) + (in[Ramp(8 * i_1, 1, 8)]), reduce_args={i_1}); |
| #CHECK: } |
| #CHECK: for (int i_2 = 0; i_2 < 8; i_2++) { |
| #CHECK: sum = ReduceOp((sum) + (sum_rfac[i_2]), reduce_args={i_2}); |
| #CHECK: } |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| |
| // Vectorizing should not change result. |
| l.prepareForCodegen(); |
| s = IRSimplifier::simplify(l.root_stmt()); |
| SimpleIREvaluator cg_after(s, {in, tensor}); |
| cg_after.call({in_, out_after}); |
| |
| ASSERT_EQ(out_before[0], out_after[0]); |
| } |
| |
// Reduce with an explicit init function: C[i] starts from B[i] rather than the
// reducer's default initializer.
TEST(Reductions, InitFunction) {
| constexpr int M = 32; |
| constexpr int N = 16; |
| BufHandle A("A", {M, N}, kFloat); |
| BufHandle B("B", {N}, kFloat); |
| Tensor C = Reduce( |
| "C", |
| {N}, |
| Sum(), |
| [&](const std::vector<VarHandle>& v) { return B.load(v[0]); }, |
| [&](const std::vector<VarHandle>& v) { return A.load(v[1], v[0]); }, |
| {M}); |
| LoopNest nest({C}); |
| nest.prepareForCodegen(); |
| StmtPtr s = LoopNest::sanitizeNames(IRSimplifier::simplify(nest.root_stmt())); |
| std::ostringstream oss; |
| oss << *s << "\n"; |
| const std::string& expected_ir = |
| R"IR( |
| #CHECK: for (int i = 0; i < 16; i++) { |
| #CHECK: C[i] = B[i]; |
| #CHECK: for (int j = 0; j < 32; j++) { |
| #CHECK: C[i] = (C[i]) + (A[i + 16 * j]); |
| #CHECK: } |
| #CHECK: } |
| )IR"; |
| torch::jit::testing::FileCheck().run(expected_ir, oss.str()); |
| } |
| } // namespace jit |
| } // namespace torch |