| //===- Sparsification.cpp - Implementation of linalg sparsification -------===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements lowering annotated linalg dialect to sparse code. |
| // |
| // The concept of letting a compiler generate sparse code automatically was |
| // pioneered for dense linear algebra code in Fortran by [Bik96] in MT1 and |
| // formalized to tensor algebra by [Kjolstad17,20] for the Sparse Tensor |
| // Algebra Compiler (TACO). The implementation in this file closely follows |
| // the "sparse iteration theory" that forms the foundation of TACO. A rewriting |
| // rule is applied to each tensor expression in linalg (MLIR's tensor index |
| // notation) where the sparsity of tensors is indicated with annotation using |
| // a per-dimension specification of sparse/dense storage together with a |
| // specification of the order on the dimensions. Subsequently, a topologically |
| // sorted iteration graph, reflecting the required order on indices with respect |
| // to the dimensions of each tensor, is constructed to ensure that all tensors |
| // are visited in natural index order. Next, iteration lattices are constructed |
| // for the tensor expression for every index in topological order. Each |
| // iteration lattice point consists of a conjunction of tensor indices together |
| // with a tensor (sub)expression that needs to be evaluated for that |
| // conjunction. Within the lattice, iteration points are ordered according to |
| // the way indices are exhausted. As such these iteration lattices drive actual |
| // sparse code generation, which consists of a tedious but relatively |
| // straightforward one-to-one mapping from iteration lattices to combinations |
| // of for-loops, while-loops, and if-statements. |
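| // |
| // As an illustrative sketch (not taken from any particular test), consider |
| // a vector addition x(i) = a(i) + b(i) in which both a and b are annotated |
| // sparse in dimension i. The lattice for index i contains the three points |
| // { a(i) /\ b(i), a(i), b(i) }, which map onto a while-loop that co-iterates |
| // over a and b (with if-statements selecting a+b, a, or b per iteration), |
| // followed by two for-loops that each finish off the remaining entries of |
| // a or b once the other operand has been exhausted. |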
| // |
| // [Bik96] Aart J.C. Bik. Compiler Support for Sparse Matrix Computations. |
| // PhD thesis, Leiden University, May 1996 (aartbik.com/sparse.php). |
| // [Kjolstad17] Fredrik Berg Kjolstad, Shoaib Ashraf Kamil, Stephen Chou, |
| // David Lugato, and Saman Amarasinghe. The Tensor Algebra Compiler. |
| // Proceedings of the ACM on Programming Languages, October 2017. |
| // [Kjolstad20] Fredrik Berg Kjolstad. Sparse Tensor Algebra Compilation. |
| // PhD thesis, MIT, February, 2020 (tensor-compiler.org). |
| // |
| // Implementation detail: We use llvm::SmallVector for vectors with |
| // variable lengths and std::vector for vectors with fixed lengths. |
| //===----------------------------------------------------------------------===// |
| |
| #include "mlir/Dialect/Linalg/IR/LinalgOps.h" |
| #include "mlir/Dialect/Linalg/Transforms/Transforms.h" |
| #include "mlir/Dialect/Linalg/Utils/Utils.h" |
| #include "mlir/Dialect/SCF/SCF.h" |
| #include "mlir/Dialect/StandardOps/IR/Ops.h" |
| #include "mlir/IR/Matchers.h" |
| |
| using namespace mlir; |
| |
| namespace { |
| |
| enum class Kind { kTensor, kInvariant, kMulF, kMulI, kAddF, kAddI }; |
| enum class Dim { kSparse, kDense, kUndef }; |
| |
| /// Tensor expression. Represents an MLIR expression in tensor index notation. |
| /// For a tensor, e0 denotes its tensor number (operand position). For an |
| /// invariant, the IR value is stored directly. For a binary operation, e0 and |
| /// e1 denote the indices of the child tensor expressions. |
| struct TensorExp { |
| TensorExp(Kind k, unsigned x, unsigned y, Value v) |
| : kind(k), e0(x), e1(y), val(v) { |
| assert((kind == Kind::kTensor && e0 != -1u && e1 == -1u && !val) || |
| (kind == Kind::kInvariant && e0 == -1u && e1 == -1u && val) || |
| (kind >= Kind::kMulF && e0 != -1u && e1 != -1u && !val)); |
| } |
| Kind kind; |
| /// Indices of children expression(s). |
| unsigned e0; |
| unsigned e1; |
| /// Direct link to IR for an invariant. During code generation, |
| /// field is used to cache "hoisted" loop invariant tensor loads. |
| Value val; |
| }; |
| |
| /// Lattice point. Each lattice point consists of a conjunction of tensor |
| /// loop indices (encoded in a bitvector) and the index of the corresponding |
| /// tensor expression. |
| struct LatPoint { |
| LatPoint(unsigned n, unsigned e, unsigned b) : bits(n, false), exp(e) { |
| bits.set(b); |
| } |
| LatPoint(const llvm::BitVector &b, unsigned e) : bits(b), exp(e) {} |
| /// Conjunction of tensor loop indices as bitvector. This represents |
| /// all indices involved in the tensor expression. |
| llvm::BitVector bits; |
| /// Simplified conjunction of tensor loop indices as bitvector. This |
| /// represents a simplified condition under which this tensor expression |
| /// must execute. Pre-computed during codegen to avoid repeated eval. |
| llvm::BitVector simple; |
| /// Index of the tensor expression. |
| unsigned exp; |
| }; |
| |
| /// A class to handle all iteration lattice operations. This class abstracts |
| /// away from some implementation details of storing iteration lattices and |
| /// tensor expressions. This allows for fine-tuning performance characteristics |
| /// independently from the basic algorithm if bottlenecks are identified. |
| class Merger { |
| public: |
| /// Constructs a merger for the given number of tensors and loops. The |
| /// user supplies the number of tensors involved in the kernel, with the |
| /// last tensor in this set denoting the output tensor. The merger adds an |
| /// additional synthetic tensor at the end of this set to represent all |
| /// invariant expressions in the kernel. |
| Merger(unsigned t, unsigned l) |
| : outTensor(t - 1), numTensors(t + 1), numLoops(l), |
| dims(t + 1, std::vector<Dim>(l, Dim::kUndef)) {} |
| |
| /// Adds a tensor expression. Returns its index. |
| unsigned addExp(Kind k, unsigned e0, unsigned e1 = -1u, Value v = Value()) { |
| unsigned e = tensorExps.size(); |
| tensorExps.push_back(TensorExp(k, e0, e1, v)); |
| return e; |
| } |
| unsigned addExp(Kind k, Value v) { return addExp(k, -1u, -1u, v); } |
| |
| /// Adds an iteration lattice point. Returns its index. |
| unsigned addLat(unsigned t, unsigned i, unsigned e) { |
| assert(t < numTensors && i < numLoops); |
| unsigned p = latPoints.size(); |
| latPoints.push_back(LatPoint(numLoops * numTensors, e, numTensors * i + t)); |
| return p; |
| } |
| |
| /// Adds a new, initially empty, set. Returns its index. |
| unsigned addSet() { |
| unsigned s = latSets.size(); |
| latSets.emplace_back(SmallVector<unsigned, 16>()); |
| return s; |
| } |
| |
| /// Computes a single conjunction of two lattice points by taking the "union" |
| /// of loop indices (effectively constructing a larger "intersection" of those |
| /// indices) with a newly constructed tensor (sub)expression of given kind. |
| /// Returns the index of the new lattice point. |
| unsigned conjLatPoint(Kind kind, unsigned p0, unsigned p1) { |
| unsigned p = latPoints.size(); |
| llvm::BitVector nb = llvm::BitVector(latPoints[p0].bits); |
| nb |= latPoints[p1].bits; |
| unsigned e = addExp(kind, latPoints[p0].exp, latPoints[p1].exp); |
| latPoints.push_back(LatPoint(nb, e)); |
| return p; |
| } |
| |
| /// Conjunctive merge of two lattice sets L0 and L1 is the conjunction of the |
| /// Cartesian product. Returns the index of the new set. |
| unsigned takeConj(Kind kind, unsigned s0, unsigned s1) { |
| unsigned s = addSet(); |
| for (unsigned p0 : latSets[s0]) |
| for (unsigned p1 : latSets[s1]) |
| latSets[s].push_back(conjLatPoint(kind, p0, p1)); |
| return s; |
| } |
| |
| /// Disjunctive merge of two lattice sets L0 and L1 is (L0 /\_op L1, L0, L1). |
| /// Returns the index of the new set. |
| unsigned takeDisj(Kind kind, unsigned s0, unsigned s1) { |
| unsigned s = takeConj(kind, s0, s1); |
| for (unsigned p : latSets[s0]) |
| latSets[s].push_back(p); |
| for (unsigned p : latSets[s1]) |
| latSets[s].push_back(p); |
| return s; |
| } |
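| |
| // As an illustrative sketch, suppose s_a and s_b are hypothetical set |
| // indices, each holding a single lattice point for tensors a and b. Then |
| //   takeConj(Kind::kMulF, s_a, s_b) yields the set { a /\ b }   (as in a * b) |
| //   takeDisj(Kind::kAddF, s_a, s_b) yields { a /\ b, a, b }     (as in a + b) |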
| |
| /// Optimizes the iteration lattice points in the given set. This |
| /// method should be called right before code generation to avoid |
| /// generating redundant loops and conditions. |
| unsigned optimizeSet(unsigned s0) { |
| unsigned s = addSet(); |
| assert(latSets[s0].size() != 0); |
| unsigned p0 = latSets[s0][0]; |
| for (unsigned p1 : latSets[s0]) { |
| bool add = true; |
| llvm::BitVector simple = simplifyCond(s0, p1); |
| if (p0 != p1) { |
| // Is this a straightforward copy? |
| unsigned e = latPoints[p1].exp; |
| if (exp(e).kind == Kind::kTensor && exp(e).e0 == outTensor) |
| continue; |
| // Only dense exhausted? |
| llvm::BitVector tmp = latPoints[p1].bits; |
| tmp ^= latPoints[p0].bits; |
| if (!hasAnyDimOf(tmp, Dim::kSparse)) |
| continue; |
| // Duplication of an earlier conjunction? |
| for (unsigned p2 : latSets[s]) { |
| tmp = simple; |
| tmp ^= latPoints[p2].simple; |
| if (tmp.count() == 0) { |
| add = false; |
| break; |
| } |
| } |
| assert(!add || latGT(p0, p1)); |
| } |
| if (add) { |
| latSets[s].push_back(p1); |
| latPoints[latSets[s].back()].simple = simple; |
| } |
| } |
| return s; |
| } |
| |
| /// Simplifies the conditions in a conjunction of a given lattice point |
| /// within the given set using just two basic rules: |
| /// (1) multiple dense conditions are reduced to single dense, and |
| /// (2) a *singleton* sparse/dense is reduced to sparse/random access. |
| llvm::BitVector simplifyCond(unsigned s, unsigned p0) { |
| // First determine if this lattice point is a *singleton*, i.e., |
| // the last point in a lattice (no other point is less than this one). |
| bool isSingleton = true; |
| for (unsigned p1 : latSets[s]) { |
| if (p0 != p1 && latGT(p0, p1)) { |
| unsigned e = latPoints[p1].exp; |
| if (exp(e).kind == Kind::kTensor && exp(e).e0 == outTensor) |
| continue; |
| llvm::BitVector tmp = latPoints[p1].bits; |
| tmp ^= latPoints[p0].bits; |
| if (hasAnyDimOf(tmp, Dim::kSparse)) { |
| isSingleton = false; |
| break; |
| } |
| } |
| } |
| // Now apply the two basic rules. |
| llvm::BitVector simple = latPoints[p0].bits; |
| bool reset = isSingleton && hasAnyDimOf(simple, Dim::kSparse); |
| for (unsigned b = 0, be = simple.size(); b < be; b++) { |
| if (simple[b] && !isDim(b, Dim::kSparse)) { |
| if (reset) |
| simple.reset(b); |
| reset = true; |
| } |
| } |
| return simple; |
| } |
| |
| /// Returns true if Li > Lj. |
| bool latGT(unsigned i, unsigned j) const { |
| const llvm::BitVector &bitsi = latPoints[i].bits; |
| const llvm::BitVector &bitsj = latPoints[j].bits; |
| assert(bitsi.size() == bitsj.size()); |
| if (bitsi.count() > bitsj.count()) { |
| for (unsigned b = 0, be = bitsj.size(); b < be; b++) |
| if (bitsj[b] && !bitsi[b]) |
| return false; |
| return true; |
| } |
| return false; |
| } |
| |
| /// Bit translation. |
| unsigned tensor(unsigned b) const { return b % numTensors; } |
| unsigned index(unsigned b) const { return b / numTensors; } |
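| |
| // For example, since addLat() encodes a (tensor t, loop index i) pair as |
| // bit b = numTensors * i + t, a merger with numTensors == 3 decodes b == 7 |
| // into tensor(7) == 1 and index(7) == 2. |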
| |
| /// Returns true if bit corresponds to queried dim. |
| bool isDim(unsigned b, Dim d) const { return isDim(tensor(b), index(b), d); } |
| |
| /// Returns true if tensor access at given index has queried dim. |
| bool isDim(unsigned t, unsigned i, Dim d) const { |
| assert(t < numTensors && i < numLoops); |
| return dims[t][i] == d; |
| } |
| |
| /// Returns true if any set bit corresponds to queried dim. |
| bool hasAnyDimOf(const llvm::BitVector &bits, Dim d) const { |
| for (unsigned b = 0, be = bits.size(); b < be; b++) |
| if (bits[b] && isDim(b, d)) |
| return true; |
| return false; |
| } |
| |
| /// Returns true if tensor has any sparse dimension. |
| bool isSparseTensor(unsigned t) const { |
| return llvm::any_of(dims[t], [](Dim d) { return d == Dim::kSparse; }); |
| } |
| |
| /// Sets the dimension kind of the given tensor and loop index. |
| void setDim(unsigned t, unsigned i, Dim d) { dims[t][i] = d; } |
| |
| /// Getters. |
| TensorExp &exp(unsigned e) { return tensorExps[e]; } |
| LatPoint &lat(unsigned l) { return latPoints[l]; } |
| SmallVector<unsigned, 16> &set(unsigned s) { return latSets[s]; } |
| |
| private: |
| const unsigned outTensor; |
| const unsigned numTensors; |
| const unsigned numLoops; |
| |
| std::vector<std::vector<Dim>> dims; |
| llvm::SmallVector<TensorExp, 32> tensorExps; |
| llvm::SmallVector<LatPoint, 16> latPoints; |
| llvm::SmallVector<SmallVector<unsigned, 16>, 8> latSets; |
| }; |
| |
| /// Code generation. |
| struct CodeGen { |
| CodeGen(linalg::SparsificationOptions o, unsigned numTensors, |
| unsigned numLoops) |
| : options(o), loops(numLoops), sizes(numLoops), buffers(numTensors), |
| pointers(numTensors, std::vector<Value>(numLoops)), |
| indices(numTensors, std::vector<Value>(numLoops)), |
| highs(numTensors, std::vector<Value>(numLoops)), |
| pidxs(numTensors, std::vector<Value>(numLoops)), |
| idxs(numTensors, std::vector<Value>(numLoops)), redExp(-1u), redVal(), |
| curVecLength(1), curVecMask() {} |
| /// Sparsification options. |
| linalg::SparsificationOptions options; |
| /// Universal dense indices and upper bounds (by index). The loops array |
| /// is updated with the value of the universal dense index in the current |
| /// loop. The sizes array is set once with the inferred dimension sizes. |
| std::vector<Value> loops; |
| std::vector<Value> sizes; |
| /// Buffers for storing dense and sparse numerical values (by tensor). |
| /// This array is set once during bufferization of all tensors. |
| std::vector<Value> buffers; |
| /// Sparse storage schemes (1-D): pointers and indices (by tensor and index). |
| /// This array is set once during bufferization of all sparse tensors. |
| std::vector<std::vector<Value>> pointers; |
| std::vector<std::vector<Value>> indices; |
| /// Sparse iteration information (by tensor and index). These arrays |
| /// are updated to remain current within the current loop. |
| std::vector<std::vector<Value>> highs; |
| std::vector<std::vector<Value>> pidxs; |
| std::vector<std::vector<Value>> idxs; |
| /// Current reduction, updated during code generation. When indices of a |
| /// reduction are exhausted, all inner loops can "scalarize" the reduction. |
| // TODO: currently only done for (a chain of) innermost for-loops, where it |
| // is most effective; we could generalize to more outer and while-loops. |
| unsigned redExp; |
| Value redVal; |
| /// Current vector length and mask. |
| unsigned curVecLength; |
| Value curVecMask; |
| }; |
| |
| } // namespace |
| |
| /// Helper method to inspect sparse annotations in the linalg operation. |
| /// Fills the per-dimension sparsity information for all tensors. |
| static void findSparseAnnotations(Merger &merger, linalg::GenericOp op) { |
| unsigned numTensors = op.getNumShapedOperands(); |
| ArrayAttr sparseAttr = op.sparseAttr(); |
| for (unsigned t = 0; t < numTensors; t++) { |
| auto map = op.getIndexingMap(t); |
| auto dimAttr = sparseAttr[t].cast<ArrayAttr>(); |
| // For each tensor, we accept a per-dimension Sparse or Dense annotation. |
| // This is translated to the loop index that indexes that dimension. |
| unsigned rank = op.getShapedType(t).getRank(); |
| for (unsigned d = 0; d < rank; d++) { |
| unsigned idx = map.getDimPosition(d); |
| if (isSparseDim(dimAttr[d])) { |
| merger.setDim(t, idx, Dim::kSparse); |
| } else { |
| assert(isDenseDim(dimAttr[d])); |
| merger.setDim(t, idx, Dim::kDense); |
| } |
| } |
| } |
| } |
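| |
| // For example (a hypothetical kernel), a tensor argument with indexing map |
| // (i, j) -> (j, i) whose annotation marks dimension 0 sparse and dimension 1 |
| // dense results in merger.setDim(t, j, Dim::kSparse) and |
| // merger.setDim(t, i, Dim::kDense), i.e., the annotation is attached to the |
| // loop indices, not to the tensor dimensions themselves. |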
| |
| /// A DFS helper to compute a topological sort. Note that recursion is |
| /// bounded by the number of implicit loops, which is always small. |
| /// Returns false when a cycle is detected. |
| static bool topSortDFS(unsigned i, std::vector<unsigned> &visit, |
| std::vector<unsigned> &topSort, |
| std::vector<std::vector<bool>> &adjM) { |
| if (visit[i] != 0) |
| return visit[i] != 1; // 1 denotes cycle! |
| visit[i] = 1; |
| for (unsigned j = 0, e = visit.size(); j < e; j++) |
| if (adjM[i][j]) |
| if (!topSortDFS(j, visit, topSort, adjM)) |
| return false; |
| visit[i] = 2; |
| topSort.push_back(i); |
| return true; |
| } |
| |
| /// Computes a topologically sorted iteration graph for the linalg operation. |
| /// Ensures all tensors are visited in natural index order. This is essential |
| /// for sparse storage formats since these only support access along fixed |
| /// dimensions. Even for dense storage formats, however, the natural index |
| /// order yields innermost unit-stride access with better spatial locality. |
| static bool computeIterationGraph(Merger &merger, linalg::GenericOp op, |
| std::vector<unsigned> &topSort, |
| bool sparseOnly) { |
| // Set up an n x n from/to adjacency matrix of the iteration graph |
| // for the implicit loop indices i_0 .. i_n-1. |
| unsigned n = op.getNumLoops(); |
| std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false)); |
| |
| // Iterate over the indexing maps of every tensor in the tensor expression. |
| unsigned numTensors = op.getNumShapedOperands(); |
| for (unsigned t = 0; t < numTensors; t++) { |
| auto map = op.getIndexingMap(t); |
| assert(map.getNumDims() == n); |
| // Skip dense tensor constraints when sparse only is requested. |
| if (sparseOnly && !merger.isSparseTensor(t)) |
| continue; |
| // At the moment, we take the index variables in the tensor access |
| // expression in the order in which they appear (conceptually a |
| // "row-major" layout of every tensor). So, a tensor access A_ijk |
| // forces the ordering i < j < k on the loop indices. |
| // TODO: support affine map to define alternative dimension orders. |
| for (unsigned d = 1, e = map.getNumResults(); d < e; d++) { |
| unsigned from = map.getDimPosition(d - 1); |
| unsigned to = map.getDimPosition(d); |
| adjM[from][to] = true; |
| } |
| } |
| |
| // Topologically sort the iteration graph to determine loop order. |
| // Report failure for a cyclic iteration graph. |
| topSort.clear(); |
| topSort.reserve(n); |
| std::vector<unsigned> visit(n, 0); |
| for (unsigned i = 0; i < n; i++) |
| if (visit[i] == 0) |
| if (!topSortDFS(i, visit, topSort, adjM)) |
| return false; // cycle! |
| std::reverse(std::begin(topSort), std::end(topSort)); |
| return true; |
| } |
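| |
| // For example, in a hypothetical kernel that reads both A_ij and B_ji, tensor |
| // A adds the constraint i < j while tensor B adds j < i, so the iteration |
| // graph is cyclic and the function returns false. With A_ij alone, the |
| // topological sort simply yields i before j. |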
| |
| /// Traverses the SSA tree (possibly a DAG) to build a tensor expression. |
| /// This simplifies constructing (sub)expressions during iteration lattice |
| /// building (compared to using the SSA representation everywhere). |
| static Optional<unsigned> buildTensorExp(Merger &merger, linalg::GenericOp op, |
| Value val) { |
| if (auto arg = val.dyn_cast<BlockArgument>()) { |
| unsigned argN = arg.getArgNumber(); |
| if (arg.getOwner()->getParentOp() == op) { |
| // Any parameter of the generic op is considered a tensor, |
| // indexed by the implicit loop bounds. |
| auto map = op.getIndexingMap(argN); |
| if (map.isProjectedPermutation()) |
| return merger.addExp(Kind::kTensor, argN); |
| // Cannot handle (yet). |
| return None; |
| } |
| // Any parameter of a higher op is invariant. |
| return merger.addExp(Kind::kInvariant, val); |
| } |
| Operation *def = val.getDefiningOp(); |
| if (def->getBlock() != &op.region().front()) { |
| // Something defined outside is invariant. |
| return merger.addExp(Kind::kInvariant, val); |
| } else if (def->getNumOperands() == 2) { |
| // Construct binary operations if subexpressions could be built. |
| auto x = buildTensorExp(merger, op, def->getOperand(0)); |
| auto y = buildTensorExp(merger, op, def->getOperand(1)); |
| if (x.hasValue() && y.hasValue()) { |
| unsigned e0 = x.getValue(); |
| unsigned e1 = y.getValue(); |
| if (isa<MulFOp>(def)) |
| return merger.addExp(Kind::kMulF, e0, e1); |
| if (isa<MulIOp>(def)) |
| return merger.addExp(Kind::kMulI, e0, e1); |
| if (isa<AddFOp>(def)) |
| return merger.addExp(Kind::kAddF, e0, e1); |
| if (isa<AddIOp>(def)) |
| return merger.addExp(Kind::kAddI, e0, e1); |
| } |
| } |
| // Cannot build (yet). |
| return None; |
| } |
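| |
| // For example, given a hypothetical region body |
| //   %0 = mulf %a, %b : f64 |
| //   %1 = addf %0, %c : f64 |
| // where %a, %b, and %c are block arguments of the generic op, the traversal |
| // yields kTensor expressions for %a, %b, and %c, a kMulF expression over the |
| // first two, and a kAddF expression combining that product with %c. |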
| |
| /// Builds the iteration lattices in a bottom-up traversal given the remaining |
| /// tensor (sub)expression and the next loop index in the iteration graph. |
| static unsigned buildLattices(Merger &merger, linalg::GenericOp op, |
| unsigned exp, unsigned idx) { |
| Kind kind = merger.exp(exp).kind; |
| if (kind == Kind::kTensor || kind == Kind::kInvariant) { |
| // Either the index is really used in the tensor expression, or it is |
| // set to the undefined index in that dimension. An invariant expression |
| // is set to a synthetic tensor with undefined indices only. |
| unsigned s = merger.addSet(); |
| unsigned t = |
| kind == Kind::kTensor ? merger.exp(exp).e0 : op.getNumShapedOperands(); |
| merger.set(s).push_back(merger.addLat(t, idx, exp)); |
| return s; |
| } |
| unsigned s0 = buildLattices(merger, op, merger.exp(exp).e0, idx); |
| unsigned s1 = buildLattices(merger, op, merger.exp(exp).e1, idx); |
| switch (kind) { |
| case Kind::kTensor: |
| case Kind::kInvariant: |
| llvm_unreachable("handled above"); |
| case Kind::kMulF: |
| case Kind::kMulI: |
| return merger.takeConj(kind, s0, s1); |
| case Kind::kAddF: |
| case Kind::kAddI: |
| return merger.takeDisj(kind, s0, s1); |
| } |
| } |
| |
| /// Maps sparse integer option to actual integral storage type. |
| static Type genIntType(PatternRewriter &rewriter, linalg::SparseIntType tp) { |
| switch (tp) { |
| case linalg::SparseIntType::kNative: |
| return rewriter.getIndexType(); |
| case linalg::SparseIntType::kI64: |
| return rewriter.getIntegerType(64); |
| case linalg::SparseIntType::kI32: |
| return rewriter.getIntegerType(32); |
| } |
| } |
| |
| /// Local bufferization of all dense and sparse data structures. |
| /// This code enables testing the first prototype sparse compiler. |
| // TODO: replace this with a more general bufferization strategy |
| static void genBuffers(Merger &merger, CodeGen &codegen, |
| PatternRewriter &rewriter, linalg::GenericOp op) { |
| Location loc = op.getLoc(); |
| unsigned numTensors = op.getNumShapedOperands(); |
| unsigned numInputs = op.getNumInputs(); |
| assert(numTensors == numInputs + 1); |
| |
| // For now, set all unknown dimensions to 999. |
| // TODO: compute these values (using sparsity or by reading tensor) |
| Value unknown = rewriter.create<ConstantIndexOp>(loc, 999); |
| |
| // For every tensor, find lower and upper bound on dimensions, set the |
| // same bounds on loop indices, and allocate dense or sparse buffer(s). |
| SmallVector<Value, 4> args; |
| for (unsigned t = 0; t < numTensors; t++) { |
| auto tensorType = op.getShapedType(t); |
| auto shape = tensorType.getShape(); |
| auto map = op.getIndexingMap(t); |
| // Scan all dimensions of current tensor. |
| bool allDense = true; |
| args.clear(); |
| for (unsigned d = 0, rank = shape.size(); d < rank; d++) { |
| unsigned i = map.getDimPosition(d); |
| // Handle sparse storage schemes. |
| if (merger.isDim(t, i, Dim::kSparse)) { |
| allDense = false; |
| auto dynShape = {ShapedType::kDynamicSize}; |
| auto ptrTp = MemRefType::get( |
| dynShape, genIntType(rewriter, codegen.options.ptrType)); |
| auto indTp = MemRefType::get( |
| dynShape, genIntType(rewriter, codegen.options.indType)); |
| codegen.pointers[t][i] = rewriter.create<AllocaOp>(loc, ptrTp, unknown); |
| codegen.indices[t][i] = rewriter.create<AllocaOp>(loc, indTp, unknown); |
| } |
| // Find lower and upper bound in current dimension. |
| Value up; |
| if (shape[d] == TensorType::kDynamicSize) { |
| Value arg = t < numInputs ? op.getInput(t) : op.getOutput(0); |
| up = rewriter.create<DimOp>(loc, arg, d); |
| args.push_back(up); |
| } else { |
| up = rewriter.create<ConstantIndexOp>(loc, shape[d]); |
| } |
| codegen.sizes[i] = codegen.highs[t][i] = up; |
| } |
| // Allocate dense or sparse buffer for numerical values. |
| if (allDense) { |
| auto denseTp = MemRefType::get(shape, tensorType.getElementType()); |
| codegen.buffers[t] = rewriter.create<AllocaOp>(loc, denseTp, args); |
| } else { |
| auto sparseTp = MemRefType::get({ShapedType::kDynamicSize}, |
| tensorType.getElementType()); |
| codegen.buffers[t] = rewriter.create<AllocaOp>(loc, sparseTp, unknown); |
| } |
| } |
| } |
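| |
| // As a concrete sketch of the assumed per-dimension storage scheme, a sparse |
| // vector [0, 5, 0, 0, 7] would be represented by |
| //   pointers = [0, 2], indices = [1, 4], values = [5, 7] |
| // so that the nonzeros of the dimension live in the position range |
| // [pointers[0], pointers[1]), exactly the bounds loaded later in genInit(). |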
| |
| /// Constructs vector type from pointer. |
| static VectorType vectorType(CodeGen &codegen, Value ptr) { |
| Type etp = ptr.getType().cast<MemRefType>().getElementType(); |
| return VectorType::get(codegen.curVecLength, etp); |
| } |
| |
| /// Constructs vector iteration mask. |
| static Value genVectorMask(CodeGen &codegen, PatternRewriter &rewriter, |
| Value iv, Value lo, Value hi, Value step) { |
| Location loc = iv.getLoc(); |
| VectorType mtp = |
| VectorType::get(codegen.curVecLength, rewriter.getIntegerType(1)); |
| // Special case if the vector length evenly divides the trip count (for |
| // example, "for i = 0, 128, 16"). A constant all-true mask is generated |
| // so that all subsequent masked memory operations are immediately folded |
| // into unconditional memory operations. |
| IntegerAttr loInt, hiInt, stepInt; |
| if (matchPattern(lo, m_Constant(&loInt)) && |
| matchPattern(hi, m_Constant(&hiInt)) && |
| matchPattern(step, m_Constant(&stepInt))) { |
| if (((hiInt.getInt() - loInt.getInt()) % stepInt.getInt()) == 0) |
| return rewriter.create<vector::ConstantMaskOp>( |
| loc, mtp, rewriter.getI64ArrayAttr(codegen.curVecLength)); |
| } |
| // Otherwise, generate a vector mask that avoids overrunning the upperbound |
| // during vector execution. Here we rely on subsequent loop optimizations to |
| // avoid executing the mask in all iterations, for example, by splitting the |
| // loop into an unconditional vector loop and a scalar cleanup loop. |
| Value end = rewriter.create<SubIOp>(loc, hi, iv); |
| return rewriter.create<vector::CreateMaskOp>(loc, mtp, end); |
| } |
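| |
| // For example (numbers purely illustrative): with lo = 0, hi = 128, and |
| // step = 16, the constant path above produces an all-true mask of length 16, |
| // whereas hi = 100 falls through to vector.create_mask, which at iv = 96 |
| // yields a mask enabling only the first 100 - 96 = 4 lanes. |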
| |
| /// Generates a vectorized load lhs = a[ind[lo:hi]] or lhs = a[lo:hi]. |
| static Value genVectorLoad(CodeGen &codegen, PatternRewriter &rewriter, |
| Value ptr, ArrayRef<Value> args) { |
| Location loc = ptr.getLoc(); |
| VectorType vtp = vectorType(codegen, ptr); |
| Value pass = rewriter.create<ConstantOp>(loc, vtp, rewriter.getZeroAttr(vtp)); |
| if (args.back().getType().isa<VectorType>()) |
| return rewriter.create<vector::GatherOp>(loc, vtp, ptr, args.back(), |
| codegen.curVecMask, pass); |
| return rewriter.create<vector::MaskedLoadOp>(loc, vtp, ptr, args, |
| codegen.curVecMask, pass); |
| } |
| |
| /// Generates a vectorized store a[ind[lo:hi]] = rhs or a[lo:hi] = rhs. |
| static void genVectorStore(CodeGen &codegen, PatternRewriter &rewriter, |
| Value rhs, Value ptr, ArrayRef<Value> args) { |
| Location loc = ptr.getLoc(); |
| if (args.back().getType().isa<VectorType>()) |
| rewriter.create<vector::ScatterOp>(loc, ptr, args.back(), |
| codegen.curVecMask, rhs); |
| else |
| rewriter.create<vector::MaskedStoreOp>(loc, ptr, args, codegen.curVecMask, |
| rhs); |
| } |
| |
| /// Generates a vectorized invariant. Here we rely on subsequent loop |
| /// optimizations to hoist the invariant broadcast out of the vector loop. |
| static Value genVectorInvariantValue(CodeGen &codegen, |
| PatternRewriter &rewriter, Value val) { |
| VectorType vtp = VectorType::get(codegen.curVecLength, val.getType()); |
| return rewriter.create<vector::BroadcastOp>(val.getLoc(), vtp, val); |
| } |
| |
| /// Generates a load on a dense or sparse tensor. |
| static Value genTensorLoad(Merger &merger, CodeGen &codegen, |
| PatternRewriter &rewriter, linalg::GenericOp op, |
| unsigned exp) { |
| // Test if the load was hoisted to a higher loop nest. |
| Value val = merger.exp(exp).val; |
| if (val) |
| return val; |
| // Actual load. |
| SmallVector<Value, 4> args; |
| unsigned tensor = merger.exp(exp).e0; |
| auto map = op.getIndexingMap(tensor); |
| bool sparse = false; |
| for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) { |
| unsigned idx = map.getDimPosition(i); |
| args.push_back(codegen.loops[idx]); // universal dense index |
| if (sparse || merger.isDim(tensor, idx, Dim::kSparse)) { |
| sparse = true; |
| args.clear(); |
| args.push_back(codegen.pidxs[tensor][idx]); // position index |
| } |
| } |
| Location loc = op.getLoc(); |
| Value ptr = codegen.buffers[tensor]; |
| if (codegen.curVecLength > 1) |
| return genVectorLoad(codegen, rewriter, ptr, args); |
| return rewriter.create<LoadOp>(loc, ptr, args); |
| } |
| |
| /// Generates a store on a dense tensor. |
| static void genTensorStore(Merger &merger, CodeGen &codegen, |
| PatternRewriter &rewriter, linalg::GenericOp op, |
| unsigned tensor, Value rhs) { |
| // Test if this is a scalarized reduction. |
| unsigned lhs = op.getNumShapedOperands() - 1; |
| if (lhs == tensor && codegen.redVal) { |
| codegen.redVal = rhs; |
| return; |
| } |
| // Actual store. |
| SmallVector<Value, 4> args; |
| auto map = op.getIndexingMap(tensor); |
| for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) { |
| unsigned idx = map.getDimPosition(i); |
| args.push_back(codegen.loops[idx]); // universal dense index |
| } |
| Location loc = op.getLoc(); |
| Value ptr = codegen.buffers[tensor]; |
| if (codegen.curVecLength > 1) |
| genVectorStore(codegen, rewriter, rhs, ptr, args); |
| else |
| rewriter.create<StoreOp>(loc, rhs, ptr, args); |
| } |
| |
| /// Generates a pointer/index load from the sparse storage scheme. |
| static Value genLoad(CodeGen &codegen, PatternRewriter &rewriter, Location loc, |
| Value ptr, Value s) { |
| if (codegen.curVecLength > 1) |
| return genVectorLoad(codegen, rewriter, ptr, {s}); |
| Value load = rewriter.create<LoadOp>(loc, ptr, s); |
| return load.getType().isa<IndexType>() |
| ? load |
| : rewriter.create<IndexCastOp>(loc, load, rewriter.getIndexType()); |
| } |
| |
| /// Generates an invariant value. |
| static Value genInvariantValue(Merger &merger, CodeGen &codegen, |
| PatternRewriter &rewriter, unsigned exp) { |
| Value val = merger.exp(exp).val; |
| if (codegen.curVecLength > 1) |
| return genVectorInvariantValue(codegen, rewriter, val); |
| return val; |
| } |
| |
| /// Recursively generates tensor expression. |
| static Value genExp(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter, |
| linalg::GenericOp op, unsigned exp) { |
| if (merger.exp(exp).kind == Kind::kTensor) |
| return genTensorLoad(merger, codegen, rewriter, op, exp); |
| else if (merger.exp(exp).kind == Kind::kInvariant) |
| return genInvariantValue(merger, codegen, rewriter, exp); |
| Value v0 = genExp(merger, codegen, rewriter, op, merger.exp(exp).e0); |
| Value v1 = genExp(merger, codegen, rewriter, op, merger.exp(exp).e1); |
| switch (merger.exp(exp).kind) { |
| case Kind::kTensor: |
| case Kind::kInvariant: |
| llvm_unreachable("handled above"); |
| case Kind::kMulF: |
| return rewriter.create<MulFOp>(op.getLoc(), v0, v1); |
| case Kind::kMulI: |
| return rewriter.create<MulIOp>(op.getLoc(), v0, v1); |
| case Kind::kAddF: |
| return rewriter.create<AddFOp>(op.getLoc(), v0, v1); |
| case Kind::kAddI: |
| return rewriter.create<AddIOp>(op.getLoc(), v0, v1); |
| } |
| } |
| |
| /// Hoists loop invariant tensor loads for which indices have been exhausted. |
| static void genInvariants(Merger &merger, CodeGen &codegen, |
| PatternRewriter &rewriter, linalg::GenericOp op, |
| unsigned exp, unsigned ldx, bool hoist) { |
| if (merger.exp(exp).kind == Kind::kTensor) { |
| // Inspect tensor indices. |
| bool atLevel = ldx == -1u; |
| unsigned tensor = merger.exp(exp).e0; |
| auto map = op.getIndexingMap(tensor); |
| for (unsigned i = 0, m = map.getNumResults(); i < m; ++i) { |
| unsigned idx = map.getDimPosition(i); |
| if (!codegen.loops[idx]) |
| return; // still in play |
| else if (idx == ldx) |
| atLevel = true; |
| } |
| // All exhausted at this level (atLevel denotes exactly at this level). |
| unsigned lhs = op.getNumShapedOperands() - 1; |
| if (lhs == tensor) { |
| codegen.redExp = hoist ? exp : -1u; |
| } else if (atLevel) { |
| merger.exp(exp).val = |
| hoist ? genTensorLoad(merger, codegen, rewriter, op, exp) : Value(); |
| } |
| } else if (merger.exp(exp).kind != Kind::kInvariant) { |
| // Traverse into the binary operations. Note that we only hoist |
| // tensor loads, since subsequent MLIR/LLVM passes know how to |
| // deal with all other kinds of derived loop invariants. |
| unsigned e0 = merger.exp(exp).e0; |
| unsigned e1 = merger.exp(exp).e1; |
| genInvariants(merger, codegen, rewriter, op, e0, ldx, hoist); |
| genInvariants(merger, codegen, rewriter, op, e1, ldx, hoist); |
| } |
| } |
| |
| /// Generates initialization code for the subsequent loop sequence at |
| /// current index level. Returns true if the loop sequence needs to |
| /// maintain the universal index. |
| static bool genInit(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter, |
| linalg::GenericOp op, std::vector<unsigned> &topSort, |
| unsigned at, llvm::BitVector &inits) { |
| bool needsUniv = false; |
| Location loc = op.getLoc(); |
| unsigned idx = topSort[at]; |
| |
| // Initialize sparse positions. |
| for (unsigned b = 0, be = inits.size(); b < be; b++) { |
| if (inits[b]) { |
| unsigned tensor = merger.tensor(b); |
| assert(idx == merger.index(b)); |
| if (merger.isDim(b, Dim::kSparse)) { |
| // Initialize sparse index. |
| unsigned pat = at; |
| for (; pat != 0; pat--) { |
| if (codegen.pidxs[tensor][topSort[pat - 1]]) |
| break; |
| } |
| Value ptr = codegen.pointers[tensor][idx]; |
| Value one = rewriter.create<ConstantIndexOp>(loc, 1); |
| Value p0 = (pat == 0) ? rewriter.create<ConstantIndexOp>(loc, 0) |
| : codegen.pidxs[tensor][topSort[pat - 1]]; |
| codegen.pidxs[tensor][idx] = genLoad(codegen, rewriter, loc, ptr, p0); |
| Value p1 = rewriter.create<AddIOp>(loc, p0, one); |
| codegen.highs[tensor][idx] = genLoad(codegen, rewriter, loc, ptr, p1); |
| } else { |
| // Dense index still in play. |
| needsUniv = true; |
| } |
| } |
| } |
| |
| // Initialize the universal dense index. |
| codegen.loops[idx] = rewriter.create<ConstantIndexOp>(loc, 0); |
| return needsUniv; |
| } |
| |
| /// Returns true if the loop should be vectorized. Any implicit inner loop in |
| /// the Linalg operation is a candidate. Whether it is actually converted to |
| /// SIMD code depends on the requested vectorization strategy. |
| static bool isVectorFor(CodeGen &codegen, bool isInner, bool isSparse) { |
| switch (codegen.options.vectorizationStrategy) { |
| case linalg::SparseVectorizationStrategy::kNone: |
| return false; |
| case linalg::SparseVectorizationStrategy::kDenseInnerLoop: |
| return isInner && !isSparse; |
| case linalg::SparseVectorizationStrategy::kAnyStorageInnerLoop: |
| return isInner; |
| } |
| } |
| |
| /// Returns true if the loop should be parallelized. Any implicit loop in the |
| /// Linalg operation that is marked "parallel" is a candidate. Whether it is |
| /// actually converted to a parallel operation depends on the requested |
| /// parallelization strategy. |
| static bool isParallelFor(CodeGen &codegen, bool isOuter, bool isReduction, |
| bool isSparse, bool isVector) { |
| switch (codegen.options.parallelizationStrategy) { |
| case linalg::SparseParallelizationStrategy::kNone: |
| return false; |
| case linalg::SparseParallelizationStrategy::kDenseOuterLoop: |
| return isOuter && !isSparse && !isReduction && !isVector; |
| case linalg::SparseParallelizationStrategy::kAnyStorageOuterLoop: |
| return isOuter && !isReduction && !isVector; |
| case linalg::SparseParallelizationStrategy::kDenseAnyLoop: |
| return !isSparse && !isReduction && !isVector; |
| case linalg::SparseParallelizationStrategy::kAnyStorageAnyLoop: |
| return !isReduction && !isVector; |
| } |
| } |
| |
| /// Generates a for-loop on a single index. |
| static Operation *genFor(Merger &merger, CodeGen &codegen, |
| PatternRewriter &rewriter, linalg::GenericOp op, |
| bool isOuter, bool isInner, unsigned idx, |
| llvm::BitVector &indices) { |
| unsigned fb = indices.find_first(); |
| unsigned tensor = merger.tensor(fb); |
| assert(idx == merger.index(fb)); |
| auto iteratorTypes = op.iterator_types().getValue(); |
| bool isReduction = linalg::isReductionIteratorType(iteratorTypes[idx]); |
| bool isSparse = merger.isDim(fb, Dim::kSparse); |
| bool isVector = isVectorFor(codegen, isInner, isSparse); |
| bool isParallel = |
| isParallelFor(codegen, isOuter, isReduction, isSparse, isVector); |
| |
| // Prepare vector length. |
| if (isVector) |
| codegen.curVecLength = codegen.options.vectorLength; |
| |
| // Loop bounds and increment. |
| Location loc = op.getLoc(); |
| Value lo = isSparse ? codegen.pidxs[tensor][idx] : codegen.loops[idx]; |
| Value hi = isSparse ? codegen.highs[tensor][idx] : codegen.sizes[idx]; |
| Value step = rewriter.create<ConstantIndexOp>(loc, codegen.curVecLength); |
| |
| // Emit a parallel loop. |
| if (isParallel) { |
| assert(!isVector); |
| scf::ParallelOp parOp = rewriter.create<scf::ParallelOp>(loc, lo, hi, step); |
| if (isSparse) |
| codegen.pidxs[tensor][idx] = parOp.getInductionVars()[0]; |
| else |
| codegen.loops[idx] = parOp.getInductionVars()[0]; |
| rewriter.setInsertionPointToStart(parOp.getBody()); |
| return parOp; |
| } |
| |
| // Emit a sequential loop, potentially with a scalarized reduction. |
| bool scalarRed = isInner && codegen.redExp != -1u; |
| SmallVector<Value, 4> operands; |
| if (scalarRed) { |
| Value load; |
| if (codegen.redVal) { |
| load = codegen.redVal; // chained with previous for-loop |
| } else if (isVector) { |
| // TODO: assumes + reductions for now |
| VectorType vtp = vectorType(codegen, codegen.buffers[codegen.redExp]); |
| load = rewriter.create<ConstantOp>(loc, vtp, rewriter.getZeroAttr(vtp)); |
| } else { |
| load = genTensorLoad(merger, codegen, rewriter, op, codegen.redExp); |
| } |
| operands.push_back(load); |
| } |
| scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, lo, hi, step, operands); |
| if (scalarRed) { |
| codegen.redVal = merger.exp(codegen.redExp).val = |
| forOp.getRegionIterArgs().front(); |
| } |
| // Assign induction variable to sparse or dense index. |
| Value iv = forOp.getInductionVar(); |
| if (isSparse) |
| codegen.pidxs[tensor][idx] = iv; |
| else |
| codegen.loops[idx] = iv; |
| rewriter.setInsertionPointToStart(forOp.getBody()); |
| // Share vector iteration mask between all subsequent loads/stores. |
| if (isVector) |
| codegen.curVecMask = genVectorMask(codegen, rewriter, iv, lo, hi, step); |
| return forOp; |
| } |
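| |
| // As a rough sketch of the scalar, sequential case for a sparse index, the |
| // loop emitted above iterates over the position range prepared by genInit(): |
| //   scf.for %p = pidxs[tensor][idx] to highs[tensor][idx] step 1 { ... } |
| // with the induction variable stored back into pidxs[tensor][idx] so that |
| // genLocals() and genTensorLoad() can load the corresponding coordinate |
| // and value. |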
| |
| /// Emits a while-loop for co-iteration over multiple indices. |
| static Operation *genWhile(Merger &merger, CodeGen &codegen, |
| PatternRewriter &rewriter, linalg::GenericOp op, |
| unsigned idx, bool needsUniv, |
| llvm::BitVector &indices) { |
| SmallVector<Type, 4> types; |
| SmallVector<Value, 4> operands; |
| // Construct the while-loop with a parameter for each index. |
| Type indexType = rewriter.getIndexType(); |
| for (unsigned b = 0, be = indices.size(); b < be; b++) { |
| if (indices[b] && merger.isDim(b, Dim::kSparse)) { |
| unsigned tensor = merger.tensor(b); |
| assert(idx == merger.index(b)); |
| types.push_back(indexType); |
| operands.push_back(codegen.pidxs[tensor][idx]); |
| } |
| } |
| if (needsUniv) { |
| types.push_back(indexType); |
| operands.push_back(codegen.loops[idx]); |
| } |
| Location loc = op.getLoc(); |
| scf::WhileOp whileOp = rewriter.create<scf::WhileOp>(loc, types, operands); |
| Block *before = rewriter.createBlock(&whileOp.before(), {}, types); |
| Block *after = rewriter.createBlock(&whileOp.after(), {}, types); |
| |
| // Build the "before" region, which effectively consists of a |
| // conjunction of "i < upper" tests on all induction variables. |
| rewriter.setInsertionPointToStart(&whileOp.before().front()); |
| Value cond; |
| unsigned o = 0; |
| for (unsigned b = 0, be = indices.size(); b < be; b++) { |
| if (indices[b] && merger.isDim(b, Dim::kSparse)) { |
| unsigned tensor = merger.tensor(b); |
| assert(idx == merger.index(b)); |
| Value op1 = before->getArgument(o); |
| Value op2 = codegen.highs[tensor][idx]; |
| Value opc = rewriter.create<CmpIOp>(loc, CmpIPredicate::ult, op1, op2); |
| cond = cond ? rewriter.create<AndOp>(loc, cond, opc) : opc; |
| codegen.pidxs[tensor][idx] = after->getArgument(o++); |
| } |
| } |
| if (needsUniv) |
| codegen.loops[idx] = after->getArgument(o++); |
| assert(o == operands.size()); |
| rewriter.create<scf::ConditionOp>(loc, cond, before->getArguments()); |
| rewriter.setInsertionPointToStart(&whileOp.after().front()); |
| return whileOp; |
| } |
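| |
| // As a rough sketch, co-iterating two sparse tensors a and b over index i |
| // yields a while-loop whose "before" region computes |
| //   %cond = (%p_a < highs[a][i]) && (%p_b < highs[b][i]) |
| // and whose "after" region holds the loop body; the positions are advanced |
| // by genWhileInduction() and fed back as the while-loop arguments. |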
| |
| /// Generates a for-loop or a while-loop, depending on whether it implements |
| /// singleton iteration or co-iteration over the given conjunction. |
| static Operation *genLoop(Merger &merger, CodeGen &codegen, |
| PatternRewriter &rewriter, linalg::GenericOp op, |
| std::vector<unsigned> &topSort, unsigned at, |
| bool needsUniv, llvm::BitVector &indices) { |
| unsigned idx = topSort[at]; |
| if (indices.count() == 1) { |
| bool isOuter = at == 0; |
| bool isInner = at == topSort.size() - 1; |
| return genFor(merger, codegen, rewriter, op, isOuter, isInner, idx, |
| indices); |
| } |
| return genWhile(merger, codegen, rewriter, op, idx, needsUniv, indices); |
| } |
| |
| /// Generates the local variables for this loop, consisting of the sparse |
| /// indices, restored universal dense index, and dense positions. |
| static void genLocals(Merger &merger, CodeGen &codegen, |
| PatternRewriter &rewriter, linalg::GenericOp op, |
| std::vector<unsigned> &topSort, unsigned at, |
| bool needsUniv, llvm::BitVector &locals) { |
| Location loc = op.getLoc(); |
| unsigned idx = topSort[at]; |
| |
| // Initialize sparse indices. |
| Value min; |
| for (unsigned b = 0, be = locals.size(); b < be; b++) { |
| if (locals[b] && merger.isDim(b, Dim::kSparse)) { |
| unsigned tensor = merger.tensor(b); |
| assert(idx == merger.index(b)); |
| Value ptr = codegen.indices[tensor][idx]; |
| Value s = codegen.pidxs[tensor][idx]; |
| Value load = genLoad(codegen, rewriter, loc, ptr, s); |
| codegen.idxs[tensor][idx] = load; |
| if (!needsUniv) { |
| if (min) { |
| Value cmp = |
| rewriter.create<CmpIOp>(loc, CmpIPredicate::ult, load, min); |
| min = rewriter.create<SelectOp>(loc, cmp, load, min); |
| } else { |
| min = load; |
| } |
| } |
| } |
| } |
| |
| // Merge dense universal index over minimum. |
| if (min) { |
| assert(!needsUniv); |
| codegen.loops[idx] = min; |
| } |
| |
| // Initialize dense positions. |
| for (unsigned b = 0, be = locals.size(); b < be; b++) { |
| if (locals[b] && merger.isDim(b, Dim::kDense)) { |
| unsigned tensor = merger.tensor(b); |
| assert(idx == merger.index(b)); |
| unsigned pat = at; |
| for (; pat != 0; pat--) |
| if (codegen.pidxs[tensor][topSort[pat - 1]]) |
| break; |
| Value p = (pat == 0) ? rewriter.create<ConstantIndexOp>(loc, 0) |
| : codegen.pidxs[tensor][topSort[pat - 1]]; |
| Value m = rewriter.create<MulIOp>(loc, codegen.sizes[idx], p); |
| codegen.pidxs[tensor][idx] = |
| rewriter.create<AddIOp>(loc, m, codegen.loops[idx]); |
| } |
| } |
| } |
| |
| /// Generates the induction structure for a while-loop. |
| static void genWhileInduction(Merger &merger, CodeGen &codegen, |
| PatternRewriter &rewriter, linalg::GenericOp op, |
| unsigned idx, bool needsUniv, |
| llvm::BitVector &induction, ResultRange results) { |
| Location loc = op.getLoc(); |
| unsigned o = 0; |
| SmallVector<Value, 4> operands; |
| Value one = rewriter.create<ConstantIndexOp>(loc, 1); |
| for (unsigned b = 0, be = induction.size(); b < be; b++) { |
| if (induction[b] && merger.isDim(b, Dim::kSparse)) { |
| unsigned tensor = merger.tensor(b); |
| assert(idx == merger.index(b)); |
| Value op1 = codegen.idxs[tensor][idx]; |
| Value op2 = codegen.loops[idx]; |
| Value op3 = codegen.pidxs[tensor][idx]; |
| Value cmp = rewriter.create<CmpIOp>(loc, CmpIPredicate::eq, op1, op2); |
| Value add = rewriter.create<AddIOp>(loc, op3, one); |
| operands.push_back(rewriter.create<SelectOp>(loc, cmp, add, op3)); |
| codegen.pidxs[tensor][idx] = results[o++]; |
| } |
| } |
| if (needsUniv) { |
| operands.push_back(rewriter.create<AddIOp>(loc, codegen.loops[idx], one)); |
| codegen.loops[idx] = results[o++]; |
| } |
| assert(o == operands.size()); |
| rewriter.create<scf::YieldOp>(loc, operands); |
| } |
| |
| /// Generates a single if-statement within a while-loop. |
| static scf::IfOp genIf(Merger &merger, CodeGen &codegen, |
| PatternRewriter &rewriter, linalg::GenericOp op, |
| unsigned idx, llvm::BitVector &conditions) { |
| Location loc = op.getLoc(); |
| Value cond; |
| for (unsigned b = 0, be = conditions.size(); b < be; b++) { |
| if (conditions[b]) { |
| unsigned tensor = merger.tensor(b); |
| assert(idx == merger.index(b)); |
| Value clause; |
| if (merger.isDim(b, Dim::kSparse)) { |
| Value op1 = codegen.idxs[tensor][idx]; |
| Value op2 = codegen.loops[idx]; |
| clause = rewriter.create<CmpIOp>(loc, CmpIPredicate::eq, op1, op2); |
| } else { |
| clause = rewriter.create<ConstantIntOp>(loc, 1, 1); // true |
| } |
| cond = cond ? rewriter.create<AndOp>(loc, cond, clause) : clause; |
| } |
| } |
| scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, cond, /*else*/ true); |
| rewriter.setInsertionPointToStart(&ifOp.thenRegion().front()); |
| return ifOp; |
| } |
| |
| /// Recursively generates code while computing iteration lattices in order |
| /// to manage the complexity of implementing co-iteration over unions |
| /// and intersections of sparse iteration spaces. |
| static void genStmt(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter, |
| linalg::GenericOp op, std::vector<unsigned> &topSort, |
| unsigned exp, unsigned at) { |
| // At each leaf, assign remaining tensor (sub)expression to output tensor. |
| if (at == topSort.size()) { |
| unsigned lhs = op.getNumShapedOperands() - 1; |
| Value rhs = genExp(merger, codegen, rewriter, op, exp); |
| genTensorStore(merger, codegen, rewriter, op, lhs, rhs); |
| return; |
| } |
| |
| // Construct iteration lattices for current loop index, with L0 at top. |
| // Then emit initialization code for the loop sequence at this level. |
| // We maintain the universal dense index if dense indices are still |
| // in play for a non-singleton loop sequence. |
| Location loc = op.getLoc(); |
| unsigned idx = topSort[at]; |
| unsigned lts = merger.optimizeSet(buildLattices(merger, op, exp, idx)); |
| unsigned lsize = merger.set(lts).size(); |
| assert(lsize != 0); |
| unsigned l0 = merger.set(lts)[0]; |
| unsigned ldx = at == 0 ? -1u : topSort[at - 1]; |
| genInvariants(merger, codegen, rewriter, op, exp, ldx, /*hoist=*/true); |
| bool needsUniv = genInit(merger, codegen, rewriter, op, topSort, at, |
| merger.lat(l0).bits) && |
| lsize > 1; |
| |
| // Emit a loop for every lattice point L0 >= Li. |
| for (unsigned i = 0; i < lsize; i++) { |
| unsigned li = merger.set(lts)[i]; |
| |
| // Emit loop. |
| codegen.curVecLength = 1; |
| llvm::BitVector indices = merger.lat(li).simple; |
| Operation *loop = |
| genLoop(merger, codegen, rewriter, op, topSort, at, needsUniv, indices); |
| genLocals(merger, codegen, rewriter, op, topSort, at, needsUniv, |
| merger.lat(li).bits); |
| |
| // Visit all lattice points with Li >= Lj to generate the |
| // loop-body, possibly with if-statements for co-iteration. |
| bool isWhile = isa<scf::WhileOp>(loop); |
| for (unsigned j = 0; j < lsize; j++) { |
| unsigned lj = merger.set(lts)[j]; |
| unsigned ej = merger.lat(lj).exp; |
| if (li == lj || merger.latGT(li, lj)) { |
| if (li != lj) { |
| llvm::BitVector tmp = merger.lat(lj).bits; |
| tmp ^= merger.lat(li).bits; |
| if (!merger.hasAnyDimOf(tmp, Dim::kSparse)) |
| continue; // only dense exhausted within if/else |
| } |
| // Recurse into body of each branch. |
| if (isWhile) { |
| scf::IfOp ifOp = |
| genIf(merger, codegen, rewriter, op, idx, merger.lat(lj).simple); |
| genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1); |
| rewriter.setInsertionPointToStart(&ifOp.elseRegion().front()); |
| } else { |
| genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1); |
| } |
| } |
| } |
| |
| // Wrap-up induction and restore insertion point. |
| if (isWhile) { |
| scf::WhileOp whileOp = cast<scf::WhileOp>(loop); |
| rewriter.setInsertionPointToEnd(&whileOp.after().front()); |
| genWhileInduction(merger, codegen, rewriter, op, idx, needsUniv, |
| merger.lat(li).bits, whileOp.results()); |
| } else { |
| needsUniv = false; |
| if (codegen.redVal) { |
| rewriter.create<scf::YieldOp>(loc, codegen.redVal); |
| codegen.redVal = loop->getResult(0); |
| } |
| } |
| rewriter.setInsertionPointAfter(loop); |
| } |
| |
| // Wrap-up loop sequence. |
| Value red = codegen.redVal; |
| if (red) { |
| codegen.redVal = merger.exp(codegen.redExp).val = Value(); // end chain |
| unsigned lhs = op.getNumShapedOperands() - 1; |
| if (codegen.curVecLength > 1) { |
| codegen.curVecLength = 1; |
| Value ld = genTensorLoad(merger, codegen, rewriter, op, codegen.redExp); |
| red = rewriter.create<vector::ReductionOp>( |
| loc, ld.getType(), rewriter.getStringAttr("add"), red, ld); |
| } |
| genTensorStore(merger, codegen, rewriter, op, lhs, red); |
| } |
| genInvariants(merger, codegen, rewriter, op, exp, ldx, /*hoist=*/false); |
| codegen.loops[idx] = Value(); |
| } |
| |
| namespace { |
| |
| /// Sparse rewriting rule for a generic Linalg operation. |
| struct GenericOpSparsifier : public OpRewritePattern<linalg::GenericOp> { |
| public: |
| GenericOpSparsifier(MLIRContext *context, linalg::SparsificationOptions o) |
| : OpRewritePattern<linalg::GenericOp>(context), options(o) {} |
| |
| LogicalResult matchAndRewrite(linalg::GenericOp op, |
| PatternRewriter &rewriter) const override { |
| // Detects sparse annotations and translates the per-dimension sparsity |
| // information for all tensors to loop indices in the kernel. |
| if (!op.hasSparseSemantics()) |
| return failure(); |
| assert(op.getNumOutputs() == 1); |
| unsigned numTensors = op.getNumShapedOperands(); |
| unsigned numLoops = op.iterator_types().getValue().size(); |
| Merger merger(numTensors, numLoops); |
| findSparseAnnotations(merger, op); |
| |
| // Computes a topologically sorted iteration graph to ensure |
| // tensors are visited in natural index order. Fails on cycles. |
| // This assumes that higher-level passes have already put the |
| // tensors in each tensor expression in a feasible order. |
| std::vector<unsigned> topSort; |
| if (!computeIterationGraph(merger, op, topSort, /*sparseOnly=*/false) && |
| !computeIterationGraph(merger, op, topSort, /*sparseOnly=*/true)) |
| return failure(); |
| |
| // Finds the terminating yield statement and builds the tensor |
| // expression for the Linalg operation in SSA form. |
| Operation *yield = op.region().front().getTerminator(); |
| Optional<unsigned> exp = buildTensorExp(merger, op, yield->getOperand(0)); |
| if (!exp.hasValue()) |
| return failure(); // build failure |
| |
| // Recursively generates code. |
| CodeGen codegen(options, numTensors, numLoops); |
| genBuffers(merger, codegen, rewriter, op); |
| genStmt(merger, codegen, rewriter, op, topSort, exp.getValue(), 0); |
| Value result = |
| rewriter.create<TensorLoadOp>(op.getLoc(), codegen.buffers.back()); |
| rewriter.replaceOp(op, result); |
| return success(); |
| } |
| |
| private: |
| /// Options to control sparse code generation. |
| linalg::SparsificationOptions options; |
| }; |
| |
| } // namespace |
| |
| /// Populates the given patterns list with rewriting rules required for |
| /// the sparsification of linear algebra operations. |
| void linalg::populateSparsificationPatterns( |
| MLIRContext *context, OwningRewritePatternList &patterns, |
| const SparsificationOptions &options) { |
| patterns.insert<GenericOpSparsifier>(context, options); |
| } |
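| |
| // A minimal usage sketch (the surrounding pass boilerplate is hypothetical |
| // and not part of this file; assumes default-constructible options): |
| // |
| //   linalg::SparsificationOptions sparseOptions; |
| //   OwningRewritePatternList patterns; |
| //   linalg::populateSparsificationPatterns(ctx, patterns, sparseOptions); |
| //   (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns)); |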