mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638

//===- LoopEmitter.h --------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_
#define MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_

#include <vector>

#include "mlir/Dialect/SparseTensor/IR/Enums.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
#include "mlir/Dialect/SparseTensor/Utils/Merger.h"
#include "mlir/IR/PatternMatch.h"

namespace mlir {
namespace sparse_tensor {

//===----------------------------------------------------------------------===//
/// The position of a loop in the loop-stack, or the position of a
/// `LoopId` in a topologically-sorted list of `LoopId`s.
///
/// Although this type may have the same cardinality as `LoopId`, it must
/// not be confused with that type.  The `LoopId` type is used by the `Merger`
/// as a unique identifier for loop-variables, regardless of the ordering
/// of those loops.  Whereas the `LoopOrd` type is used by the `LoopEmitter`
/// (and `CodegenEnv`) to refer to the actual order in which loops are
/// generated.
///
/// TODO: further explicate the correspondences between these various
/// types.  In particular, since the `$dim` argument to `linalg::IndexOp`
/// is a De Bruijn index, it seems like that should correspond to `LoopOrd`,
/// and yet the `Merger` has that correspond with `LoopId` instead.
/// In addition `LoopEmitter::genAffine` has `AffineDimExpr::position`
/// correspond to `LoopId`, however it is unclear what the provenance
/// of those `AffineDimExpr` is.
//
// TODO: use a struct/class rather than a typedef, so that we can actually
// typecheck this to avoid mixups in the code.
using LoopOrd = unsigned;

/// A compressed <tensor id, level> pair, packed into a single integer as
/// `lvl * numTensors + tid`.  Use `LoopEmitter::makeTensorLevel` and
/// `LoopEmitter::unpackTensorLevel` to convert to and from the pair form;
/// the encoding depends on the number of tensors of the emitter that
/// created it, so values must not be shared between emitters.
using TensorLevel = unsigned;
//===----------------------------------------------------------------------===//
// LoopEmitter class, manages sparse tensors and helps to
// generate loop structure to (co)-iterate sparse tensors.
//
// An example usage:
// To generate the following loops over T1<?x?> and T2<?x?>
//
// for i in TENSOR_1_0 {
//   for j in TENSOR_2_0 {
//     for k in TENSOR_1_1 {}
//     for k in TENSOR_2_1 {}
//   }
// }
//
// One can use
//
// LoopEmitter loopEmitter({T1, T2});
// loopEmitter.initializeLoopEmit();
// loopEmitter.enterLoopOverTensorAtLvl(T1, 0);
// loopEmitter.enterLoopOverTensorAtLvl(T2, 0);
// loopEmitter.enterLoopOverTensorAtLvl(T1, 1);
// loopEmitter.exitCurrentLoop(); // exit k (over T1)
// loopEmitter.enterLoopOverTensorAtLvl(T2, 1);
// loopEmitter.exitCurrentLoop(); // exit k (over T2)
// loopEmitter.exitCurrentLoop(); // exit j
// loopEmitter.exitCurrentLoop(); // exit i
//===----------------------------------------------------------------------===//

class LoopEmitter {
public:
  /// Optional callback function to setup dense output tensors when
  /// initializing the loop emitter (e.g., to fill a dense output with zeros).
  using OutputUpdater = function_ref<Value(OpBuilder &builder, Location loc,
                                           Value memref, Value tensor)>;

  /// Map from [tid, lvl] to a list of dependent [tid, lvl] for affine
  /// expression index on sparse tensors.
  /// E.g., for affine index (d0 + d1), it depends on two [tid, lvl] that
  /// defines d0 and d1 (for affine expression reduction).
  /// If the list is empty, it means that there is no affine expression on the
  /// input [tid, lvl].
  /// NOTE: The caller is responsible to ensure that the order of the returned
  /// list to be consistent with the topological order of the iteration graph,
  /// otherwise the loop emitter might reduce a wrong dependent index variable
  /// when generating slice-driven loops.
  using DependentLvlGetter =
      function_ref<std::vector<std::pair<TensorId, Level>>(TensorId, Level)>;

  /// Creates an emitter that holds no tensors; `initialize` is expected to
  /// be called before any loop is emitted.
  LoopEmitter() = default;

  /// Takes an array of input tensors, which the generated loops will
  /// iterate over.  Each tensor is given a `TensorId` (numerically equal
  /// to the position of that tensor `Value` in the array).
  ///
  /// * `loopTag`: optional attribute attached to every generated loop
  ///   (under the name given by `getLoopEmitterLoopAttrName`).
  /// * `hasOutput`: whether the last tensor is to be treated as the
  ///   output tensor.
  /// * `isSparseOut`: indicates that the sparse output tensor is empty,
  ///   so the loop emitter will generate loops over it according to the
  ///   level-sizes.
  /// * `topSort`: specifies the actual order in which loops are generated,
  ///   thus providing a mapping from `LoopOrd` to `LoopId`.
  /// * `getter`: optional callback yielding the dependent levels of a
  ///   `(TensorId, Level)` pair; see `DependentLvlGetter`.
  void initialize(ValueRange tensors, StringAttr loopTag = nullptr,
                  bool hasOutput = false, bool isSparseOut = false,
                  ArrayRef<LoopId> topSort = {},
                  DependentLvlGetter getter = nullptr);

  /// Constructs an emitter from the same arguments as `initialize`.
  /// NOTE(review): presumably forwards to `initialize`; confirm in the
  /// implementation file.
  explicit LoopEmitter(ValueRange tensors, StringAttr loopTag = nullptr,
                       bool hasOutput = false, bool isSparseOut = false,
                       ArrayRef<LoopId> topSort = {},
                       DependentLvlGetter getter = nullptr);

  /// Starts a loop emitting session by generating all the buffers needed
  /// for iterating over the tensors.
  void initializeLoopEmit(OpBuilder &builder, Location loc,
                          OutputUpdater updater = nullptr);

  /// Generates code to compute an affine expression whose variables are
  /// `LoopId`s (i.e., `a.cast<AffineDimExpr>().getPosition()` is a valid
  /// `LoopId`).
  Value genAffine(OpBuilder &builder, Location loc, AffineExpr a);

  /// Enters a new loop sequence, the loops within the same sequence starts
  /// from the break points of previous loop instead of starting over from 0.
  /// e.g.,
  /// {
  ///   // loop sequence start.
  ///   p0 = while(xxx)
  ///     ...
  ///     break p0
  ///
  ///   // Starts loop from p0
  ///   for (i = p0; i < end; i++)
  ///     ...
  ///   // loop sequence end.
  /// }
  void enterNewLoopSeq(OpBuilder &builder, Location loc,
                       ArrayRef<TensorLevel> tidLvls);

  /// Exits the current loop sequence, this will reset universal index to 0.
  void exitCurrentLoopSeq(OpBuilder &builder, Location loc);

  // TODO: Get rid of `lvls` in the argument list? Track the level we
  // are currently at internally. Then it would be enterNextLvlForTensor.
  // Still need a way to specify the lvl for non-annotated tensors though,
  // as those can be accessed out of order.
  //
  /// Emits loop over tensor_tid_lvl, it assumes that loops between
  /// tensor_tid_[0, lvl - 1] have already been generated.
  /// The function will also perform in-place update on the `reduc` vector to
  /// return the reduction variable used inside the generated loop.
  Operation *enterLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
                                      ArrayRef<TensorLevel> tidLvls,
                                      MutableArrayRef<Value> reduc = {},
                                      bool isParallel = false);

  /// NOTE(review): undocumented upstream.  Presumably emits a loop over
  /// `tensors[tid][lvl]` that only visits coordinates matching the value
  /// of `affine`; confirm against the implementation.
  Operation *enterFilterLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
                                            TensorId tid, Level lvl,
                                            AffineExpr affine,
                                            MutableArrayRef<Value> reduc = {});

  /// NOTE(review): undocumented upstream.  Presumably computes the address
  /// of the (dense) level encoded in `tidLvl` from the affine expression
  /// `lvlExpr`; confirm against the implementation.
  void genDenseAffineAddress(OpBuilder &builder, Location loc,
                             TensorLevel tidLvl, AffineExpr lvlExpr);

  /// Emits a co-iteration loop over a set of tensors.
  Operation *enterCoIterationOverTensorsAtLvls(
      OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls,
      bool needsUniv, MutableArrayRef<Value> reduc = {});

  /// Exits the current loop (see the usage example above the class).
  /// NOTE(review): presumably dispatches to `exitForLoop`/`exitWhileLoop`
  /// and updates `reduc` in place with the loop results; confirm against
  /// the implementation.
  void exitCurrentLoop(RewriterBase &rewriter, Location loc,
                       MutableArrayRef<Value> reduc = {});

  /// Fills the out-parameter with the loop induction variables for all
  /// loops in the current loop-stack.  The variables are given in the
  /// same order as the loop-stack, hence `ivs` should be indexed into
  /// by `LoopOrd` (not `LoopId`).  Any previous content of `ivs` is
  /// discarded.
  void getLoopIVs(SmallVectorImpl<Value> &ivs) const {
    ivs.clear();
    ivs.reserve(getCurrentDepth());
    for (const LoopInfo &info : loopStack)
      ivs.push_back(info.iv);
  }

  /// Gets the current depth of the loop-stack.  The result is given
  /// the type `LoopOrd` for the same reason as one-past-the-end iterators.
  LoopOrd getCurrentDepth() const { return loopStack.size(); }

  /// Gets loop induction variable for the given `LoopOrd`.  Returns a
  /// null `Value` when `n` is not within the current loop-stack.
  Value getLoopIV(LoopOrd n) const {
    return n < getCurrentDepth() ? loopStack[n].iv : Value();
  }

  /// Gets the total number of tensors that loopEmitter is operating on.
  unsigned getNumTensors() const { return tensors.size(); }

  /// Compresses a TensorId and Level into a TensorLevel
  /// (as `l * getNumTensors() + t`).
  TensorLevel makeTensorLevel(TensorId t, Level l) const {
    return l * getNumTensors() + t;
  }

  /// De-compresses a TensorLevel back to a pair of TensorId and Level.
  /// Requires that the emitter operates on at least one tensor.
  std::pair<TensorId, Level> unpackTensorLevel(TensorLevel tidLvl) const {
    const unsigned nt = getNumTensors();
    // Guard against a modulo/division by zero on an emitter that has no
    // tensors (e.g., default-constructed and never initialized).
    assert(nt != 0 && "Cannot unpack a TensorLevel without any tensors");
    return std::make_pair(tidLvl % nt, tidLvl / nt);
  }

  /// Converts a range of TensorLevel to a range of std::pair<TensorId, Level>.
  /// The result is a lazily mapped range that captures `this`, so it must
  /// not outlive the emitter.
  template <class ContainerTy>
  auto unpackTensorLevelRange(ContainerTy &&c) const {
    using EltTy = decltype(*c.begin());
    static_assert(std::is_same_v<llvm::remove_cvref_t<EltTy>, TensorLevel>,
                  "Must be unpacking a TensorLevel range");
    return llvm::map_range(std::forward<ContainerTy>(c), [this](EltTy tl) {
      return this->unpackTensorLevel(tl);
    });
  }

  ///
  /// Getters.
  ///
  const std::vector<std::vector<Value>> &getPosits() const { return posits; }
  const std::vector<std::vector<Value>> &getCoords() const { return coords; }
  const std::vector<std::vector<Value>> &getHighs() const { return highs; }
  const std::vector<std::vector<Value>> &getPositionBuffers() const {
    return positionsBuffers;
  }
  const std::vector<std::vector<Value>> &getCoordinateBuffers() const {
    return coordinatesBuffers;
  }
  const std::vector<Value> &getValBuffer() const { return valBuffer; }

  /// Name of the attribute under which the `loopTag` is attached to the
  /// loops generated by the loop emitter.
  constexpr static llvm::StringLiteral getLoopEmitterLoopAttrName() {
    return llvm::StringLiteral("Emitted from");
  }

private:
  // A tuple that stored the slice-driven loop information.
  struct SliceLoopInfo final {
    SliceLoopInfo(TensorId tid, Level lvl, bool reduced)
        : tid(tid), lvl(lvl), reduced(reduced) {}
    TensorId tid;
    Level lvl;
    bool reduced;
  };
  // LoopInfo stores information of a loop generated by LoopEmitter. E.g.,
  // the set of tensors levels that the loop is iterating over.
  struct LoopInfo final {
    LoopInfo(ArrayRef<TensorLevel> tidLvls,
             ArrayRef<SliceLoopInfo> sliceDrivenInfo, Operation *loop,
             Block *userBlock, Value iv, StringAttr loopTag)
        : tidLvls(tidLvls), sliceDrivenInfo(sliceDrivenInfo), loop(loop),
          userCodeBlock(userBlock), iv(iv) {
      // Attached a special tag to loop emitter generated loop.
      if (loopTag)
        loop->setAttr(LoopEmitter::getLoopEmitterLoopAttrName(), loopTag);
    }
    // The set of <tensor, lvl> that the loop is operating on
    const llvm::SmallVector<TensorLevel> tidLvls;
    // Slice-driven loop conditions.
    const llvm::SmallVector<SliceLoopInfo> sliceDrivenInfo;
    const Operation *loop;      // the loop operation
    Block *const userCodeBlock; // the block holding users' generated code.
    const Value iv;             // the induction variable for the loop
  };

  // SliceInfo stores information of an extracted slice for slice-driven loop.
  // E.g., the in-scope SSA values for the minimum coordinates and offset for
  // the slice, etc.
  struct SliceInfo final {
    // Note that we do not need to create a actual sparse tensor slice but
    // instead only need to maintain the metadata of the slice.
    SliceInfo(Value minCrd, Value offset, Value isNonEmpty,
              std::optional<Level> slicedOnLvl, unsigned depth)
        : minCrd(minCrd), offset(offset), isNonEmpty(isNonEmpty),
          slicedOnLvl(slicedOnLvl), depth(depth) {
      // TODO: use std::optional<pair<Level, minCrd>>
      assert(!slicedOnLvl || minCrd);
    }

    // Whether this is the tensor that has not yet been sliced.
    bool isInitialTensor() const { return !slicedOnLvl.has_value(); }

    Value minCrd;                     // the minimum coordinate of the slice.
    Value offset;                     // the offset of the current slice.
    Value isNonEmpty;                 // whether the slice is non-empty.
    std::optional<Level> slicedOnLvl; // the level on which the slice is done
    unsigned depth; // the depth (relative to dependentLvlMap[tid][lvl]).
  };

  using LoopBodyBuilder = llvm::function_ref<void(OpBuilder &, Location, Value,
                                                  MutableArrayRef<Value>)>;

  /// Linearizes address for dense dimension (i.e., p = (i * d0) + j).
  Value genAddress(OpBuilder &builder, Location loc, TensorId tid, Level lvl,
                   Value iv);

  /// Generates the segment high for a non-unique level (to fast forward
  /// duplicated coordinates).  That is, it generates the code:
  ///
  ///   crd = coordinates_tid_lvl[pos]
  ///   while (pos < pHi && coordinates_tid_lvl[pos] == crd)
  ///      pos++;
  ///   <return pos>;
  Value genSegmentHigh(OpBuilder &builder, Location loc, TensorId tid,
                       Level lvl, Value pos, Value pHi);

  /// Generates instructions to compute the coordinate of tensors[tid][lvl]
  /// under the current loop context.  The final argument is the
  /// collapsed-output level, whereas this function handles converting
  /// that to the uncollapsed-input level
  Value genSparseCrd(OpBuilder &builder, Location loc, TensorId tid,
                     Level dstLvl);

  /// Generates a predicate to determine whether the transformed coordinates
  /// are in the given slice.
  /// Returns std::pair<Transformed coordinates, Predicate>
  std::pair<Value, Value> genSliceLegitPredicate(OpBuilder &builder,
                                                 Location loc, Value crd,
                                                 TensorId tid, Level lvl);

  /// Whether `tid` denotes the output tensor, i.e., the emitter has an
  /// output and `tid` is the last tensor.
  bool isOutputTensor(TensorId tid) const {
    return hasOutput && tid == getNumTensors() - 1;
  }

  /// Whether `tid` denotes the output tensor and that output is sparse.
  bool isSparseOutput(TensorId tid) const {
    return isOutputTensor(tid) && isSparseOut;
  }

  /// Whether `(tid, lvl)` indexes an existing entry of `lvlTypes`.
  bool isValidLevel(TensorId tid, Level lvl) const {
    return tid < lvlTypes.size() && lvl < lvlTypes[tid].size();
  }

  /// Prepares loop for iterating over `tensor[lvl]`, under the assumption
  /// that `tensor[0...lvl-1]` loops have already been set up.
  void prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
                                  TensorId tid, Level lvl);

  /// Emits extra locals, since the locals might not be in simplified lattices
  /// point used to generate the loops, but are still required to generate
  /// expressions.
  void emitExtraLocalsForTensorsAtDenseLvls(OpBuilder &builder, Location loc,
                                            ArrayRef<TensorLevel> tidLvls);

  /// Emits a for loop to iterate over a tensor level with the provided lower
  /// bound `lo` and upper bound `hi`.
  /// Apart from iterating just single tensor level, for loops can be used for
  /// slice-driven loop on dense level too.
  Operation *emitForLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
                                        TensorId tid, Level lvl, Value lo,
                                        Value hi, MutableArrayRef<Value> reduc,
                                        bool isParallel);

  /// Emits a while loop to iterate over a sparse level that has been sliced.
  /// Inserts break statement when the coordinate exceeds the sliceSize;
  /// The method sets the insertion point inside the generated while loop body
  /// after the break statement before return (so that callers need to handle
  /// only in-bound coordinates).
  Operation *emitWhileLoopOverSliceAtSparseLvl(OpBuilder &builder, Location loc,
                                               Value pLo, Value pHi,
                                               Value offset, Value sliceSize,
                                               TensorId tid, Level lvl,
                                               MutableArrayRef<Value> reduc);

  /// Exits a for loop, returns the reduction results, e.g.,
  /// For sequential for loops:
  /// %ret = for () {
  ///   ...
  ///   %val = addi %args, %c
  ///   yield %val
  /// }
  /// For parallel loops, the following generated code by users:
  /// %ret = parallel () init(%args) {
  ///   ...
  ///   %val = op %args, %c
  /// }
  /// will be transformed into
  /// %ret = parallel () init(%args) {
  ///   ...
  ///   scf.reduce(%c) bb0(%0, %1){
  ///     %val = op %0, %1
  ///     scf.reduce.return %val
  ///   }
  /// }
  /// NOTE: only one instruction will be moved into reduce block,
  /// transformation will fail if multiple instructions are used to compute
  /// the reduction value. Return %ret to user, while %val is provided by
  /// users (`reduc`).
  void exitForLoop(RewriterBase &rewriter, Location loc,
                   MutableArrayRef<Value> reduc);

  /// Exits a while loop, returns the reduction results.
  void exitWhileLoop(OpBuilder &builder, Location loc,
                     MutableArrayRef<Value> reduc);

  //
  // View-based-reshape methods.
  //

  /// Get the collapse reassociation for `tensors[tid][dstLvl]`.
  /// For unreshaped operands, the reassociation is simply an identity
  /// transformation.
  ///
  /// NOTE: the result uses `Level` rather than the `int64_t` of
  /// `ReassociationIndices`, since the former gives clarity to what
  /// the values actually mean.
  ///
  /// TODO: why not do this computation when we first store the reassoc,
  /// instead of doing it every time we look it up?
  SmallVector<Level, 2> getCollapseReassociation(TensorId tid, Level dstLvl) {
    assert(tid < getNumTensors() && "Invalid TensorId");
    assert(collapseReassoc.size() == getNumTensors());
    if (const auto reassoc = collapseReassoc[tid]) {
      // TODO: store the dstLvlRank in the LoopEmitter so that we can
      // check `dstLvl < dstLvlRank` at the top; and only here need to
      // assert that `reassoc.size() == dstLvlRank`.
      assert(dstLvl < reassoc.size() && "Level is out-of-bounds");
      const auto srcLvls = cast<ArrayAttr>(reassoc[dstLvl]);
      return llvm::to_vector<2>(
          llvm::map_range(srcLvls, [&](Attribute srcLvl) -> Level {
            // TODO: replace this with the converter for `LevelAttr`.
            return cast<IntegerAttr>(srcLvl).getValue().getZExtValue();
          }));
    }
    // A null reassociation means the operand is not reshaped: identity.
    return {dstLvl};
  }

  //
  // Slice-driven loop related methods.
  //

  /// Retrieves the most recent slice on lvl. To reduce affine expression like
  /// d0 + d1 + d2, we need two slices (one of size d1 + d2, and the other of
  /// size d2). This methods returns the latter slice (of size d2).
  const SliceInfo &getMostRecentSliceOnLvl(TensorId tid, Level lvl);

  /// Similar to getMostRecentSliceOnLvl, but yields error when the most recent
  /// slice is not the final slice needed to fully reduced the dependencies.
  const SliceInfo &getFinalSliceOnLvl(TensorId tid, Level lvl) {
    const SliceInfo &info = getMostRecentSliceOnLvl(tid, lvl);
    assert(info.depth == dependentLvlMap[tid][lvl].size() - 1 &&
           "Most recent slice is not the final slice on the level");
    return info;
  }

  /// Get the remaining number of constraints needed to fully *resolve*
  /// dependent levels on tensor[tid].
  unsigned remDepOnLevel(TensorId tid, Level lvl) const;

  /// Whether the tid, lvl is fully *reduced*, i.e., the non-trivial index
  /// expression has been reduced to a trivial one.
  /// E.g., A[i + j] => A[i + 2] (j is reduced)
  bool depFullyReduced(TensorId tid, Level lvl) const {
    return remDepOnLevel(tid, lvl) == 1;
  }

  /// Whether the tid, lvl is fully resolved, i.e., we entered the level already
  /// (the index on that level is determined).
  /// E.g., A[i + j] => A[2 + 3] (both i and j become invariants for inner
  /// loops).
  bool lvlFullyResolved(TensorId tid, Level lvl) const {
    return remDepOnLevel(tid, lvl) == 0;
  }

  /// Generates a whileOp to iterate over a subset of coordinates on tid on lvl
  /// using the pHi and pLo provided, the loop break on the first coordinate
  /// that exceeds the slice boundary (i.e., coord >= slice.offset +
  /// slice.size).
  std::pair<Operation *, ValueRange>
  genSliceLvlTraverseLoop(OpBuilder &builder, Location loc, Value pLo,
                          Value pHi, Value offset, Value size, TensorId tid,
                          Level lvl, ValueRange userReduc, bool genYield,
                          LoopBodyBuilder bodyBuilder);

  /// Generates a nested loop that iterates over tid on all the coordinates on
  /// lvl.
  ValueRange genUnResolvedSliceTreeTraverse(
      OpBuilder &builder, Location loc, TensorId tid,
      ArrayRef<const SliceInfo *> unResLvls,
      std::optional<std::pair<TensorId, Level>> firstResLvl,
      ValueRange userReduc, LoopBodyBuilder bodyBuilder);

  /// Generates code to get the first non-empty slice of tid on lvl, when all
  /// the previous level before `lvl` are resolved (or lvl is the first level).
  ///
  /// This is the simple case because the previous level are resolved into a
  /// single node in the storage tree.
  void genResolvedSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
                             Level lvl);

  /// Generates code to get the first non-empty slice of tid on lvl, when
  /// the previous levels before `lvl` are unresolved
  ///
  /// This is the complex case because the previous levels corresponding to a
  /// range of nodes in the storage tree.
  void genUnResolvedSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
                               Level lvl);

  /// Invalidates the index kept in slice position buffers (by setting it to
  /// zero).
  /// TODO: We should instead use an SSA value for the index.
  void invalidateSliceIterIdx(OpBuilder &builder, Location loc, TensorId tid,
                              Level lvl);

  /// Generates code to get the first non-empty slice of tid on lvl.
  /// return true if has already been resolved.
  bool genSliceBegin(OpBuilder &builder, Location loc, TensorId tid, Level lvl);

  /// Generates code to get the next non-empty slices of tid on lvl.
  void genSliceNextInduction(OpBuilder &builder, Location loc,
                             const Operation *whileOp, TensorId tid, Level lvl,
                             SmallVectorImpl<Value> &operands,
                             unsigned &retIdx);

  /// Generates a slice-driven while loop as follows.
  ///
  /// curSlice = getFirstNonEmptySlice(tensor).
  ///
  /// while(isNonEmpty) {
  ///   ..user code..
  ///   isNonEmpty, curSlice = getNextNonEmptySlice(curSlice)
  /// }
  Operation *emitSliceDrivenLoopOverTensorAtLvl(OpBuilder &builder,
                                                Location loc, TensorId tid,
                                                Level lvl,
                                                MutableArrayRef<Value> reduc);

  /// A optional string attribute that should be attached to the loop
  /// generated by loop emitter, it might help following passes to identify
  /// loops that operates on sparse tensors more easily.
  StringAttr loopTag;
  /// Whether the loop emitter needs to treat the last tensor as the output
  /// tensor.  Defaults to `false` so that a default-constructed emitter
  /// never reads an indeterminate value.
  bool hasOutput = false;
  /// Whether the output tensor is sparse (see `isSparseOutput`).  Defaults
  /// to `false` for the same reason as `hasOutput`.
  bool isSparseOut = false;

  /// The insertion point to allocate top level local variables.  Null until
  /// it is set by the emitter.
  Operation *localInsertPos = nullptr;

  //
  // Fields which have `numTensor` many entries.
  //
  // TODO: switch to an AOS style to avoid any possible mismatches.
  //

  /// Input and (optional) output tensors.
  std::vector<Value> tensors;
  /// Level-types for each `(TensorId, Level)` pair.
  std::vector<std::vector<DimLevelType>> lvlTypes;
  // Sparse iteration information for each `(TensorId, Level)` pair.
  // These arrays are updated to remain current within the current loop.
  // TODO: Clarify which of these are indexed by dstLvl vs srcLvl.
  //
  /// The collection of positions for a given element (one such collection
  /// for each tensor).  This is the position analogue of the "coords"
  /// naming convention.
  ///
  /// FIXME: [CLARIFY_POSITS_LVL] It's unclear which levels are used
  /// to index the `posits` array.  On the one hand `genSparseCrd`
  /// uses dstLvl; on the other hand `enterLoopOverTensorAtLvl`,
  /// `prepareLoopOverTensorAtLvl`, and `enterCoIterationOverTensorsAtLvls`
  /// uses srcLvl.  So which is it?
  std::vector<std::vector<Value>> posits;
  /// The collection of coordinates for a given element (one such
  /// collection for each tensor).
  std::vector<std::vector<Value>> coords;
  // The segment upper bound for non-uniques level after de-duplication.
  std::vector<std::vector<Value>> segHi;
  // NOTE(review): undocumented upstream; presumably the upper bound of the
  // iteration for each `(TensorId, Level)` pair -- confirm.
  std::vector<std::vector<Value>> highs;
  // NOTE(review): undocumented upstream; presumably the size of each
  // `(TensorId, Level)` pair -- confirm.
  std::vector<std::vector<Value>> lvlSizes;
  std::vector<std::vector<Value>> positionsBuffers;   // to_positions
  std::vector<std::vector<Value>> coordinatesBuffers; // to_coordinates
  std::vector<Value> valBuffer;                       // to_value

  //
  // Slice-driven loops related fields.
  //

  /// Whether the sparse input is a slice.
  std::vector<bool> isSparseSlices;
  /// Values related to slices.
  std::vector<std::vector<Value>> sliceOffsets;
  std::vector<std::vector<Value>> sliceStrides;

  // Map from [tid, level] to a list of dependent [tid, level].
  // See comments for `DependentLvlGetter`.
  std::vector<std::vector<std::vector<std::pair<TensorId, Level>>>>
      dependentLvlMap;

  // The cached position buffer for the slices, they serve the same purpose as
  // the position buffers for compressed levels.
  // But they always starts with the first pidx pointing to coord > slice.offset
  // to avoid iteration from the beginning.
  std::vector<std::vector<std::vector<Value>>> slicePosBuffer;

  // The cached size for each slices.
  std::vector<std::vector<std::vector<Value>>> sliceSizes;

  // The number of reduced dependencies on a tensor level so far.
  std::vector<std::vector<unsigned>> levelReducedDep;

  // sliceStack[tid] holds the generated slice stack on tid.
  std::vector<std::vector<SliceInfo>> sliceStack;

  //
  // View based reshape related-fields and methods
  //

  /// Collapse Reassociations related to a specific tensor
  // TODO: support expand.
  std::vector<ArrayAttr> collapseReassoc;

  /// TODO: not yet used, it should track the current level for each tensor
  /// to help eliminate `lvls` parameters from above APIs.
  /// std::vector<Level> curLvl;

  //
  // Fields which have at most `numLoops` many entries.
  //

  /// Loop Stack, stores the information of all the nested loops that are
  /// alive.
  std::vector<LoopInfo> loopStack;

  // Loop Sequence Stack, stores the universal index for the current loop
  // sequence, and a list of `(tid, lvl, flag)` entries for the tensor levels
  // that were sliced.
  // NOTE(review): the meaning of the `bool` flag is not visible in this
  // header -- confirm against the implementation.
  // TODO: maybe we should have a LoopSeqInfo
  std::vector<std::pair<Value, std::vector<std::tuple<TensorId, Level, bool>>>>
      loopSeqStack;

  /// Maps `LoopId` (used by `AffineDimExpr`) to `LoopOrd` (in the `loopStack`).
  /// TODO: We should probably use a callback function here to make it more
  /// general.
  std::vector<LoopOrd> loopIdToOrd;
};

} // namespace sparse_tensor
} // namespace mlir

#endif // MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_