//===- VectorToSCF.cpp - Conversion from Vector to mix of SCF and Std -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements target-dependent lowering of vector transfer operations.
//
//===----------------------------------------------------------------------===//

#include <type_traits>

#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"

#include "../PassDetail.h"
#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/SCF/EDSC/Builders.h"
#include "mlir/Dialect/SCF/EDSC/Intrinsics.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"

using namespace mlir;
using namespace mlir::edsc;
using namespace mlir::edsc::intrinsics;
using vector::TransferReadOp;
using vector::TransferWriteOp;

// Return a list of Values that correspond to multiple AffineApplyOp, one for
// each result of `map`. Each `expr` in `map` is canonicalized and folded
// greedily according to its operands.
// TODO: factor out in a common location that both linalg and vector can use.
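// For instance (illustrative only), applying the two-result map
//   affine_map<(d0, d1) -> (d0 + d1, d1)>
// to values (%a, %b) yields two Values: the first is an affine.apply of
// d0 + d1 (or a constant, if both operands fold), and the second folds
// directly to %b.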
static SmallVector<Value, 4>
applyMapToValues(OpBuilder &b, Location loc, AffineMap map, ValueRange values) {
  SmallVector<Value, 4> res;
  res.reserve(map.getNumResults());
  unsigned numDims = map.getNumDims(), numSym = map.getNumSymbols();
  // For each `expr` in `map`, applies the `expr` to the values extracted from
  // ranges. If the resulting application can be folded into a Value, the
  // folding occurs eagerly. Otherwise, an affine.apply operation is emitted.
  for (auto expr : map.getResults()) {
    AffineMap map = AffineMap::get(numDims, numSym, expr);
    SmallVector<Value, 4> operands(values.begin(), values.end());
    fullyComposeAffineMapAndOperands(&map, &operands);
    canonicalizeMapAndOperands(&map, &operands);
    res.push_back(b.createOrFold<AffineApplyOp>(loc, map, operands));
  }
  return res;
}

namespace {
/// Helper class that captures the common information needed to lower N>1-D
/// vector transfer operations (read and write).
/// On construction, this class opens an edsc::ScopedContext for simpler IR
/// manipulation.
/// In pseudo-IR, for an n-D vector_transfer_read such as:
///
/// ```
///   vector_transfer_read(%m, %offsets, identity_map, %fill) :
///     memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
///     vector<(major_dims) x (minor_dims) x type>
/// ```
///
/// where rank(minor_dims) is the lower-level vector rank (e.g., 1 for LLVM,
/// or higher).
///
/// This is the entry point to emitting pseudo-IR resembling:
///
/// ```
///   %tmp = alloc(): memref<(major_dims) x vector<minor_dim x type>>
///   for (%ivs_major, {0}, {vector_shape}, {1}) { // (N-1)-D loop nest
///     if (all_of(%ivs_major + %offsets, <, major_dims)) {
///       %v = vector_transfer_read(
///         {%offsets_leading, %ivs_major + %offsets_major, %offsets_minor},
///          %ivs_minor):
///         memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
///         vector<(minor_dims) x type>;
///       store(%v, %tmp);
///     } else {
///       %v = splat(vector<(minor_dims) x type>, %fill)
///       store(%v, %tmp, %ivs_major);
///     }
///   }
///   %res = load(%tmp, %0): memref<(major_dims) x vector<minor_dim x type>>:
///     vector<(major_dims) x (minor_dims) x type>
/// ```
///
template <typename ConcreteOp>
class NDTransferOpHelper {
public:
  NDTransferOpHelper(PatternRewriter &rewriter, ConcreteOp xferOp,
                     const VectorTransferToSCFOptions &options)
      : rewriter(rewriter), options(options), loc(xferOp.getLoc()),
        scope(std::make_unique<ScopedContext>(rewriter, loc)), xferOp(xferOp),
        op(xferOp.getOperation()) {
    vectorType = xferOp.getVectorType();
    // TODO: when we go to k > 1-D vectors adapt minorRank.
    minorRank = 1;
    majorRank = vectorType.getRank() - minorRank;
    leadingRank = xferOp.getLeadingMemRefRank();
    majorVectorType =
        VectorType::get(vectorType.getShape().take_front(majorRank),
                        vectorType.getElementType());
    minorVectorType =
        VectorType::get(vectorType.getShape().take_back(minorRank),
                        vectorType.getElementType());
    /// Memref of minor vector type is used for individual transfers.
    memRefMinorVectorType =
        MemRefType::get(majorVectorType.getShape(), minorVectorType, {},
                        xferOp.getMemRefType().getMemorySpace());
  }

  LogicalResult doReplace();

private:
  /// Creates the loop nest on the "major" dimensions and calls the
  /// `loopBodyBuilder` lambda in the context of the loop nest.
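  /// For example (a sketch only, iv names are illustrative): with a major
  /// shape of 2x3 and unrolling disabled, this emits an affine loop nest
  /// equivalent to:
  ///   affine.for %i = 0 to 2 { affine.for %j = 0 to 3 { loopBodyBuilder } }
  /// With unrolling enabled, the body is instead emitted 6 times with
  /// constant (%i, %j) offsets.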
  void
  emitLoops(llvm::function_ref<void(ValueRange, ValueRange, ValueRange,
                                    ValueRange, const MemRefBoundsCapture &)>
                loopBodyBuilder);

  /// Common state to lower vector transfer ops.
  PatternRewriter &rewriter;
  const VectorTransferToSCFOptions &options;
  Location loc;
  std::unique_ptr<ScopedContext> scope;
  ConcreteOp xferOp;
  Operation *op;
  // A vector transfer copies data between:
  //   - memref<(leading_dims) x (major_dims) x (minor_dims) x type>
  //   - vector<(major_dims) x (minor_dims) x type>
  unsigned minorRank;         // for now always 1
  unsigned majorRank;         // vector rank - minorRank
  unsigned leadingRank;       // memref rank - vector rank
  VectorType vectorType;      // vector<(major_dims) x (minor_dims) x type>
  VectorType majorVectorType; // vector<(major_dims) x type>
  VectorType minorVectorType; // vector<(minor_dims) x type>
  MemRefType memRefMinorVectorType; // memref<vector<(minor_dims) x type>>
};

template <typename ConcreteOp>
void NDTransferOpHelper<ConcreteOp>::emitLoops(
    llvm::function_ref<void(ValueRange, ValueRange, ValueRange, ValueRange,
                            const MemRefBoundsCapture &)>
        loopBodyBuilder) {
  /// Loop nest operates on the major dimensions
  MemRefBoundsCapture memrefBoundsCapture(xferOp.memref());

  if (options.unroll) {
    auto shape = majorVectorType.getShape();
    auto strides = computeStrides(shape);
    unsigned numUnrolledInstances = computeMaxLinearIndex(shape);
    ValueRange indices(xferOp.indices());
    for (unsigned idx = 0; idx < numUnrolledInstances; ++idx) {
      SmallVector<int64_t, 4> offsets = delinearize(strides, idx);
      SmallVector<Value, 4> offsetValues =
          llvm::to_vector<4>(llvm::map_range(offsets, [](int64_t off) -> Value {
            return std_constant_index(off);
          }));
      loopBodyBuilder(offsetValues, indices.take_front(leadingRank),
                      indices.drop_front(leadingRank).take_front(majorRank),
                      indices.take_back(minorRank), memrefBoundsCapture);
    }
  } else {
    VectorBoundsCapture vectorBoundsCapture(majorVectorType);
    auto majorLbs = vectorBoundsCapture.getLbs();
    auto majorUbs = vectorBoundsCapture.getUbs();
    auto majorSteps = vectorBoundsCapture.getSteps();
    affineLoopNestBuilder(
        majorLbs, majorUbs, majorSteps, [&](ValueRange majorIvs) {
          ValueRange indices(xferOp.indices());
          loopBodyBuilder(majorIvs, indices.take_front(leadingRank),
                          indices.drop_front(leadingRank).take_front(majorRank),
                          indices.take_back(minorRank), memrefBoundsCapture);
        });
  }
}

static Optional<int64_t> extractConstantIndex(Value v) {
  if (auto cstOp = v.getDefiningOp<ConstantIndexOp>())
    return cstOp.getValue();
  if (auto affineApplyOp = v.getDefiningOp<AffineApplyOp>())
    if (affineApplyOp.getAffineMap().isSingleConstant())
      return affineApplyOp.getAffineMap().getSingleConstantResult();
  return None;
}

// Missing foldings of scf.if make it necessary to perform poor man's folding
// eagerly, especially in the case of unrolling. In the future, this should go
// away once scf.if folds properly.
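// For example (illustrative only): if `v` is defined by `constant 3 : index`
// and `ub` by `constant 8 : index`, the comparison is statically true, so a
// null Value is returned and no condition needs to be emitted; otherwise a
// `cmpi "slt"` is materialized.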
static Value onTheFlyFoldSLT(Value v, Value ub) {
  using namespace mlir::edsc::op;
  auto maybeCstV = extractConstantIndex(v);
  auto maybeCstUb = extractConstantIndex(ub);
  if (maybeCstV && maybeCstUb && *maybeCstV < *maybeCstUb)
    return Value();
  return slt(v, ub);
}

///   1. Compute the indexings `majorIvs + majorOffsets` and save them in
///      `majorIvsPlusOffsets`.
///   2. Return a value of i1 that determines whether the first
///      `majorIvs.rank()` dimensions of `majorIvs + majorOffsets` are all
///      within `memrefBounds`.
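/// The upper bounds are obtained by applying the transfer's permutation map to
/// the memref upper bounds. For instance (a sketch only), with two masked
/// major dimensions the returned Value computes:
///   %0 = cmpi "slt", %iv0 + %off0, %ub0
///   %1 = cmpi "slt", %iv1 + %off1, %ub1
///   %cond = and %0, %1
/// where comparisons that are statically in-bounds are folded away by
/// `onTheFlyFoldSLT`.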
static Value
emitInBoundsCondition(PatternRewriter &rewriter,
                      VectorTransferOpInterface xferOp, unsigned leadingRank,
                      ValueRange majorIvs, ValueRange majorOffsets,
                      const MemRefBoundsCapture &memrefBounds,
                      SmallVectorImpl<Value> &majorIvsPlusOffsets) {
  Value inBoundsCondition;
  majorIvsPlusOffsets.reserve(majorIvs.size());
  unsigned idx = 0;
  SmallVector<Value, 4> bounds =
      applyMapToValues(rewriter, xferOp.getLoc(), xferOp.permutation_map(),
                       memrefBounds.getUbs());
  for (auto it : llvm::zip(majorIvs, majorOffsets, bounds)) {
    Value iv = std::get<0>(it), off = std::get<1>(it), ub = std::get<2>(it);
    using namespace mlir::edsc::op;
    majorIvsPlusOffsets.push_back(iv + off);
    if (xferOp.isMaskedDim(leadingRank + idx)) {
      Value inBoundsCond = onTheFlyFoldSLT(majorIvsPlusOffsets.back(), ub);
      if (inBoundsCond)
        inBoundsCondition = (inBoundsCondition)
                                ? (inBoundsCondition && inBoundsCond)
                                : inBoundsCond;
    }
    ++idx;
  }
  return inBoundsCondition;
}

// TODO: Parallelism and threadlocal considerations.
static Value setAllocAtFunctionEntry(MemRefType memRefMinorVectorType,
                                     Operation *op) {
  auto &b = ScopedContext::getBuilderRef();
  OpBuilder::InsertionGuard guard(b);
  Operation *scope =
      op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
  assert(scope && "Expected op to be inside automatic allocation scope");
  b.setInsertionPointToStart(&scope->getRegion(0).front());
  Value res = std_alloca(memRefMinorVectorType);
  return res;
}

template <>
LogicalResult NDTransferOpHelper<TransferReadOp>::doReplace() {
  Value alloc, result;
  if (options.unroll)
    result = std_splat(vectorType, xferOp.padding());
  else
    alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);

  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
                ValueRange majorOffsets, ValueRange minorOffsets,
                const MemRefBoundsCapture &memrefBounds) {
    /// Lambda to load 1-D vector in the current loop ivs + offset context.
    auto load1DVector = [&](ValueRange majorIvsPlusOffsets) -> Value {
      SmallVector<Value, 8> indexing;
      indexing.reserve(leadingRank + majorRank + minorRank);
      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
      indexing.append(minorOffsets.begin(), minorOffsets.end());
      Value memref = xferOp.memref();
      auto map =
          getTransferMinorIdentityMap(xferOp.getMemRefType(), minorVectorType);
      ArrayAttr masked;
      if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) {
        OpBuilder &b = ScopedContext::getBuilderRef();
        masked = b.getBoolArrayAttr({false});
      }
      return vector_transfer_read(minorVectorType, memref, indexing,
                                  AffineMapAttr::get(map), xferOp.padding(),
                                  masked);
    };

    // 1. Compute the inBoundsCondition in the current loop ivs + offset
    // context.
    SmallVector<Value, 4> majorIvsPlusOffsets;
    Value inBoundsCondition = emitInBoundsCondition(
        rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()),
        leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);

    if (inBoundsCondition) {
      // 2. If the condition is not null, we need an IfOp, which may yield
      // if `options.unroll` is true.
      SmallVector<Type, 1> resultType;
      if (options.unroll)
        resultType.push_back(vectorType);

      // 3. If in-bounds, progressively lower to a 1-D transfer read, otherwise
      // splat a 1-D vector.
      ValueRange ifResults = conditionBuilder(
          resultType, inBoundsCondition,
          [&]() -> scf::ValueVector {
            Value vector = load1DVector(majorIvsPlusOffsets);
            // 3.a. If `options.unroll` is true, insert the 1-D vector in the
            // aggregate. We must yield and merge with the `else` branch.
            if (options.unroll) {
              vector = vector_insert(vector, result, majorIvs);
              return {vector};
            }
            // 3.b. Otherwise, just go through the temporary `alloc`.
            std_store(vector, alloc, majorIvs);
            return {};
          },
          [&]() -> scf::ValueVector {
            Value vector = std_splat(minorVectorType, xferOp.padding());
            // 3.c. If `options.unroll` is true, insert the 1-D vector in the
            // aggregate. We must yield and merge with the `then` branch.
            if (options.unroll) {
              vector = vector_insert(vector, result, majorIvs);
              return {vector};
            }
            // 3.d. Otherwise, just go through the temporary `alloc`.
            std_store(vector, alloc, majorIvs);
            return {};
          });

      if (!resultType.empty())
        result = *ifResults.begin();
    } else {
      // 4. Guaranteed in-bounds, progressively lower to a 1-D transfer read.
      Value loaded1D = load1DVector(majorIvsPlusOffsets);
      // 5.a. If `options.unroll` is true, insert the 1-D vector in the
      // aggregate.
      if (options.unroll)
        result = vector_insert(loaded1D, result, majorIvs);
      // 5.b. Otherwise, just go through the temporary `alloc`.
      else
        std_store(loaded1D, alloc, majorIvs);
    }
  });

  assert((!options.unroll ^ (bool)result) &&
         "Expected resulting Value iff unroll");
  if (!result)
    result = std_load(vector_type_cast(MemRefType::get({}, vectorType), alloc));
  rewriter.replaceOp(op, result);

  return success();
}

template <>
LogicalResult NDTransferOpHelper<TransferWriteOp>::doReplace() {
  Value alloc;
  if (!options.unroll) {
    alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);
    std_store(xferOp.vector(),
              vector_type_cast(MemRefType::get({}, vectorType), alloc));
  }

  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
                ValueRange majorOffsets, ValueRange minorOffsets,
                const MemRefBoundsCapture &memrefBounds) {
    // Lower to 1-D vector_transfer_write and let recursion handle it.
    auto emitTransferWrite = [&](ValueRange majorIvsPlusOffsets) {
      SmallVector<Value, 8> indexing;
      indexing.reserve(leadingRank + majorRank + minorRank);
      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
      indexing.append(minorOffsets.begin(), minorOffsets.end());
      Value result;
      // If `options.unroll` is true, extract the 1-D vector from the
      // aggregate.
      if (options.unroll)
        result = vector_extract(xferOp.vector(), majorIvs);
      else
        result = std_load(alloc, majorIvs);
      auto map =
          getTransferMinorIdentityMap(xferOp.getMemRefType(), minorVectorType);
      ArrayAttr masked;
      if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) {
        OpBuilder &b = ScopedContext::getBuilderRef();
        masked = b.getBoolArrayAttr({false});
      }
      vector_transfer_write(result, xferOp.memref(), indexing,
                            AffineMapAttr::get(map), masked);
    };

    // 1. Compute the inBoundsCondition in the current loop ivs + offset
    // context.
    SmallVector<Value, 4> majorIvsPlusOffsets;
    Value inBoundsCondition = emitInBoundsCondition(
        rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()),
        leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);

    if (inBoundsCondition) {
      // 2.a. If the condition is not null, we need an IfOp, to write
      // conditionally. Progressively lower to a 1-D transfer write.
      conditionBuilder(inBoundsCondition,
                       [&] { emitTransferWrite(majorIvsPlusOffsets); });
    } else {
      // 2.b. Guaranteed in-bounds. Progressively lower to a 1-D transfer write.
      emitTransferWrite(majorIvsPlusOffsets);
    }
  });

  rewriter.eraseOp(op);

  return success();
}

} // namespace

/// Analyzes the `transfer` to find an access dimension along the fastest
/// varying remote MemRef dimension. If such a dimension with coalescing
/// properties is found, the loop bounds in `vectorBoundsCapture` (and the
/// corresponding ivs) are swapped so that the generated loop nest accesses it
/// in the innermost loop.
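/// For example (illustrative only), for a rank-3 memref and
///   permutation_map = affine_map<(d0, d1, d2) -> (d1, d2)>
/// result index 1 reads the innermost memref dimension (d2), so the returned
/// coalesced index is 1; -1 is returned when no result maps to the innermost
/// dimension.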
template <typename TransferOpTy>
static int computeCoalescedIndex(TransferOpTy transfer) {
  // Rank of the remote memory access; coalescing behavior occurs on the
  // innermost memory dimension.
  auto remoteRank = transfer.getMemRefType().getRank();
  // Iterate over the result expressions of the permutation map to determine
  // the loop order for creating pointwise copies between remote and local
  // memories.
  int coalescedIdx = -1;
  auto exprs = transfer.permutation_map().getResults();
  for (auto en : llvm::enumerate(exprs)) {
    auto dim = en.value().template dyn_cast<AffineDimExpr>();
    if (!dim) {
      continue;
    }
    auto memRefDim = dim.getPosition();
    if (memRefDim == remoteRank - 1) {
      // memRefDim has coalescing properties, it should be swapped into the
      // last position.
      assert(coalescedIdx == -1 && "Unexpected > 1 coalesced indices");
      coalescedIdx = en.index();
    }
  }
  return coalescedIdx;
}

template <typename TransferOpTy>
VectorTransferRewriter<TransferOpTy>::VectorTransferRewriter(
    VectorTransferToSCFOptions options, MLIRContext *context)
    : RewritePattern(TransferOpTy::getOperationName(), 1, context),
      options(options) {}

/// Used for staging the transfer in a local buffer.
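/// For example (illustrative only), for a transfer of vector<3x4xf32> the
/// staging buffer has type memref<3xvector<4xf32>>.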
template <typename TransferOpTy>
MemRefType VectorTransferRewriter<TransferOpTy>::tmpMemRefType(
    TransferOpTy transfer) const {
  auto vectorType = transfer.getVectorType();
  return MemRefType::get(vectorType.getShape().drop_back(),
                         VectorType::get(vectorType.getShape().take_back(),
                                         vectorType.getElementType()),
                         {}, 0);
}

static void emitWithBoundsChecks(
    PatternRewriter &rewriter, VectorTransferOpInterface transfer,
    ValueRange ivs, const MemRefBoundsCapture &memRefBoundsCapture,
    function_ref<void(ArrayRef<Value>)> inBoundsFun,
    function_ref<void(ArrayRef<Value>)> outOfBoundsFun = nullptr) {
  // Permute the incoming indices according to the permutation map.
  SmallVector<Value, 4> indices =
      applyMapToValues(rewriter, transfer.getLoc(), transfer.permutation_map(),
                       transfer.indices());

  // Generate a bounds check if necessary.
  SmallVector<Value, 4> majorIvsPlusOffsets;
  Value inBoundsCondition =
      emitInBoundsCondition(rewriter, transfer, 0, ivs, indices,
                            memRefBoundsCapture, majorIvsPlusOffsets);

  // Apply the permutation map to the ivs. The permutation map may not use all
  // the inputs.
  SmallVector<Value, 4> scalarAccessExprs(transfer.indices().size());
  for (unsigned memRefDim = 0; memRefDim < transfer.indices().size();
       ++memRefDim) {
    // Linear search on a small number of entries.
    int loopIndex = -1;
    auto exprs = transfer.permutation_map().getResults();
    for (auto en : llvm::enumerate(exprs)) {
      auto expr = en.value();
      auto dim = expr.dyn_cast<AffineDimExpr>();
      // Sanity check.
      assert((dim || expr.cast<AffineConstantExpr>().getValue() == 0) &&
             "Expected dim or 0 in permutationMap");
      if (dim && memRefDim == dim.getPosition()) {
        loopIndex = en.index();
        break;
      }
    }

    using namespace edsc::op;
    auto i = transfer.indices()[memRefDim];
    scalarAccessExprs[memRefDim] = loopIndex < 0 ? i : i + ivs[loopIndex];
  }

  if (inBoundsCondition)
    conditionBuilder(
        /* scf.if */ inBoundsCondition, // {
        [&] { inBoundsFun(scalarAccessExprs); },
        // } else {
        outOfBoundsFun ? [&] { outOfBoundsFun(scalarAccessExprs); }
                       : function_ref<void()>()
        // }
    );
  else
    inBoundsFun(scalarAccessExprs);
}

namespace mlir {

/// Lowers TransferReadOp into a combination of:
///   1. local memory allocation;
///   2. perfect loop nest over:
///      a. scalar load from the original memref (with padding when out of
///         bounds);
///      b. scalar store to the local buffer (viewed as a scalar memref);
///   3. vector_load from local buffer (viewed as a memref<1 x vector>);
///   4. local memory deallocation (implicit: the buffer is stack-allocated at
///      function entry).
///
/// Lowers the data transfer part of a TransferReadOp while ensuring no
/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
/// padding.
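///
/// In pseudo-IR (a sketch only; names and simplifications are illustrative and
/// not produced literally by this lowering), a 1-D read of vector<N x type>
/// from %A at offset %off that does not match the fast path above resembles:
///
/// ```
///   %tmp = alloca() : memref<vector<N x type>>
///   for %i = 0 to N {
///     if (%off + %i < dim(%A, 0)) {
///       %s = load %A[%off + %i]
///     } else {
///       %s = %padding
///     }
///     // insert %s at position %i of the vector staged in %tmp
///   }
///   %res = load %tmp : vector<N x type>
/// ```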

/// Performs the rewrite.
template <>
LogicalResult VectorTransferRewriter<TransferReadOp>::matchAndRewrite(
    Operation *op, PatternRewriter &rewriter) const {
  using namespace mlir::edsc::op;

  TransferReadOp transfer = cast<TransferReadOp>(op);

  // Fall back to a loop if the fastest varying stride is not 1 or it is
  // permuted.
  int64_t offset;
  SmallVector<int64_t, 4> strides;
  auto successStrides =
      getStridesAndOffset(transfer.getMemRefType(), strides, offset);
  if (succeeded(successStrides) && strides.back() == 1 &&
      transfer.permutation_map().isMinorIdentity()) {
    // If > 1D, emit a bunch of loops around 1-D vector transfers.
    if (transfer.getVectorType().getRank() > 1)
      return NDTransferOpHelper<TransferReadOp>(rewriter, transfer, options)
          .doReplace();
    // If 1-D this is now handled by the target-specific lowering.
    if (transfer.getVectorType().getRank() == 1)
      return failure();
  }

  // Conservative lowering to scalar load / stores.
  // 1. Setup all the captures.
  ScopedContext scope(rewriter, transfer.getLoc());
  StdIndexedValue remote(transfer.memref());
  MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
  int coalescedIdx = computeCoalescedIndex(transfer);
  // Swap the vectorBoundsCapture which will reorder loop bounds.
  if (coalescedIdx >= 0)
    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
                                   coalescedIdx);

  auto lbs = vectorBoundsCapture.getLbs();
  auto ubs = vectorBoundsCapture.getUbs();
  SmallVector<Value, 8> steps;
  steps.reserve(vectorBoundsCapture.getSteps().size());
  for (auto step : vectorBoundsCapture.getSteps())
    steps.push_back(std_constant_index(step));

  // 2. Emit alloc-copy-load-dealloc.
  MLIRContext *ctx = op->getContext();
  Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer);
  StdIndexedValue local(tmp);
  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
    auto ivsStorage = llvm::to_vector<8>(loopIvs);
    // Swap the ivs which will reorder memory accesses.
    if (coalescedIdx >= 0)
      std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]);

    ArrayRef<Value> ivs(ivsStorage);
    Value pos = std_index_cast(IntegerType::get(32, ctx), ivs.back());
    Value inVector = local(ivs.drop_back());
    auto loadValue = [&](ArrayRef<Value> indices) {
      Value vector = vector_insert_element(remote(indices), inVector, pos);
      local(ivs.drop_back()) = vector;
    };
    auto loadPadding = [&](ArrayRef<Value>) {
      Value vector = vector_insert_element(transfer.padding(), inVector, pos);
      local(ivs.drop_back()) = vector;
    };
    emitWithBoundsChecks(
        rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()), ivs,
        memRefBoundsCapture, loadValue, loadPadding);
  });
  Value vectorValue = std_load(vector_type_cast(tmp));

  // 3. Propagate.
  rewriter.replaceOp(op, vectorValue);
  return success();
}

/// Lowers TransferWriteOp into a combination of:
///   1. local memory allocation;
///   2. vector_store to local buffer (viewed as a memref<1 x vector>);
///   3. perfect loop nest over:
///      a. scalar load from the local buffer (viewed as a scalar memref);
///      b. scalar store to the original memref (if in bounds).
///   4. local memory deallocation (implicit: the buffer is stack-allocated at
///      function entry).
///
/// More specifically, lowers the data transfer part while ensuring no
/// out-of-bounds accesses are possible.
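///
/// In pseudo-IR (a sketch only; names are illustrative), a 1-D write of
/// vector<N x type> %vec to %A at offset %off that does not match the fast
/// path above resembles:
///
/// ```
///   %tmp = alloca() : memref<vector<N x type>>
///   store %vec, %tmp
///   for %i = 0 to N {
///     if (%off + %i < dim(%A, 0)) {
///       %s = // extract element %i of the vector staged in %tmp
///       store %s, %A[%off + %i]
///     }
///   }
/// ```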
template <>
LogicalResult VectorTransferRewriter<TransferWriteOp>::matchAndRewrite(
    Operation *op, PatternRewriter &rewriter) const {
  using namespace edsc::op;

  TransferWriteOp transfer = cast<TransferWriteOp>(op);

  // Fall back to a loop if the fastest varying stride is not 1 or it is
  // permuted.
  int64_t offset;
  SmallVector<int64_t, 4> strides;
  auto successStrides =
      getStridesAndOffset(transfer.getMemRefType(), strides, offset);
  if (succeeded(successStrides) && strides.back() == 1 &&
      transfer.permutation_map().isMinorIdentity()) {
    // If > 1D, emit a bunch of loops around 1-D vector transfers.
    if (transfer.getVectorType().getRank() > 1)
      return NDTransferOpHelper<TransferWriteOp>(rewriter, transfer, options)
          .doReplace();
    // If 1-D this is now handled by the target-specific lowering.
    if (transfer.getVectorType().getRank() == 1)
      return failure();
  }

  // 1. Setup all the captures.
  ScopedContext scope(rewriter, transfer.getLoc());
  StdIndexedValue remote(transfer.memref());
  MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
  Value vectorValue(transfer.vector());
  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
  int coalescedIdx = computeCoalescedIndex(transfer);
  // Swap the vectorBoundsCapture which will reorder loop bounds.
  if (coalescedIdx >= 0)
    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
                                   coalescedIdx);

  auto lbs = vectorBoundsCapture.getLbs();
  auto ubs = vectorBoundsCapture.getUbs();
  SmallVector<Value, 8> steps;
  steps.reserve(vectorBoundsCapture.getSteps().size());
  for (auto step : vectorBoundsCapture.getSteps())
    steps.push_back(std_constant_index(step));

  // 2. Emit alloc-store-copy-dealloc.
  Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer);
  StdIndexedValue local(tmp);
  Value vec = vector_type_cast(tmp);
  std_store(vectorValue, vec);
  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
    auto ivsStorage = llvm::to_vector<8>(loopIvs);
    // Swap the ivsStorage which will reorder memory accesses.
    if (coalescedIdx >= 0)
      std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]);

    ArrayRef<Value> ivs(ivsStorage);
    Value pos =
        std_index_cast(IntegerType::get(32, op->getContext()), ivs.back());
    auto storeValue = [&](ArrayRef<Value> indices) {
      Value scalar = vector_extract_element(local(ivs.drop_back()), pos);
      remote(indices) = scalar;
    };
    emitWithBoundsChecks(
        rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()), ivs,
        memRefBoundsCapture, storeValue);
  });

  // 3. Erase.
  rewriter.eraseOp(op);
  return success();
}

void populateVectorToSCFConversionPatterns(
    OwningRewritePatternList &patterns, MLIRContext *context,
    const VectorTransferToSCFOptions &options) {
  patterns.insert<VectorTransferRewriter<vector::TransferReadOp>,
                  VectorTransferRewriter<vector::TransferWriteOp>>(options,
                                                                   context);
}

} // namespace mlir

namespace {

struct ConvertVectorToSCFPass
    : public ConvertVectorToSCFBase<ConvertVectorToSCFPass> {
  ConvertVectorToSCFPass() = default;
  ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
    this->fullUnroll = options.unroll;
  }

  void runOnFunction() override {
    OwningRewritePatternList patterns;
    auto *context = getFunction().getContext();
    populateVectorToSCFConversionPatterns(
        patterns, context, VectorTransferToSCFOptions().setUnroll(fullUnroll));
    applyPatternsAndFoldGreedily(getFunction(), std::move(patterns));
  }
};

} // namespace

std::unique_ptr<Pass>
mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
  return std::make_unique<ConvertVectorToSCFPass>(options);
}