//===- VectorToSCF.cpp - Conversion from Vector to mix of SCF and Std -----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements target-dependent lowering of vector transfer operations.
//
//===----------------------------------------------------------------------===//

#include <type_traits>

#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"

#include "../PassDetail.h"
#include "mlir/Dialect/Affine/EDSC/Intrinsics.h"
#include "mlir/Dialect/SCF/EDSC/Builders.h"
#include "mlir/Dialect/SCF/EDSC/Intrinsics.h"
#include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/EDSC/Intrinsics.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"

using namespace mlir;
using namespace mlir::edsc;
using namespace mlir::edsc::intrinsics;
using vector::TransferReadOp;
using vector::TransferWriteOp;

// Return a list of Values that correspond to multiple AffineApplyOp, one for
// each result of `map`. Each `expr` in `map` is canonicalized and folded
// greedily according to its operands.
// TODO: factor out in a common location that both linalg and vector can use.
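//
// Illustrative sketch only (the map and value names are hypothetical): for a
// map `(d0, d1) -> (d0 + d1, d1)` applied to values %a and %b, this returns
// one Value per map result, e.g.
//
//   SmallVector<Value, 4> results =
//       applyMapToValues(b, loc, map, ValueRange{a, b});
//   // results[0] ~ affine.apply (d0, d1) -> (d0 + d1)(%a, %b)
//   // results[1] ~ %b  (folds to an existing Value)
//
// When all operands are constants, the affine.apply folds away entirely.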
static SmallVector<Value, 4>
applyMapToValues(OpBuilder &b, Location loc, AffineMap map, ValueRange values) {
  SmallVector<Value, 4> res;
  res.reserve(map.getNumResults());
  unsigned numDims = map.getNumDims(), numSym = map.getNumSymbols();
  // For each `expr` in `map`, applies the `expr` to the values extracted from
  // ranges. If the resulting application can be folded into a Value, the
  // folding occurs eagerly. Otherwise, an affine.apply operation is emitted.
  for (auto expr : map.getResults()) {
    AffineMap map = AffineMap::get(numDims, numSym, expr);
    SmallVector<Value, 4> operands(values.begin(), values.end());
    fullyComposeAffineMapAndOperands(&map, &operands);
    canonicalizeMapAndOperands(&map, &operands);
    res.push_back(b.createOrFold<AffineApplyOp>(loc, map, operands));
  }
  return res;
}

namespace {
/// Helper class captures the common information needed to lower N>1-D vector
/// transfer operations (read and write).
/// On construction, this class opens an edsc::ScopedContext for simpler IR
/// manipulation.
/// In pseudo-IR, for an n-D vector_transfer_read such as:
///
/// ```
///   vector_transfer_read(%m, %offsets, identity_map, %fill) :
///     memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
///     vector<(major_dims) x (minor_dims) x type>
/// ```
///
/// where rank(minor_dims) is the lower-level vector rank (e.g. 1 for LLVM or
/// higher).
///
/// This is the entry point to emitting pseudo-IR resembling:
///
/// ```
///   %tmp = alloc(): memref<(major_dims) x vector<minor_dim x type>>
///   for (%ivs_major, {0}, {vector_shape}, {1}) { // (N-1)-D loop nest
///     if (any_of(%ivs_major + %offsets, <, major_dims)) {
///       %v = vector_transfer_read(
///         {%offsets_leading, %ivs_major + %offsets_major, %offsets_minor},
///          %ivs_minor):
///         memref<(leading_dims) x (major_dims) x (minor_dims) x type>,
///         vector<(minor_dims) x type>;
///       store(%v, %tmp);
///     } else {
///       %v = splat(vector<(minor_dims) x type>, %fill)
///       store(%v, %tmp, %ivs_major);
///     }
///   }
///   %res = load(%tmp, %0): memref<(major_dims) x vector<minor_dim x type>>:
///     vector<(major_dims) x (minor_dims) x type>
/// ```
///
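/// As a purely illustrative sketch (the shapes below are hypothetical, not
/// taken from this file), a read of vector<3x15xf32> from memref<?x?x15xf32>
/// with a single leading dimension would roughly become:
///
/// ```
///   %tmp = alloca() : memref<3xvector<15xf32>>
///   affine.for %i = 0 to 3 {
///     scf.if <%i + %offset_major in bounds> {
///       %v = vector.transfer_read ... : memref<?x?x15xf32>, vector<15xf32>
///       store %v, %tmp[%i] : memref<3xvector<15xf32>>
///     } else {
///       %v = splat %fill : vector<15xf32>
///       store %v, %tmp[%i] : memref<3xvector<15xf32>>
///     }
///   }
///   %res = load <0-d cast of %tmp> : vector<3x15xf32>
/// ```
///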
template <typename ConcreteOp>
class NDTransferOpHelper {
public:
  NDTransferOpHelper(PatternRewriter &rewriter, ConcreteOp xferOp,
                     const VectorTransferToSCFOptions &options)
      : rewriter(rewriter), options(options), loc(xferOp.getLoc()),
        scope(std::make_unique<ScopedContext>(rewriter, loc)), xferOp(xferOp),
        op(xferOp.getOperation()) {
    vectorType = xferOp.getVectorType();
    // TODO: when we go to k > 1-D vectors adapt minorRank.
    minorRank = 1;
    majorRank = vectorType.getRank() - minorRank;
    leadingRank = xferOp.getLeadingMemRefRank();
    majorVectorType =
        VectorType::get(vectorType.getShape().take_front(majorRank),
                        vectorType.getElementType());
    minorVectorType =
        VectorType::get(vectorType.getShape().take_back(minorRank),
                        vectorType.getElementType());
    /// Memref of minor vector type is used for individual transfers.
    memRefMinorVectorType =
        MemRefType::get(majorVectorType.getShape(), minorVectorType, {},
                        xferOp.getMemRefType().getMemorySpace());
  }

  LogicalResult doReplace();

private:
  /// Creates the loop nest on the "major" dimensions and calls the
  /// `loopBodyBuilder` lambda in the context of the loop nest.
  void
  emitLoops(llvm::function_ref<void(ValueRange, ValueRange, ValueRange,
                                    ValueRange, const MemRefBoundsCapture &)>
                loopBodyBuilder);

  /// Common state to lower vector transfer ops.
  PatternRewriter &rewriter;
  const VectorTransferToSCFOptions &options;
  Location loc;
  std::unique_ptr<ScopedContext> scope;
  ConcreteOp xferOp;
  Operation *op;
  // A vector transfer copies data between:
  //   - memref<(leading_dims) x (major_dims) x (minor_dims) x type>
  //   - vector<(major_dims) x (minor_dims) x type>
  unsigned minorRank;         // for now always 1
  unsigned majorRank;         // vector rank - minorRank
  unsigned leadingRank;       // memref rank - vector rank
  VectorType vectorType;      // vector<(major_dims) x (minor_dims) x type>
  VectorType majorVectorType; // vector<(major_dims) x type>
  VectorType minorVectorType; // vector<(minor_dims) x type>
  MemRefType memRefMinorVectorType; // memref<vector<(minor_dims) x type>>
};

template <typename ConcreteOp>
void NDTransferOpHelper<ConcreteOp>::emitLoops(
    llvm::function_ref<void(ValueRange, ValueRange, ValueRange, ValueRange,
                            const MemRefBoundsCapture &)>
        loopBodyBuilder) {
  /// Loop nest operates on the major dimensions
  MemRefBoundsCapture memrefBoundsCapture(xferOp.memref());

  if (options.unroll) {
    auto shape = majorVectorType.getShape();
    auto strides = computeStrides(shape);
    unsigned numUnrolledInstances = computeMaxLinearIndex(shape);
    ValueRange indices(xferOp.indices());
    for (unsigned idx = 0; idx < numUnrolledInstances; ++idx) {
      SmallVector<int64_t, 4> offsets = delinearize(strides, idx);
      SmallVector<Value, 4> offsetValues =
          llvm::to_vector<4>(llvm::map_range(offsets, [](int64_t off) -> Value {
            return std_constant_index(off);
          }));
      loopBodyBuilder(offsetValues, indices.take_front(leadingRank),
                      indices.drop_front(leadingRank).take_front(majorRank),
                      indices.take_back(minorRank), memrefBoundsCapture);
    }
  } else {
    VectorBoundsCapture vectorBoundsCapture(majorVectorType);
    auto majorLbs = vectorBoundsCapture.getLbs();
    auto majorUbs = vectorBoundsCapture.getUbs();
    auto majorSteps = vectorBoundsCapture.getSteps();
    affineLoopNestBuilder(
        majorLbs, majorUbs, majorSteps, [&](ValueRange majorIvs) {
          ValueRange indices(xferOp.indices());
          loopBodyBuilder(majorIvs, indices.take_front(leadingRank),
                          indices.drop_front(leadingRank).take_front(majorRank),
                          indices.take_back(minorRank), memrefBoundsCapture);
        });
  }
}

static Optional<int64_t> extractConstantIndex(Value v) {
  if (auto cstOp = v.getDefiningOp<ConstantIndexOp>())
    return cstOp.getValue();
  if (auto affineApplyOp = v.getDefiningOp<AffineApplyOp>())
    if (affineApplyOp.getAffineMap().isSingleConstant())
      return affineApplyOp.getAffineMap().getSingleConstantResult();
  return None;
}

// Missing foldings of scf.if make it necessary to perform poor man's folding
// eagerly, especially in the case of unrolling. In the future, this should go
// away once scf.if folds properly.
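//
// For example (illustrative values only): if `v` is defined by
// `constant 3 : index` and `ub` by `constant 8 : index`, the comparison is
// statically true and a null Value is returned, meaning "no guard needed";
// otherwise an i1 `slt` comparison is materialized:
//
//   Value cond = onTheFlyFoldSLT(iv, ub); // null if provably in bounds
//   if (cond) { /* wrap the access in an scf.if on `cond` */ }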
static Value onTheFlyFoldSLT(Value v, Value ub) {
  using namespace mlir::edsc::op;
  auto maybeCstV = extractConstantIndex(v);
  auto maybeCstUb = extractConstantIndex(ub);
  if (maybeCstV && maybeCstUb && *maybeCstV < *maybeCstUb)
    return Value();
  return slt(v, ub);
}

/// 1. Compute the indexings `majorIvs + majorOffsets` and save them in
///    `majorIvsPlusOffsets`.
/// 2. Return a value of i1 that determines whether the first `majorIvs.rank()`
///    dimensions of `majorIvs + majorOffsets` are all within `memrefBounds`.
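///
/// For instance (illustrative only), with two masked major dimensions the
/// returned condition is morally
///   ((%iv0 + %off0) slt %ub0) && ((%iv1 + %off1) slt %ub1)
/// where statically provable conjuncts are folded away; a null return Value
/// means the access is provably in bounds.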
static Value
emitInBoundsCondition(PatternRewriter &rewriter,
                      VectorTransferOpInterface xferOp, unsigned leadingRank,
                      ValueRange majorIvs, ValueRange majorOffsets,
                      const MemRefBoundsCapture &memrefBounds,
                      SmallVectorImpl<Value> &majorIvsPlusOffsets) {
  Value inBoundsCondition;
  majorIvsPlusOffsets.reserve(majorIvs.size());
  unsigned idx = 0;
  SmallVector<Value, 4> bounds =
      applyMapToValues(rewriter, xferOp.getLoc(), xferOp.permutation_map(),
                       memrefBounds.getUbs());
  for (auto it : llvm::zip(majorIvs, majorOffsets, bounds)) {
    Value iv = std::get<0>(it), off = std::get<1>(it), ub = std::get<2>(it);
    using namespace mlir::edsc::op;
    majorIvsPlusOffsets.push_back(iv + off);
    if (xferOp.isMaskedDim(leadingRank + idx)) {
      Value inBoundsCond = onTheFlyFoldSLT(majorIvsPlusOffsets.back(), ub);
      if (inBoundsCond)
        inBoundsCondition = (inBoundsCondition)
                                ? (inBoundsCondition && inBoundsCond)
                                : inBoundsCond;
    }
    ++idx;
  }
  return inBoundsCondition;
}

// TODO: Parallelism and threadlocal considerations.
static Value setAllocAtFunctionEntry(MemRefType memRefMinorVectorType,
                                     Operation *op) {
  auto &b = ScopedContext::getBuilderRef();
  OpBuilder::InsertionGuard guard(b);
  Operation *scope =
      op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
  assert(scope && "Expected op to be inside automatic allocation scope");
  b.setInsertionPointToStart(&scope->getRegion(0).front());
  Value res = std_alloca(memRefMinorVectorType);
  return res;
}
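
// Note (informal, based on the code above): the temporary is an `alloca`
// hoisted to the entry block of the closest enclosing op with the
// AutomaticAllocationScope trait (typically the enclosing func), so its
// lifetime is tied to that scope rather than to the loop nest, e.g.
//
//   func @xfer(...) {
//     %tmp = alloca() : memref<(major_dims) x vector<(minor_dims) x type>>
//     ... loop nest reading/writing %tmp ...
//   }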

template <>
LogicalResult NDTransferOpHelper<TransferReadOp>::doReplace() {
  Value alloc, result;
  if (options.unroll)
    result = std_splat(vectorType, xferOp.padding());
  else
    alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);

  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
                ValueRange majorOffsets, ValueRange minorOffsets,
                const MemRefBoundsCapture &memrefBounds) {
    /// Lambda to load 1-D vector in the current loop ivs + offset context.
    auto load1DVector = [&](ValueRange majorIvsPlusOffsets) -> Value {
      SmallVector<Value, 8> indexing;
      indexing.reserve(leadingRank + majorRank + minorRank);
      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
      indexing.append(minorOffsets.begin(), minorOffsets.end());
      Value memref = xferOp.memref();
      auto map =
          getTransferMinorIdentityMap(xferOp.getMemRefType(), minorVectorType);
      ArrayAttr masked;
      if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) {
        OpBuilder &b = ScopedContext::getBuilderRef();
        masked = b.getBoolArrayAttr({false});
      }
      return vector_transfer_read(minorVectorType, memref, indexing,
                                  AffineMapAttr::get(map), xferOp.padding(),
                                  masked);
    };

    // 1. Compute the inBoundsCondition in the current loops ivs + offset
    // context.
    SmallVector<Value, 4> majorIvsPlusOffsets;
    Value inBoundsCondition = emitInBoundsCondition(
        rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()),
        leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);

    if (inBoundsCondition) {
      // 2. If the condition is not null, we need an IfOp, which may yield
      // if `options.unroll` is true.
      SmallVector<Type, 1> resultType;
      if (options.unroll)
        resultType.push_back(vectorType);

      // 3. If in-bounds, progressively lower to a 1-D transfer read, otherwise
      // splat a 1-D vector.
      ValueRange ifResults = conditionBuilder(
          resultType, inBoundsCondition,
          [&]() -> scf::ValueVector {
            Value vector = load1DVector(majorIvsPlusOffsets);
            // 3.a. If `options.unroll` is true, insert the 1-D vector in the
            // aggregate. We must yield and merge with the `else` branch.
            if (options.unroll) {
              vector = vector_insert(vector, result, majorIvs);
              return {vector};
            }
            // 3.b. Otherwise, just go through the temporary `alloc`.
            std_store(vector, alloc, majorIvs);
            return {};
          },
          [&]() -> scf::ValueVector {
            Value vector = std_splat(minorVectorType, xferOp.padding());
            // 3.c. If `options.unroll` is true, insert the 1-D vector in the
            // aggregate. We must yield and merge with the `then` branch.
            if (options.unroll) {
              vector = vector_insert(vector, result, majorIvs);
              return {vector};
            }
            // 3.d. Otherwise, just go through the temporary `alloc`.
            std_store(vector, alloc, majorIvs);
            return {};
          });

      if (!resultType.empty())
        result = *ifResults.begin();
    } else {
      // 4. Guaranteed in-bounds, progressively lower to a 1-D transfer read.
      Value loaded1D = load1DVector(majorIvsPlusOffsets);
      // 5.a. If `options.unroll` is true, insert the 1-D vector in the
      // aggregate.
      if (options.unroll)
        result = vector_insert(loaded1D, result, majorIvs);
      // 5.b. Otherwise, just go through the temporary `alloc`.
      else
        std_store(loaded1D, alloc, majorIvs);
    }
  });

  assert((!options.unroll ^ (bool)result) &&
         "Expected resulting Value iff unroll");
  if (!result)
    result = std_load(vector_type_cast(MemRefType::get({}, vectorType), alloc));
  rewriter.replaceOp(op, result);

  return success();
}

template <>
LogicalResult NDTransferOpHelper<TransferWriteOp>::doReplace() {
  Value alloc;
  if (!options.unroll) {
    alloc = setAllocAtFunctionEntry(memRefMinorVectorType, op);
    std_store(xferOp.vector(),
              vector_type_cast(MemRefType::get({}, vectorType), alloc));
  }

  emitLoops([&](ValueRange majorIvs, ValueRange leadingOffsets,
                ValueRange majorOffsets, ValueRange minorOffsets,
                const MemRefBoundsCapture &memrefBounds) {
    // Lower to 1-D vector_transfer_write and let recursion handle it.
    auto emitTransferWrite = [&](ValueRange majorIvsPlusOffsets) {
      SmallVector<Value, 8> indexing;
      indexing.reserve(leadingRank + majorRank + minorRank);
      indexing.append(leadingOffsets.begin(), leadingOffsets.end());
      indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end());
      indexing.append(minorOffsets.begin(), minorOffsets.end());
      Value result;
      // If `options.unroll` is true, extract the 1-D vector from the
      // aggregate.
      if (options.unroll)
        result = vector_extract(xferOp.vector(), majorIvs);
      else
        result = std_load(alloc, majorIvs);
      auto map =
          getTransferMinorIdentityMap(xferOp.getMemRefType(), minorVectorType);
      ArrayAttr masked;
      if (!xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) {
        OpBuilder &b = ScopedContext::getBuilderRef();
        masked = b.getBoolArrayAttr({false});
      }
      vector_transfer_write(result, xferOp.memref(), indexing,
                            AffineMapAttr::get(map), masked);
    };

    // 1. Compute the inBoundsCondition in the current loops ivs + offset
    // context.
    SmallVector<Value, 4> majorIvsPlusOffsets;
    Value inBoundsCondition = emitInBoundsCondition(
        rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()),
        leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);

    if (inBoundsCondition) {
      // 2.a. If the condition is not null, we need an IfOp, to write
      // conditionally. Progressively lower to a 1-D transfer write.
      conditionBuilder(inBoundsCondition,
                       [&] { emitTransferWrite(majorIvsPlusOffsets); });
    } else {
      // 2.b. Guaranteed in-bounds. Progressively lower to a 1-D transfer write.
      emitTransferWrite(majorIvsPlusOffsets);
    }
  });

  rewriter.eraseOp(op);

  return success();
}

} // namespace

/// Analyzes the `transfer` to find an access dimension along the fastest
/// remote MemRef dimension. If such a dimension with coalescing properties is
/// found, the loop induction variables and `vectorBoundsCapture` are swapped
/// so that the loop nest builder captures it in the innermost loop.
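///
/// For example (hypothetical map): with a 3-D memref and permutation map
/// (d0, d1, d2) -> (d2, d1), the result expression `d2` accesses the
/// innermost (fastest varying) memref dimension, so the coalesced index is 0;
/// -1 is returned when no result maps to that dimension.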
template <typename TransferOpTy>
static int computeCoalescedIndex(TransferOpTy transfer) {
  // Rank of the remote memory access; coalescing behavior occurs on the
  // innermost memory dimension.
  auto remoteRank = transfer.getMemRefType().getRank();
  // Iterate over the result expressions of the permutation map to determine
  // the loop order for creating pointwise copies between remote and local
  // memories.
  int coalescedIdx = -1;
  auto exprs = transfer.permutation_map().getResults();
  for (auto en : llvm::enumerate(exprs)) {
    auto dim = en.value().template dyn_cast<AffineDimExpr>();
    if (!dim) {
      continue;
    }
    auto memRefDim = dim.getPosition();
    if (memRefDim == remoteRank - 1) {
      // memRefDim has coalescing properties, it should be swapped into the
      // last position.
      assert(coalescedIdx == -1 && "Unexpected > 1 coalesced indices");
      coalescedIdx = en.index();
    }
  }
  return coalescedIdx;
}

template <typename TransferOpTy>
VectorTransferRewriter<TransferOpTy>::VectorTransferRewriter(
    VectorTransferToSCFOptions options, MLIRContext *context)
    : RewritePattern(TransferOpTy::getOperationName(), 1, context),
      options(options) {}

/// Used for staging the transfer in a local buffer.
template <typename TransferOpTy>
MemRefType VectorTransferRewriter<TransferOpTy>::tmpMemRefType(
    TransferOpTy transfer) const {
  auto vectorType = transfer.getVectorType();
  return MemRefType::get(vectorType.getShape().drop_back(),
                         VectorType::get(vectorType.getShape().take_back(),
                                         vectorType.getElementType()),
                         {}, 0);
}
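
// For example (illustrative shape): for a transfer of vector<3x4x8xf32>, the
// staging buffer type produced here is memref<3x4xvector<8xf32>>, i.e. the
// minor dimension becomes the element vector type.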

static void emitWithBoundsChecks(
    PatternRewriter &rewriter, VectorTransferOpInterface transfer,
    ValueRange ivs, const MemRefBoundsCapture &memRefBoundsCapture,
    function_ref<void(ArrayRef<Value>)> inBoundsFun,
    function_ref<void(ArrayRef<Value>)> outOfBoundsFun = nullptr) {
  // Permute the incoming indices according to the permutation map.
  SmallVector<Value, 4> indices =
      applyMapToValues(rewriter, transfer.getLoc(), transfer.permutation_map(),
                       transfer.indices());

  // Generate a bounds check if necessary.
  SmallVector<Value, 4> majorIvsPlusOffsets;
  Value inBoundsCondition =
      emitInBoundsCondition(rewriter, transfer, 0, ivs, indices,
                            memRefBoundsCapture, majorIvsPlusOffsets);

  // Apply the permutation map to the ivs. The permutation map may not use all
  // the inputs.
  SmallVector<Value, 4> scalarAccessExprs(transfer.indices().size());
  for (unsigned memRefDim = 0; memRefDim < transfer.indices().size();
       ++memRefDim) {
    // Linear search on a small number of entries.
    int loopIndex = -1;
    auto exprs = transfer.permutation_map().getResults();
    for (auto en : llvm::enumerate(exprs)) {
      auto expr = en.value();
      auto dim = expr.dyn_cast<AffineDimExpr>();
      // Sanity check.
      assert((dim || expr.cast<AffineConstantExpr>().getValue() == 0) &&
             "Expected dim or 0 in permutationMap");
      if (dim && memRefDim == dim.getPosition()) {
        loopIndex = en.index();
        break;
      }
    }

    using namespace edsc::op;
    auto i = transfer.indices()[memRefDim];
    scalarAccessExprs[memRefDim] = loopIndex < 0 ? i : i + ivs[loopIndex];
  }

  if (inBoundsCondition)
    conditionBuilder(
        /* scf.if */ inBoundsCondition, // {
        [&] { inBoundsFun(scalarAccessExprs); },
        // } else {
        outOfBoundsFun ? [&] { outOfBoundsFun(scalarAccessExprs); }
                       : function_ref<void()>()
        // }
    );
  else
    inBoundsFun(scalarAccessExprs);
}

namespace mlir {

/// Lowers TransferReadOp into a combination of:
///   1. local memory allocation;
///   2. perfect loop nest over:
///      a. scalar load from the original memref (with padding when out of
///         bounds);
///      b. scalar store to the local buffer (viewed as a scalar memref);
///   3. vector_load from local buffer (viewed as a memref<1 x vector>);
///   4. local memory deallocation.
///
/// Lowers the data transfer part of a TransferReadOp while ensuring no
/// out-of-bounds accesses are possible. Out-of-bounds behavior is handled by
/// padding.
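///
/// As an informal sketch only (names and shapes below are hypothetical), the
/// scalar fallback for a 1-D read resembles:
///
/// ```
///   %tmp = alloca() : memref<vector<8xf32>>
///   scf.for %i = %c0 to %c8 step %c1 {
///     scf.if <%i + %offset in bounds> {
///       %s = load %A[...]   // scalar load from the original memref
///       ... insert %s into the staged vector at lane %i ...
///     } else {
///       ... insert the padding value instead ...
///     }
///   }
///   %res = load <0-d cast of %tmp> : vector<8xf32>
/// ```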

/// Performs the rewrite.
template <>
LogicalResult VectorTransferRewriter<TransferReadOp>::matchAndRewrite(
    Operation *op, PatternRewriter &rewriter) const {
  using namespace mlir::edsc::op;

  TransferReadOp transfer = cast<TransferReadOp>(op);

  // Fall back to a loop if the fastest varying stride is not 1 or it is
  // permuted.
  int64_t offset;
  SmallVector<int64_t, 4> strides;
  auto successStrides =
      getStridesAndOffset(transfer.getMemRefType(), strides, offset);
  if (succeeded(successStrides) && strides.back() == 1 &&
      transfer.permutation_map().isMinorIdentity()) {
    // If > 1D, emit a bunch of loops around 1-D vector transfers.
    if (transfer.getVectorType().getRank() > 1)
      return NDTransferOpHelper<TransferReadOp>(rewriter, transfer, options)
          .doReplace();
    // If 1-D this is now handled by the target-specific lowering.
    if (transfer.getVectorType().getRank() == 1)
      return failure();
  }

  // Conservative lowering to scalar load / stores.
  // 1. Setup all the captures.
  ScopedContext scope(rewriter, transfer.getLoc());
  StdIndexedValue remote(transfer.memref());
  MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
  int coalescedIdx = computeCoalescedIndex(transfer);
  // Swap the vectorBoundsCapture which will reorder loop bounds.
  if (coalescedIdx >= 0)
    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
                                   coalescedIdx);

  auto lbs = vectorBoundsCapture.getLbs();
  auto ubs = vectorBoundsCapture.getUbs();
  SmallVector<Value, 8> steps;
  steps.reserve(vectorBoundsCapture.getSteps().size());
  for (auto step : vectorBoundsCapture.getSteps())
    steps.push_back(std_constant_index(step));

  // 2. Emit alloc-copy-load-dealloc.
  MLIRContext *ctx = op->getContext();
  Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer);
  StdIndexedValue local(tmp);
  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
    auto ivsStorage = llvm::to_vector<8>(loopIvs);
    // Swap the ivs which will reorder memory accesses.
    if (coalescedIdx >= 0)
      std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]);

    ArrayRef<Value> ivs(ivsStorage);
    Value pos = std_index_cast(IntegerType::get(32, ctx), ivs.back());
    Value inVector = local(ivs.drop_back());
    auto loadValue = [&](ArrayRef<Value> indices) {
      Value vector = vector_insert_element(remote(indices), inVector, pos);
      local(ivs.drop_back()) = vector;
    };
    auto loadPadding = [&](ArrayRef<Value>) {
      Value vector = vector_insert_element(transfer.padding(), inVector, pos);
      local(ivs.drop_back()) = vector;
    };
    emitWithBoundsChecks(
        rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()), ivs,
        memRefBoundsCapture, loadValue, loadPadding);
  });
  Value vectorValue = std_load(vector_type_cast(tmp));

  // 3. Propagate.
  rewriter.replaceOp(op, vectorValue);
  return success();
}

/// Lowers TransferWriteOp into a combination of:
///   1. local memory allocation;
///   2. vector_store to local buffer (viewed as a memref<1 x vector>);
///   3. perfect loop nest over:
///      a. scalar load from the local buffer (viewed as a scalar memref);
///      b. scalar store to the original memref (if in bounds).
///   4. local memory deallocation.
///
/// More specifically, lowers the data transfer part while ensuring no
/// out-of-bounds accesses are possible.
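///
/// As an informal sketch only (hypothetical 1-D shape):
///
/// ```
///   %tmp = alloca() : memref<vector<8xf32>>
///   store %vec, <0-d cast of %tmp>
///   scf.for %i = %c0 to %c8 step %c1 {
///     scf.if <%i + %offset in bounds> {
///       %s = ... extract lane %i from the staged vector ...
///       store %s, %A[...]   // scalar store to the original memref
///     }
///   }
/// ```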
template <>
LogicalResult VectorTransferRewriter<TransferWriteOp>::matchAndRewrite(
    Operation *op, PatternRewriter &rewriter) const {
  using namespace edsc::op;

  TransferWriteOp transfer = cast<TransferWriteOp>(op);

  // Fall back to a loop if the fastest varying stride is not 1 or it is
  // permuted.
  int64_t offset;
  SmallVector<int64_t, 4> strides;
  auto successStrides =
      getStridesAndOffset(transfer.getMemRefType(), strides, offset);
  if (succeeded(successStrides) && strides.back() == 1 &&
      transfer.permutation_map().isMinorIdentity()) {
    // If > 1D, emit a bunch of loops around 1-D vector transfers.
    if (transfer.getVectorType().getRank() > 1)
      return NDTransferOpHelper<TransferWriteOp>(rewriter, transfer, options)
          .doReplace();
    // If 1-D this is now handled by the target-specific lowering.
    if (transfer.getVectorType().getRank() == 1)
      return failure();
  }

  // 1. Setup all the captures.
  ScopedContext scope(rewriter, transfer.getLoc());
  StdIndexedValue remote(transfer.memref());
  MemRefBoundsCapture memRefBoundsCapture(transfer.memref());
  Value vectorValue(transfer.vector());
  VectorBoundsCapture vectorBoundsCapture(transfer.vector());
  int coalescedIdx = computeCoalescedIndex(transfer);
  // Swap the vectorBoundsCapture which will reorder loop bounds.
  if (coalescedIdx >= 0)
    vectorBoundsCapture.swapRanges(vectorBoundsCapture.rank() - 1,
                                   coalescedIdx);

  auto lbs = vectorBoundsCapture.getLbs();
  auto ubs = vectorBoundsCapture.getUbs();
  SmallVector<Value, 8> steps;
  steps.reserve(vectorBoundsCapture.getSteps().size());
  for (auto step : vectorBoundsCapture.getSteps())
    steps.push_back(std_constant_index(step));

  // 2. Emit alloc-store-copy-dealloc.
  Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer);
  StdIndexedValue local(tmp);
  Value vec = vector_type_cast(tmp);
  std_store(vectorValue, vec);
  loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
    auto ivsStorage = llvm::to_vector<8>(loopIvs);
    // Swap the ivsStorage which will reorder memory accesses.
    if (coalescedIdx >= 0)
      std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]);

    ArrayRef<Value> ivs(ivsStorage);
    Value pos =
        std_index_cast(IntegerType::get(32, op->getContext()), ivs.back());
    auto storeValue = [&](ArrayRef<Value> indices) {
      Value scalar = vector_extract_element(local(ivs.drop_back()), pos);
      remote(indices) = scalar;
    };
    emitWithBoundsChecks(
        rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()), ivs,
        memRefBoundsCapture, storeValue);
  });

  // 3. Erase.
  rewriter.eraseOp(op);
  return success();
}

void populateVectorToSCFConversionPatterns(
    OwningRewritePatternList &patterns, MLIRContext *context,
    const VectorTransferToSCFOptions &options) {
  patterns.insert<VectorTransferRewriter<vector::TransferReadOp>,
                  VectorTransferRewriter<vector::TransferWriteOp>>(options,
                                                                   context);
}

} // namespace mlir

namespace {

struct ConvertVectorToSCFPass
    : public ConvertVectorToSCFBase<ConvertVectorToSCFPass> {
  ConvertVectorToSCFPass() = default;
  ConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
    this->fullUnroll = options.unroll;
  }

  void runOnFunction() override {
    OwningRewritePatternList patterns;
    auto *context = getFunction().getContext();
    populateVectorToSCFConversionPatterns(
        patterns, context, VectorTransferToSCFOptions().setUnroll(fullUnroll));
    applyPatternsAndFoldGreedily(getFunction(), std::move(patterns));
  }
};

} // namespace

std::unique_ptr<Pass>
mlir::createConvertVectorToSCFPass(const VectorTransferToSCFOptions &options) {
  return std::make_unique<ConvertVectorToSCFPass>(options);
}
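
// Usage sketch (not part of this file; the pipeline below is an assumption
// for illustration): the pass is typically scheduled on functions, either
// from the command line via `-convert-vector-to-scf` or from C++:
//
//   PassManager pm(context);
//   pm.addNestedPass<FuncOp>(createConvertVectorToSCFPass(
//       VectorTransferToSCFOptions().setUnroll(true)));
//   LogicalResult status = pm.run(module);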