1 //===- SuperVectorize.cpp - Vectorize Pass Impl ---------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements vectorization of loops, operations and data types to
10 // a target-independent, n-D super-vector abstraction.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "PassDetail.h"
15 #include "mlir/Analysis/LoopAnalysis.h"
16 #include "mlir/Analysis/NestedMatcher.h"
17 #include "mlir/Analysis/SliceAnalysis.h"
18 #include "mlir/Analysis/Utils.h"
19 #include "mlir/Dialect/Affine/IR/AffineOps.h"
20 #include "mlir/Dialect/Affine/Passes.h"
21 #include "mlir/Dialect/Affine/Utils.h"
22 #include "mlir/Dialect/StandardOps/IR/Ops.h"
23 #include "mlir/Dialect/Vector/VectorOps.h"
24 #include "mlir/Dialect/Vector/VectorUtils.h"
25 #include "mlir/IR/AffineExpr.h"
26 #include "mlir/IR/Builders.h"
27 #include "mlir/IR/Location.h"
28 #include "mlir/IR/Types.h"
29 #include "mlir/Support/LLVM.h"
30 #include "mlir/Transforms/FoldUtils.h"
31 
32 #include "llvm/ADT/DenseMap.h"
33 #include "llvm/ADT/DenseSet.h"
34 #include "llvm/ADT/SetVector.h"
35 #include "llvm/ADT/SmallString.h"
36 #include "llvm/ADT/SmallVector.h"
37 #include "llvm/Support/CommandLine.h"
38 #include "llvm/Support/Debug.h"
39 
40 using namespace mlir;
41 using namespace vector;
42 
43 ///
44 /// Implements a high-level vectorization strategy on a Function.
45 /// The abstraction used is that of super-vectors, which provide a single,
46 /// compact, representation in the vector types, information that is expected
47 /// to reduce the impact of the phase ordering problem
48 ///
49 /// Vector granularity:
50 /// ===================
51 /// This pass is designed to perform vectorization at a super-vector
52 /// granularity. A super-vector is loosely defined as a vector type that is a
53 /// multiple of a "good" vector size so the HW can efficiently implement a set
54 /// of high-level primitives. Multiple is understood along any dimension; e.g.
55 /// both vector<16xf32> and vector<2x8xf32> are valid super-vectors for a
56 /// vector<8xf32> HW vector. Note that a "good vector size so the HW can
57 /// efficiently implement a set of high-level primitives" is not necessarily an
58 /// integer multiple of actual hardware registers. We leave details of this
59 /// distinction unspecified for now.
60 ///
61 /// Some may prefer the terminology a "tile of HW vectors". In this case, one
62 /// should note that super-vectors implement an "always full tile" abstraction.
63 /// They guarantee no partial-tile separation is necessary by relying on a
64 /// high-level copy-reshape abstraction that we call vector.transfer. This
65 /// copy-reshape operations is also responsible for performing layout
66 /// transposition if necessary. In the general case this will require a scoped
67 /// allocation in some notional local memory.
68 ///
69 /// Whatever the mental model one prefers to use for this abstraction, the key
70 /// point is that we burn into a single, compact, representation in the vector
71 /// types, information that is expected to reduce the impact of the phase
72 /// ordering problem. Indeed, a vector type conveys information that:
73 ///   1. the associated loops have dependency semantics that do not prevent
74 ///      vectorization;
75 ///   2. the associate loops have been sliced in chunks of static sizes that are
76 ///      compatible with vector sizes (i.e. similar to unroll-and-jam);
77 ///   3. the inner loops, in the unroll-and-jam analogy of 2, are captured by
78 ///   the
79 ///      vector type and no vectorization hampering transformations can be
80 ///      applied to them anymore;
81 ///   4. the underlying memrefs are accessed in some notional contiguous way
82 ///      that allows loading into vectors with some amount of spatial locality;
83 /// In other words, super-vectorization provides a level of separation of
84 /// concern by way of opacity to subsequent passes. This has the effect of
85 /// encapsulating and propagating vectorization constraints down the list of
86 /// passes until we are ready to lower further.
87 ///
88 /// For a particular target, a notion of minimal n-d vector size will be
89 /// specified and vectorization targets a multiple of those. In the following
90 /// paragraph, let "k ." represent "a multiple of", to be understood as a
91 /// multiple in the same dimension (e.g. vector<16 x k . 128> summarizes
92 /// vector<16 x 128>, vector<16 x 256>, vector<16 x 1024>, etc).
93 ///
94 /// Some non-exhaustive notable super-vector sizes of interest include:
95 ///   - CPU: vector<k . HW_vector_size>,
96 ///          vector<k' . core_count x k . HW_vector_size>,
97 ///          vector<socket_count x k' . core_count x k . HW_vector_size>;
98 ///   - GPU: vector<k . warp_size>,
99 ///          vector<k . warp_size x float2>,
100 ///          vector<k . warp_size x float4>,
101 ///          vector<k . warp_size x 4 x 4x 4> (for tensor_core sizes).
102 ///
103 /// Loops and operations are emitted that operate on those super-vector shapes.
104 /// Subsequent lowering passes will materialize to actual HW vector sizes. These
105 /// passes are expected to be (gradually) more target-specific.
106 ///
107 /// At a high level, a vectorized load in a loop will resemble:
108 /// ```mlir
109 ///   affine.for %i = ? to ? step ? {
110 ///     %v_a = vector.transfer_read A[%i] : memref<?xf32>, vector<128xf32>
111 ///   }
112 /// ```
113 /// It is the responsibility of the implementation of vector.transfer_read to
114 /// materialize vector registers from the original scalar memrefs. A later (more
115 /// target-dependent) lowering pass will materialize to actual HW vector sizes.
116 /// This lowering may be occur at different times:
117 ///   1. at the MLIR level into a combination of loops, unrolling, DmaStartOp +
118 ///      DmaWaitOp + vectorized operations for data transformations and shuffle;
119 ///      thus opening opportunities for unrolling and pipelining. This is an
120 ///      instance of library call "whiteboxing"; or
121 ///   2. later in the a target-specific lowering pass or hand-written library
122 ///      call; achieving full separation of concerns. This is an instance of
123 ///      library call; or
124 ///   3. a mix of both, e.g. based on a model.
125 /// In the future, these operations will expose a contract to constrain the
126 /// search on vectorization patterns and sizes.
127 ///
128 /// Occurrence of super-vectorization in the compiler flow:
129 /// =======================================================
130 /// This is an active area of investigation. We start with 2 remarks to position
131 /// super-vectorization in the context of existing ongoing work: LLVM VPLAN
132 /// and LLVM SLP Vectorizer.
133 ///
134 /// LLVM VPLAN:
135 /// -----------
136 /// The astute reader may have noticed that in the limit, super-vectorization
137 /// can be applied at a similar time and with similar objectives than VPLAN.
138 /// For instance, in the case of a traditional, polyhedral compilation-flow (for
139 /// instance, the PPCG project uses ISL to provide dependence analysis,
140 /// multi-level(scheduling + tiling), lifting footprint to fast memory,
141 /// communication synthesis, mapping, register optimizations) and before
142 /// unrolling. When vectorization is applied at this *late* level in a typical
143 /// polyhedral flow, and is instantiated with actual hardware vector sizes,
144 /// super-vectorization is expected to match (or subsume) the type of patterns
145 /// that LLVM's VPLAN aims at targeting. The main difference here is that MLIR
146 /// is higher level and our implementation should be significantly simpler. Also
147 /// note that in this mode, recursive patterns are probably a bit of an overkill
148 /// although it is reasonable to expect that mixing a bit of outer loop and
149 /// inner loop vectorization + unrolling will provide interesting choices to
150 /// MLIR.
151 ///
152 /// LLVM SLP Vectorizer:
153 /// --------------------
154 /// Super-vectorization however is not meant to be usable in a similar fashion
155 /// to the SLP vectorizer. The main difference lies in the information that
156 /// both vectorizers use: super-vectorization examines contiguity of memory
157 /// references along fastest varying dimensions and loops with recursive nested
158 /// patterns capturing imperfectly-nested loop nests; the SLP vectorizer, on
159 /// the other hand, performs flat pattern matching inside a single unrolled loop
160 /// body and stitches together pieces of load and store operations into full
161 /// 1-D vectors. We envision that the SLP vectorizer is a good way to capture
162 /// innermost loop, control-flow dependent patterns that super-vectorization may
163 /// not be able to capture easily. In other words, super-vectorization does not
164 /// aim at replacing the SLP vectorizer and the two solutions are complementary.
165 ///
166 /// Ongoing investigations:
167 /// -----------------------
168 /// We discuss the following *early* places where super-vectorization is
169 /// applicable and touch on the expected benefits and risks . We list the
170 /// opportunities in the context of the traditional polyhedral compiler flow
171 /// described in PPCG. There are essentially 6 places in the MLIR pass pipeline
172 /// we expect to experiment with super-vectorization:
173 /// 1. Right after language lowering to MLIR: this is the earliest time where
174 ///    super-vectorization is expected to be applied. At this level, all the
175 ///    language/user/library-level annotations are available and can be fully
176 ///    exploited. Examples include loop-type annotations (such as parallel,
177 ///    reduction, scan, dependence distance vector, vectorizable) as well as
178 ///    memory access annotations (such as non-aliasing writes guaranteed,
179 ///    indirect accesses that are permutations by construction) accesses or
180 ///    that a particular operation is prescribed atomic by the user. At this
181 ///    level, anything that enriches what dependence analysis can do should be
182 ///    aggressively exploited. At this level we are close to having explicit
183 ///    vector types in the language, except we do not impose that burden on the
184 ///    programmer/library: we derive information from scalar code + annotations.
185 /// 2. After dependence analysis and before polyhedral scheduling: the
186 ///    information that supports vectorization does not need to be supplied by a
187 ///    higher level of abstraction. Traditional dependence analysis is available
188 ///    in MLIR and will be used to drive vectorization and cost models.
189 ///
190 /// Let's pause here and remark that applying super-vectorization as described
191 /// in 1. and 2. presents clear opportunities and risks:
192 ///   - the opportunity is that vectorization is burned in the type system and
193 ///   is protected from the adverse effect of loop scheduling, tiling, loop
194 ///   interchange and all passes downstream. Provided that subsequent passes are
195 ///   able to operate on vector types; the vector shapes, associated loop
196 ///   iterator properties, alignment, and contiguity of fastest varying
197 ///   dimensions are preserved until we lower the super-vector types. We expect
198 ///   this to significantly rein in on the adverse effects of phase ordering.
199 ///   - the risks are that a. all passes after super-vectorization have to work
200 ///   on elemental vector types (not that this is always true, wherever
201 ///   vectorization is applied) and b. that imposing vectorization constraints
202 ///   too early may be overall detrimental to loop fusion, tiling and other
203 ///   transformations because the dependence distances are coarsened when
204 ///   operating on elemental vector types. For this reason, the pattern
205 ///   profitability analysis should include a component that also captures the
206 ///   maximal amount of fusion available under a particular pattern. This is
207 ///   still at the stage of rough ideas but in this context, search is our
208 ///   friend as the Tensor Comprehensions and auto-TVM contributions
209 ///   demonstrated previously.
210 /// Bottom-line is we do not yet have good answers for the above but aim at
211 /// making it easy to answer such questions.
212 ///
213 /// Back to our listing, the last places where early super-vectorization makes
214 /// sense are:
215 /// 3. right after polyhedral-style scheduling: PLUTO-style algorithms are known
216 ///    to improve locality, parallelism and be configurable (e.g. max-fuse,
217 ///    smart-fuse etc). They can also have adverse effects on contiguity
218 ///    properties that are required for vectorization but the vector.transfer
219 ///    copy-reshape-pad-transpose abstraction is expected to help recapture
220 ///    these properties.
221 /// 4. right after polyhedral-style scheduling+tiling;
222 /// 5. right after scheduling+tiling+rescheduling: points 4 and 5 represent
223 ///    probably the most promising places because applying tiling achieves a
224 ///    separation of concerns that allows rescheduling to worry less about
225 ///    locality and more about parallelism and distribution (e.g. min-fuse).
226 ///
227 /// At these levels the risk-reward looks different: on one hand we probably
228 /// lost a good deal of language/user/library-level annotation; on the other
229 /// hand we gained parallelism and locality through scheduling and tiling.
230 /// However we probably want to ensure tiling is compatible with the
231 /// full-tile-only abstraction used in super-vectorization or suffer the
232 /// consequences. It is too early to place bets on what will win but we expect
233 /// super-vectorization to be the right abstraction to allow exploring at all
234 /// these levels. And again, search is our friend.
235 ///
236 /// Lastly, we mention it again here:
237 /// 6. as a MLIR-based alternative to VPLAN.
238 ///
239 /// Lowering, unrolling, pipelining:
240 /// ================================
241 /// TODO: point to the proper places.
242 ///
243 /// Algorithm:
244 /// ==========
245 /// The algorithm proceeds in a few steps:
246 ///  1. defining super-vectorization patterns and matching them on the tree of
247 ///     AffineForOp. A super-vectorization pattern is defined as a recursive
248 ///     data structures that matches and captures nested, imperfectly-nested
249 ///     loops that have a. conformable loop annotations attached (e.g. parallel,
250 ///     reduction, vectorizable, ...) as well as b. all contiguous load/store
251 ///     operations along a specified minor dimension (not necessarily the
252 ///     fastest varying) ;
253 ///  2. analyzing those patterns for profitability (TODO: and
254 ///     interference);
255 ///  3. Then, for each pattern in order:
256 ///    a. applying iterative rewriting of the loop and the load operations in
257 ///       inner-to-outer order. Rewriting is implemented by coarsening the loops
258 ///       and turning load operations into opaque vector.transfer_read ops;
259 ///    b. keeping track of the load operations encountered as "roots" and the
260 ///       store operations as "terminals";
261 ///    c. traversing the use-def chains starting from the roots and iteratively
262 ///       propagating vectorized values. Scalar values that are encountered
263 ///       during this process must come from outside the scope of the current
264 ///       pattern (TODO: enforce this and generalize). Such a scalar value
265 ///       is vectorized only if it is a constant (into a vector splat). The
266 ///       non-constant case is not supported for now and results in the pattern
267 ///       failing to vectorize;
268 ///    d. performing a second traversal on the terminals (store ops) to
269 ///       rewriting the scalar value they write to memory into vector form.
270 ///       If the scalar value has been vectorized previously, we simply replace
271 ///       it by its vector form. Otherwise, if the scalar value is a constant,
272 ///       it is vectorized into a splat. In all other cases, vectorization for
273 ///       the pattern currently fails.
274 ///    e. if everything under the root AffineForOp in the current pattern
275 ///       vectorizes properly, we commit that loop to the IR. Otherwise we
276 ///       discard it and restore a previously cloned version of the loop. Thanks
277 ///       to the recursive scoping nature of matchers and captured patterns,
278 ///       this is transparently achieved by a simple RAII implementation.
279 ///    f. vectorization is applied on the next pattern in the list. Because
280 ///       pattern interference avoidance is not yet implemented and that we do
281 ///       not support further vectorizing an already vector load we need to
282 ///       re-verify that the pattern is still vectorizable. This is expected to
283 ///       make cost models more difficult to write and is subject to improvement
284 ///       in the future.
285 ///
286 /// Points c. and d. above are worth additional comment. In most passes that
287 /// do not change the type of operands, it is usually preferred to eagerly
288 /// `replaceAllUsesWith`. Unfortunately this does not work for vectorization
289 /// because during the use-def chain traversal, all the operands of an operation
290 /// must be available in vector form. Trying to propagate eagerly makes the IR
291 /// temporarily invalid and results in errors such as:
292 ///   `vectorize.mlir:308:13: error: 'addf' op requires the same type for all
293 ///   operands and results
294 ///      %s5 = addf %a5, %b5 : f32`
295 ///
296 /// Lastly, we show a minimal example for which use-def chains rooted in load /
297 /// vector.transfer_read are not enough. This is what motivated splitting
298 /// terminal processing out of the use-def chains starting from loads. In the
299 /// following snippet, there is simply no load::
300 /// ```mlir
301 /// func @fill(%A : memref<128xf32>) -> () {
302 ///   %f1 = constant 1.0 : f32
303 ///   affine.for %i0 = 0 to 32 {
304 ///     affine.store %f1, %A[%i0] : memref<128xf32, 0>
305 ///   }
306 ///   return
307 /// }
308 /// ```
309 ///
310 /// Choice of loop transformation to support the algorithm:
311 /// =======================================================
312 /// The choice of loop transformation to apply for coarsening vectorized loops
313 /// is still subject to exploratory tradeoffs. In particular, say we want to
314 /// vectorize by a factor 128, we want to transform the following input:
315 /// ```mlir
316 ///   affine.for %i = %M to %N {
317 ///     %a = affine.load %A[%i] : memref<?xf32>
318 ///   }
319 /// ```
320 ///
321 /// Traditionally, one would vectorize late (after scheduling, tiling,
322 /// memory promotion etc) say after stripmining (and potentially unrolling in
323 /// the case of LLVM's SLP vectorizer):
324 /// ```mlir
325 ///   affine.for %i = floor(%M, 128) to ceil(%N, 128) {
326 ///     affine.for %ii = max(%M, 128 * %i) to min(%N, 128*%i + 127) {
327 ///       %a = affine.load %A[%ii] : memref<?xf32>
328 ///     }
329 ///   }
330 /// ```
331 ///
332 /// Instead, we seek to vectorize early and freeze vector types before
333 /// scheduling, so we want to generate a pattern that resembles:
334 /// ```mlir
335 ///   affine.for %i = ? to ? step ? {
336 ///     %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>
337 ///   }
338 /// ```
339 ///
340 /// i. simply dividing the lower / upper bounds by 128 creates issues
341 ///    when representing expressions such as ii + 1 because now we only
342 ///    have access to original values that have been divided. Additional
343 ///    information is needed to specify accesses at below-128 granularity;
344 /// ii. another alternative is to coarsen the loop step but this may have
345 ///    consequences on dependence analysis and fusability of loops: fusable
346 ///    loops probably need to have the same step (because we don't want to
347 ///    stripmine/unroll to enable fusion).
348 /// As a consequence, we choose to represent the coarsening using the loop
349 /// step for now and reevaluate in the future. Note that we can renormalize
350 /// loop steps later if/when we have evidence that they are problematic.
351 ///
352 /// For the simple strawman example above, vectorizing for a 1-D vector
353 /// abstraction of size 128 returns code similar to:
354 /// ```mlir
355 ///   affine.for %i = %M to %N step 128 {
356 ///     %v_a = vector.transfer_read %A[%i] : memref<?xf32>, vector<128xf32>
357 ///   }
358 /// ```
359 ///
360 /// Unsupported cases, extensions, and work in progress (help welcome :-) ):
361 /// ========================================================================
362 ///   1. lowering to concrete vector types for various HW;
363 ///   2. reduction support;
364 ///   3. non-effecting padding during vector.transfer_read and filter during
365 ///      vector.transfer_write;
366 ///   4. misalignment support vector.transfer_read / vector.transfer_write
367 ///      (hopefully without read-modify-writes);
368 ///   5. control-flow support;
369 ///   6. cost-models, heuristics and search;
370 ///   7. Op implementation, extensions and implication on memref views;
371 ///   8. many TODOs left around.
372 ///
373 /// Examples:
374 /// =========
375 /// Consider the following Function:
376 /// ```mlir
377 /// func @vector_add_2d(%M : index, %N : index) -> f32 {
378 ///   %A = alloc (%M, %N) : memref<?x?xf32, 0>
379 ///   %B = alloc (%M, %N) : memref<?x?xf32, 0>
380 ///   %C = alloc (%M, %N) : memref<?x?xf32, 0>
381 ///   %f1 = constant 1.0 : f32
382 ///   %f2 = constant 2.0 : f32
383 ///   affine.for %i0 = 0 to %M {
384 ///     affine.for %i1 = 0 to %N {
385 ///       // non-scoped %f1
386 ///       affine.store %f1, %A[%i0, %i1] : memref<?x?xf32, 0>
387 ///     }
388 ///   }
389 ///   affine.for %i2 = 0 to %M {
390 ///     affine.for %i3 = 0 to %N {
391 ///       // non-scoped %f2
392 ///       affine.store %f2, %B[%i2, %i3] : memref<?x?xf32, 0>
393 ///     }
394 ///   }
395 ///   affine.for %i4 = 0 to %M {
396 ///     affine.for %i5 = 0 to %N {
397 ///       %a5 = affine.load %A[%i4, %i5] : memref<?x?xf32, 0>
398 ///       %b5 = affine.load %B[%i4, %i5] : memref<?x?xf32, 0>
399 ///       %s5 = addf %a5, %b5 : f32
400 ///       // non-scoped %f1
401 ///       %s6 = addf %s5, %f1 : f32
402 ///       // non-scoped %f2
403 ///       %s7 = addf %s5, %f2 : f32
404 ///       // diamond dependency.
405 ///       %s8 = addf %s7, %s6 : f32
406 ///       affine.store %s8, %C[%i4, %i5] : memref<?x?xf32, 0>
407 ///     }
408 ///   }
409 ///   %c7 = constant 7 : index
410 ///   %c42 = constant 42 : index
411 ///   %res = load %C[%c7, %c42] : memref<?x?xf32, 0>
412 ///   return %res : f32
413 /// }
414 /// ```
415 ///
416 /// The -affine-vectorize pass with the following arguments:
417 /// ```
418 /// -affine-vectorize="virtual-vector-size=256 test-fastest-varying=0"
419 /// ```
420 ///
421 /// produces this standard innermost-loop vectorized code:
422 /// ```mlir
423 /// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 {
424 ///   %0 = alloc(%arg0, %arg1) : memref<?x?xf32>
425 ///   %1 = alloc(%arg0, %arg1) : memref<?x?xf32>
426 ///   %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
427 ///   %cst = constant 1.0 : f32
428 ///   %cst_0 = constant 2.0 : f32
429 ///   affine.for %i0 = 0 to %arg0 {
430 ///     affine.for %i1 = 0 to %arg1 step 256 {
431 ///       %cst_1 = constant dense<vector<256xf32>, 1.0> :
432 ///                vector<256xf32>
433 ///       vector.transfer_write %cst_1, %0[%i0, %i1] :
434 ///                vector<256xf32>, memref<?x?xf32>
435 ///     }
436 ///   }
437 ///   affine.for %i2 = 0 to %arg0 {
438 ///     affine.for %i3 = 0 to %arg1 step 256 {
439 ///       %cst_2 = constant dense<vector<256xf32>, 2.0> :
440 ///                vector<256xf32>
441 ///       vector.transfer_write %cst_2, %1[%i2, %i3] :
442 ///                vector<256xf32>, memref<?x?xf32>
443 ///     }
444 ///   }
445 ///   affine.for %i4 = 0 to %arg0 {
446 ///     affine.for %i5 = 0 to %arg1 step 256 {
447 ///       %3 = vector.transfer_read %0[%i4, %i5] :
448 ///            memref<?x?xf32>, vector<256xf32>
449 ///       %4 = vector.transfer_read %1[%i4, %i5] :
450 ///            memref<?x?xf32>, vector<256xf32>
451 ///       %5 = addf %3, %4 : vector<256xf32>
452 ///       %cst_3 = constant dense<vector<256xf32>, 1.0> :
453 ///                vector<256xf32>
454 ///       %6 = addf %5, %cst_3 : vector<256xf32>
455 ///       %cst_4 = constant dense<vector<256xf32>, 2.0> :
456 ///                vector<256xf32>
457 ///       %7 = addf %5, %cst_4 : vector<256xf32>
458 ///       %8 = addf %7, %6 : vector<256xf32>
459 ///       vector.transfer_write %8, %2[%i4, %i5] :
460 ///                vector<256xf32>, memref<?x?xf32>
461 ///     }
462 ///   }
463 ///   %c7 = constant 7 : index
464 ///   %c42 = constant 42 : index
465 ///   %9 = load %2[%c7, %c42] : memref<?x?xf32>
466 ///   return %9 : f32
467 /// }
468 /// ```
469 ///
470 /// The -affine-vectorize pass with the following arguments:
471 /// ```
472 /// -affine-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0"
473 /// ```
474 ///
475 /// produces this more interesting mixed outer-innermost-loop vectorized code:
476 /// ```mlir
477 /// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 {
478 ///   %0 = alloc(%arg0, %arg1) : memref<?x?xf32>
479 ///   %1 = alloc(%arg0, %arg1) : memref<?x?xf32>
480 ///   %2 = alloc(%arg0, %arg1) : memref<?x?xf32>
481 ///   %cst = constant 1.0 : f32
482 ///   %cst_0 = constant 2.0 : f32
483 ///   affine.for %i0 = 0 to %arg0 step 32 {
484 ///     affine.for %i1 = 0 to %arg1 step 256 {
485 ///       %cst_1 = constant dense<vector<32x256xf32>, 1.0> :
486 ///                vector<32x256xf32>
487 ///       vector.transfer_write %cst_1, %0[%i0, %i1] :
488 ///                vector<32x256xf32>, memref<?x?xf32>
489 ///     }
490 ///   }
491 ///   affine.for %i2 = 0 to %arg0 step 32 {
492 ///     affine.for %i3 = 0 to %arg1 step 256 {
493 ///       %cst_2 = constant dense<vector<32x256xf32>, 2.0> :
494 ///                vector<32x256xf32>
495 ///       vector.transfer_write %cst_2, %1[%i2, %i3] :
496 ///                vector<32x256xf32>, memref<?x?xf32>
497 ///     }
498 ///   }
499 ///   affine.for %i4 = 0 to %arg0 step 32 {
500 ///     affine.for %i5 = 0 to %arg1 step 256 {
501 ///       %3 = vector.transfer_read %0[%i4, %i5] :
502 ///                memref<?x?xf32> vector<32x256xf32>
503 ///       %4 = vector.transfer_read %1[%i4, %i5] :
504 ///                memref<?x?xf32>, vector<32x256xf32>
505 ///       %5 = addf %3, %4 : vector<32x256xf32>
506 ///       %cst_3 = constant dense<vector<32x256xf32>, 1.0> :
507 ///                vector<32x256xf32>
508 ///       %6 = addf %5, %cst_3 : vector<32x256xf32>
509 ///       %cst_4 = constant dense<vector<32x256xf32>, 2.0> :
510 ///                vector<32x256xf32>
511 ///       %7 = addf %5, %cst_4 : vector<32x256xf32>
512 ///       %8 = addf %7, %6 : vector<32x256xf32>
513 ///       vector.transfer_write %8, %2[%i4, %i5] :
514 ///                vector<32x256xf32>, memref<?x?xf32>
515 ///     }
516 ///   }
517 ///   %c7 = constant 7 : index
518 ///   %c42 = constant 42 : index
519 ///   %9 = load %2[%c7, %c42] : memref<?x?xf32>
520 ///   return %9 : f32
521 /// }
522 /// ```
523 ///
524 /// Of course, much more intricate n-D imperfectly-nested patterns can be
525 /// vectorized too and specified in a fully declarative fashion.
526 
527 #define DEBUG_TYPE "early-vect"
528 
529 using llvm::dbgs;
530 using llvm::SetVector;
531 
532 /// Forward declaration.
533 static FilterFunctionType
534 isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,
535                              int fastestVaryingMemRefDimension);
536 
537 /// Creates a vectorization pattern from the command line arguments.
538 /// Up to 3-D patterns are supported.
539 /// If the command line argument requests a pattern of higher order, returns an
540 /// empty pattern list which will conservatively result in no vectorization.
541 static std::vector<NestedPattern>
makePatterns(const DenseSet<Operation * > & parallelLoops,int vectorRank,ArrayRef<int64_t> fastestVaryingPattern)542 makePatterns(const DenseSet<Operation *> &parallelLoops, int vectorRank,
543              ArrayRef<int64_t> fastestVaryingPattern) {
544   using matcher::For;
545   int64_t d0 = fastestVaryingPattern.empty() ? -1 : fastestVaryingPattern[0];
546   int64_t d1 = fastestVaryingPattern.size() < 2 ? -1 : fastestVaryingPattern[1];
547   int64_t d2 = fastestVaryingPattern.size() < 3 ? -1 : fastestVaryingPattern[2];
548   switch (vectorRank) {
549   case 1:
550     return {For(isVectorizableLoopPtrFactory(parallelLoops, d0))};
551   case 2:
552     return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
553                 For(isVectorizableLoopPtrFactory(parallelLoops, d1)))};
554   case 3:
555     return {For(isVectorizableLoopPtrFactory(parallelLoops, d0),
556                 For(isVectorizableLoopPtrFactory(parallelLoops, d1),
557                     For(isVectorizableLoopPtrFactory(parallelLoops, d2))))};
558   default: {
559     return std::vector<NestedPattern>();
560   }
561   }
562 }
563 
vectorTransferPattern()564 static NestedPattern &vectorTransferPattern() {
565   static auto pattern = matcher::Op([](Operation &op) {
566     return isa<vector::TransferReadOp, vector::TransferWriteOp>(op);
567   });
568   return pattern;
569 }
570 
571 namespace {
572 
573 /// Base state for the vectorize pass.
574 /// Command line arguments are preempted by non-empty pass arguments.
575 struct Vectorize : public AffineVectorizeBase<Vectorize> {
576   Vectorize() = default;
577   Vectorize(ArrayRef<int64_t> virtualVectorSize);
578   void runOnFunction() override;
579 };
580 
581 } // end anonymous namespace
582 
Vectorize(ArrayRef<int64_t> virtualVectorSize)583 Vectorize::Vectorize(ArrayRef<int64_t> virtualVectorSize) {
584   vectorSizes = virtualVectorSize;
585 }
586 
vectorizeLoopIfProfitable(Operation * loop,unsigned depthInPattern,unsigned patternDepth,VectorizationStrategy * strategy)587 static void vectorizeLoopIfProfitable(Operation *loop, unsigned depthInPattern,
588                                       unsigned patternDepth,
589                                       VectorizationStrategy *strategy) {
590   assert(patternDepth > depthInPattern &&
591          "patternDepth is greater than depthInPattern");
592   if (patternDepth - depthInPattern > strategy->vectorSizes.size()) {
593     // Don't vectorize this loop
594     return;
595   }
596   strategy->loopToVectorDim[loop] =
597       strategy->vectorSizes.size() - (patternDepth - depthInPattern);
598 }
599 
600 /// Implements a simple strawman strategy for vectorization.
601 /// Given a matched pattern `matches` of depth `patternDepth`, this strategy
602 /// greedily assigns the fastest varying dimension ** of the vector ** to the
603 /// innermost loop in the pattern.
604 /// When coupled with a pattern that looks for the fastest varying dimension in
605 /// load/store MemRefs, this creates a generic vectorization strategy that works
606 /// for any loop in a hierarchy (outermost, innermost or intermediate).
607 ///
608 /// TODO: In the future we should additionally increase the power of the
609 /// profitability analysis along 3 directions:
610 ///   1. account for loop extents (both static and parametric + annotations);
611 ///   2. account for data layout permutations;
612 ///   3. account for impact of vectorization on maximal loop fusion.
613 /// Then we can quantify the above to build a cost model and search over
614 /// strategies.
analyzeProfitability(ArrayRef<NestedMatch> matches,unsigned depthInPattern,unsigned patternDepth,VectorizationStrategy * strategy)615 static LogicalResult analyzeProfitability(ArrayRef<NestedMatch> matches,
616                                           unsigned depthInPattern,
617                                           unsigned patternDepth,
618                                           VectorizationStrategy *strategy) {
619   for (auto m : matches) {
620     if (failed(analyzeProfitability(m.getMatchedChildren(), depthInPattern + 1,
621                                     patternDepth, strategy))) {
622       return failure();
623     }
624     vectorizeLoopIfProfitable(m.getMatchedOperation(), depthInPattern,
625                               patternDepth, strategy);
626   }
627   return success();
628 }
629 
630 ///// end TODO: Hoist to a VectorizationStrategy.cpp when appropriate /////
631 
632 namespace {
633 
634 struct VectorizationState {
635   /// Adds an entry of pre/post vectorization operations in the state.
636   void registerReplacement(Operation *key, Operation *value);
637   /// When the current vectorization pattern is successful, this erases the
638   /// operations that were marked for erasure in the proper order and resets
639   /// the internal state for the next pattern.
640   void finishVectorizationPattern();
641 
642   // In-order tracking of original Operation that have been vectorized.
643   // Erase in reverse order.
644   SmallVector<Operation *, 16> toErase;
645   // Set of Operation that have been vectorized (the values in the
646   // vectorizationMap for hashed access). The vectorizedSet is used in
647   // particular to filter the operations that have already been vectorized by
648   // this pattern, when iterating over nested loops in this pattern.
649   DenseSet<Operation *> vectorizedSet;
650   // Map of old scalar Operation to new vectorized Operation.
651   DenseMap<Operation *, Operation *> vectorizationMap;
652   // Map of old scalar Value to new vectorized Value.
653   DenseMap<Value, Value> replacementMap;
654   // The strategy drives which loop to vectorize by which amount.
655   const VectorizationStrategy *strategy;
656   // Use-def roots. These represent the starting points for the worklist in the
657   // vectorizeNonTerminals function. They consist of the subset of load
658   // operations that have been vectorized. They can be retrieved from
659   // `vectorizationMap` but it is convenient to keep track of them in a separate
660   // data structure.
661   DenseSet<Operation *> roots;
662   // Terminal operations for the worklist in the vectorizeNonTerminals
663   // function. They consist of the subset of store operations that have been
664   // vectorized. They can be retrieved from `vectorizationMap` but it is
665   // convenient to keep track of them in a separate data structure. Since they
666   // do not necessarily belong to use-def chains starting from loads (e.g
667   // storing a constant), we need to handle them in a post-pass.
668   DenseSet<Operation *> terminals;
669   // Checks that the type of `op` is AffineStoreOp and adds it to the terminals
670   // set.
671   void registerTerminal(Operation *op);
672   // Folder used to factor out constant creation.
673   OperationFolder *folder;
674 
675 private:
676   void registerReplacement(Value key, Value value);
677 };
678 
679 } // end namespace
680 
registerReplacement(Operation * key,Operation * value)681 void VectorizationState::registerReplacement(Operation *key, Operation *value) {
682   LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ commit vectorized op: ");
683   LLVM_DEBUG(key->print(dbgs()));
684   LLVM_DEBUG(dbgs() << "  into  ");
685   LLVM_DEBUG(value->print(dbgs()));
686   assert(key->getNumResults() == 1 && "already registered");
687   assert(value->getNumResults() == 1 && "already registered");
688   assert(vectorizedSet.count(value) == 0 && "already registered");
689   assert(vectorizationMap.count(key) == 0 && "already registered");
690   toErase.push_back(key);
691   vectorizedSet.insert(value);
692   vectorizationMap.insert(std::make_pair(key, value));
693   registerReplacement(key->getResult(0), value->getResult(0));
694   if (isa<AffineLoadOp>(key)) {
695     assert(roots.count(key) == 0 && "root was already inserted previously");
696     roots.insert(key);
697   }
698 }
699 
registerTerminal(Operation * op)700 void VectorizationState::registerTerminal(Operation *op) {
701   assert(isa<AffineStoreOp>(op) && "terminal must be a AffineStoreOp");
702   assert(terminals.count(op) == 0 &&
703          "terminal was already inserted previously");
704   terminals.insert(op);
705 }
706 
finishVectorizationPattern()707 void VectorizationState::finishVectorizationPattern() {
708   while (!toErase.empty()) {
709     auto *op = toErase.pop_back_val();
710     LLVM_DEBUG(dbgs() << "\n[early-vect] finishVectorizationPattern erase: ");
711     LLVM_DEBUG(op->print(dbgs()));
712     op->erase();
713   }
714 }
715 
registerReplacement(Value key,Value value)716 void VectorizationState::registerReplacement(Value key, Value value) {
717   assert(replacementMap.count(key) == 0 && "replacement already registered");
718   replacementMap.insert(std::make_pair(key, value));
719 }
720 
721 // Apply 'map' with 'mapOperands' returning resulting values in 'results'.
computeMemoryOpIndices(Operation * op,AffineMap map,ValueRange mapOperands,SmallVectorImpl<Value> & results)722 static void computeMemoryOpIndices(Operation *op, AffineMap map,
723                                    ValueRange mapOperands,
724                                    SmallVectorImpl<Value> &results) {
725   OpBuilder builder(op);
726   for (auto resultExpr : map.getResults()) {
727     auto singleResMap =
728         AffineMap::get(map.getNumDims(), map.getNumSymbols(), resultExpr);
729     auto afOp =
730         builder.create<AffineApplyOp>(op->getLoc(), singleResMap, mapOperands);
731     results.push_back(afOp);
732   }
733 }
734 
735 ////// TODO: Hoist to a VectorizationMaterialize.cpp when appropriate. ////
736 
737 /// Handles the vectorization of load and store MLIR operations.
738 ///
739 /// AffineLoadOp operations are the roots of the vectorizeNonTerminals call.
740 /// They are vectorized immediately. The resulting vector.transfer_read is
741 /// immediately registered to replace all uses of the AffineLoadOp in this
742 /// pattern's scope.
743 ///
744 /// AffineStoreOp are the terminals of the vectorizeNonTerminals call. They
745 /// need to be vectorized late once all the use-def chains have been traversed.
746 /// Additionally, they may have ssa-values operands which come from outside the
747 /// scope of the current pattern.
748 /// Such special cases force us to delay the vectorization of the stores until
749 /// the last step. Here we merely register the store operation.
750 template <typename LoadOrStoreOpPointer>
vectorizeRootOrTerminal(Value iv,LoadOrStoreOpPointer memoryOp,VectorizationState * state)751 static LogicalResult vectorizeRootOrTerminal(Value iv,
752                                              LoadOrStoreOpPointer memoryOp,
753                                              VectorizationState *state) {
754   auto memRefType = memoryOp.getMemRef().getType().template cast<MemRefType>();
755 
756   auto elementType = memRefType.getElementType();
757   // TODO: ponder whether we want to further vectorize a vector value.
758   assert(VectorType::isValidElementType(elementType) &&
759          "Not a valid vector element type");
760   auto vectorType = VectorType::get(state->strategy->vectorSizes, elementType);
761 
762   // Materialize a MemRef with 1 vector.
763   auto *opInst = memoryOp.getOperation();
764   // For now, vector.transfers must be aligned, operate only on indices with an
765   // identity subset of AffineMap and do not change layout.
766   // TODO: increase the expressiveness power of vector.transfer operations
767   // as needed by various targets.
768   if (auto load = dyn_cast<AffineLoadOp>(opInst)) {
769     OpBuilder b(opInst);
770     ValueRange mapOperands = load.getMapOperands();
771     SmallVector<Value, 8> indices;
772     indices.reserve(load.getMemRefType().getRank());
773     if (load.getAffineMap() !=
774         b.getMultiDimIdentityMap(load.getMemRefType().getRank())) {
775       computeMemoryOpIndices(opInst, load.getAffineMap(), mapOperands, indices);
776     } else {
777       indices.append(mapOperands.begin(), mapOperands.end());
778     }
779     auto permutationMap =
780         makePermutationMap(opInst, indices, state->strategy->loopToVectorDim);
781     if (!permutationMap)
782       return LogicalResult::Failure;
783     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
784     LLVM_DEBUG(permutationMap.print(dbgs()));
785     auto transfer = b.create<vector::TransferReadOp>(
786         opInst->getLoc(), vectorType, memoryOp.getMemRef(), indices,
787         permutationMap);
788     state->registerReplacement(opInst, transfer.getOperation());
789   } else {
790     state->registerTerminal(opInst);
791   }
792   return success();
793 }
794 /// end TODO: Hoist to a VectorizationMaterialize.cpp when appropriate. ///
795 
796 /// Coarsens the loops bounds and transforms all remaining load and store
797 /// operations into the appropriate vector.transfer.
vectorizeAffineForOp(AffineForOp loop,int64_t step,VectorizationState * state)798 static LogicalResult vectorizeAffineForOp(AffineForOp loop, int64_t step,
799                                           VectorizationState *state) {
800   loop.setStep(step);
801 
802   FilterFunctionType notVectorizedThisPattern = [state](Operation &op) {
803     if (!matcher::isLoadOrStore(op)) {
804       return false;
805     }
806     return state->vectorizationMap.count(&op) == 0 &&
807            state->vectorizedSet.count(&op) == 0 &&
808            state->roots.count(&op) == 0 && state->terminals.count(&op) == 0;
809   };
810   auto loadAndStores = matcher::Op(notVectorizedThisPattern);
811   SmallVector<NestedMatch, 8> loadAndStoresMatches;
812   loadAndStores.match(loop.getOperation(), &loadAndStoresMatches);
813   for (auto ls : loadAndStoresMatches) {
814     auto *opInst = ls.getMatchedOperation();
815     auto load = dyn_cast<AffineLoadOp>(opInst);
816     auto store = dyn_cast<AffineStoreOp>(opInst);
817     LLVM_DEBUG(opInst->print(dbgs()));
818     LogicalResult result =
819         load ? vectorizeRootOrTerminal(loop.getInductionVar(), load, state)
820              : vectorizeRootOrTerminal(loop.getInductionVar(), store, state);
821     if (failed(result)) {
822       return failure();
823     }
824   }
825   return success();
826 }
827 
828 /// Returns a FilterFunctionType that can be used in NestedPattern to match a
829 /// loop whose underlying load/store accesses are either invariant or all
830 // varying along the `fastestVaryingMemRefDimension`.
831 static FilterFunctionType
isVectorizableLoopPtrFactory(const DenseSet<Operation * > & parallelLoops,int fastestVaryingMemRefDimension)832 isVectorizableLoopPtrFactory(const DenseSet<Operation *> &parallelLoops,
833                              int fastestVaryingMemRefDimension) {
834   return [&parallelLoops, fastestVaryingMemRefDimension](Operation &forOp) {
835     auto loop = cast<AffineForOp>(forOp);
836     auto parallelIt = parallelLoops.find(loop);
837     if (parallelIt == parallelLoops.end())
838       return false;
839     int memRefDim = -1;
840     auto vectorizableBody =
841         isVectorizableLoopBody(loop, &memRefDim, vectorTransferPattern());
842     if (!vectorizableBody)
843       return false;
844     return memRefDim == -1 || fastestVaryingMemRefDimension == -1 ||
845            memRefDim == fastestVaryingMemRefDimension;
846   };
847 }
848 
849 /// Apply vectorization of `loop` according to `state`. `loops` are processed in
850 /// inner-to-outer order to ensure that all the children loops have already been
851 /// vectorized before vectorizing the parent loop.
852 static LogicalResult
vectorizeLoopsAndLoads(std::vector<SmallVector<AffineForOp,2>> & loops,VectorizationState * state)853 vectorizeLoopsAndLoads(std::vector<SmallVector<AffineForOp, 2>> &loops,
854                        VectorizationState *state) {
855   // Vectorize loops in inner-to-outer order. If any children fails, the parent
856   // will fail too.
857   for (auto &loopsInLevel : llvm::reverse(loops)) {
858     for (AffineForOp loop : loopsInLevel) {
859       // 1. This loop may have been omitted from vectorization for various
860       // reasons (e.g. due to the performance model or pattern depth > vector
861       // size).
862       auto it = state->strategy->loopToVectorDim.find(loop.getOperation());
863       if (it == state->strategy->loopToVectorDim.end())
864         continue;
865 
866       // 2. Actual inner-to-outer transformation.
867       auto vectorDim = it->second;
868       assert(vectorDim < state->strategy->vectorSizes.size() &&
869              "vector dim overflow");
870       //   a. get actual vector size
871       auto vectorSize = state->strategy->vectorSizes[vectorDim];
872       //   b. loop transformation for early vectorization is still subject to
873       //     exploratory tradeoffs (see top of the file). Apply coarsening,
874       //     i.e.:
875       //        | ub -> ub
876       //        | step -> step * vectorSize
877       LLVM_DEBUG(dbgs() << "\n[early-vect] vectorizeForOp by " << vectorSize
878                         << " : \n"
879                         << loop);
880       if (failed(
881               vectorizeAffineForOp(loop, loop.getStep() * vectorSize, state)))
882         return failure();
883     } // end for.
884   }
885 
886   return success();
887 }
888 
889 /// Tries to transform a scalar constant into a vector splat of that constant.
890 /// Returns the vectorized splat operation if the constant is a valid vector
891 /// element type.
892 /// If `type` is not a valid vector type or if the scalar constant is not a
893 /// valid vector element type, returns nullptr.
vectorizeConstant(Operation * op,ConstantOp constant,Type type)894 static Value vectorizeConstant(Operation *op, ConstantOp constant, Type type) {
895   if (!type || !type.isa<VectorType>() ||
896       !VectorType::isValidElementType(constant.getType())) {
897     return nullptr;
898   }
899   OpBuilder b(op);
900   Location loc = op->getLoc();
901   auto vectorType = type.cast<VectorType>();
902   auto attr = DenseElementsAttr::get(vectorType, constant.getValue());
903   auto *constantOpInst = constant.getOperation();
904 
905   OperationState state(loc, constantOpInst->getName().getStringRef(), {},
906                        {vectorType}, {b.getNamedAttr("value", attr)});
907 
908   return b.createOperation(state)->getResult(0);
909 }
910 
911 /// Returns the vector type resulting from applying the provided vectorization
912 /// strategy on the scalar type.
getVectorType(Type scalarTy,const VectorizationStrategy * strategy)913 static VectorType getVectorType(Type scalarTy,
914                                 const VectorizationStrategy *strategy) {
915   assert(!scalarTy.isa<VectorType>() && "Expected scalar type");
916   return VectorType::get(strategy->vectorSizes, scalarTy);
917 }
918 
919 /// Returns true if the provided value is vector uniform given the vectorization
920 /// strategy.
921 // TODO: For now, only values that are invariants to all the loops in the
922 // vectorization strategy are considered vector uniforms.
isUniformDefinition(Value value,const VectorizationStrategy * strategy)923 static bool isUniformDefinition(Value value,
924                                 const VectorizationStrategy *strategy) {
925   for (auto loopToDim : strategy->loopToVectorDim) {
926     auto loop = cast<AffineForOp>(loopToDim.first);
927     if (!loop.isDefinedOutsideOfLoop(value))
928       return false;
929   }
930   return true;
931 }
932 
933 /// Generates a broadcast op for the provided uniform value using the
934 /// vectorization strategy in 'state'.
vectorizeUniform(Value value,VectorizationState * state)935 static Value vectorizeUniform(Value value, VectorizationState *state) {
936   OpBuilder builder(value.getContext());
937   builder.setInsertionPointAfterValue(value);
938 
939   auto vectorTy = getVectorType(value.getType(), state->strategy);
940   auto bcast = builder.create<BroadcastOp>(value.getLoc(), vectorTy, value);
941 
942   // Add broadcast to the replacement map to reuse it for other uses.
943   state->replacementMap[value] = bcast;
944   return bcast;
945 }
946 
947 /// Tries to vectorize a given operand `op` of Operation `op` during
948 /// def-chain propagation or during terminal vectorization, by applying the
949 /// following logic:
950 /// 1. if the defining operation is part of the vectorizedSet (i.e. vectorized
951 ///    useby -def propagation), `op` is already in the proper vector form;
952 /// 2. otherwise, the `op` may be in some other vector form that fails to
953 ///    vectorize atm (i.e. broadcasting required), returns nullptr to indicate
954 ///    failure;
955 /// 3. if the `op` is a constant, returns the vectorized form of the constant;
956 /// 4. if the `op` is uniform, returns a vector broadcast of the `op`;
957 /// 5. non-constant scalars are currently non-vectorizable, in particular to
958 ///    guard against vectorizing an index which may be loop-variant and needs
959 ///    special handling.
960 ///
961 /// In particular this logic captures some of the use cases where definitions
962 /// that are not scoped under the current pattern are needed to vectorize.
963 /// One such example is top level function constants that need to be splatted.
964 ///
965 /// Returns an operand that has been vectorized to match `state`'s strategy if
966 /// vectorization is possible with the above logic. Returns nullptr otherwise.
967 ///
968 /// TODO: handle more complex cases.
vectorizeOperand(Value operand,Operation * op,VectorizationState * state)969 static Value vectorizeOperand(Value operand, Operation *op,
970                               VectorizationState *state) {
971   LLVM_DEBUG(dbgs() << "\n[early-vect]vectorize operand: " << operand);
972   // 1. If this value has already been vectorized this round, we are done.
973   if (state->vectorizedSet.count(operand.getDefiningOp()) > 0) {
974     LLVM_DEBUG(dbgs() << " -> already vector operand");
975     return operand;
976   }
977   // 1.b. Delayed on-demand replacement of a use.
978   //    Note that we cannot just call replaceAllUsesWith because it may result
979   //    in ops with mixed types, for ops whose operands have not all yet
980   //    been vectorized. This would be invalid IR.
981   auto it = state->replacementMap.find(operand);
982   if (it != state->replacementMap.end()) {
983     auto res = it->second;
984     LLVM_DEBUG(dbgs() << "-> delayed replacement by: " << res);
985     return res;
986   }
987   // 2. TODO: broadcast needed.
988   if (operand.getType().isa<VectorType>()) {
989     LLVM_DEBUG(dbgs() << "-> non-vectorizable");
990     return nullptr;
991   }
992   // 3. vectorize constant.
993   if (auto constant = operand.getDefiningOp<ConstantOp>())
994     return vectorizeConstant(op, constant,
995                              getVectorType(operand.getType(), state->strategy));
996 
997   // 4. Uniform values.
998   if (isUniformDefinition(operand, state->strategy))
999     return vectorizeUniform(operand, state);
1000 
1001   // 5. currently non-vectorizable.
1002   LLVM_DEBUG(dbgs() << "-> non-vectorizable: " << operand);
1003   return nullptr;
1004 }
1005 
1006 /// Encodes Operation-specific behavior for vectorization. In general we assume
1007 /// that all operands of an op must be vectorized but this is not always true.
1008 /// In the future, it would be nice to have a trait that describes how a
1009 /// particular operation vectorizes. For now we implement the case distinction
1010 /// here.
1011 /// Returns a vectorized form of an operation or nullptr if vectorization fails.
1012 // TODO: consider adding a trait to Op to describe how it gets vectorized.
1013 // Maybe some Ops are not vectorizable or require some tricky logic, we cannot
1014 // do one-off logic here; ideally it would be TableGen'd.
vectorizeOneOperation(Operation * opInst,VectorizationState * state)1015 static Operation *vectorizeOneOperation(Operation *opInst,
1016                                         VectorizationState *state) {
1017   // Sanity checks.
1018   assert(!isa<AffineLoadOp>(opInst) &&
1019          "all loads must have already been fully vectorized independently");
1020   assert(!isa<vector::TransferReadOp>(opInst) &&
1021          "vector.transfer_read cannot be further vectorized");
1022   assert(!isa<vector::TransferWriteOp>(opInst) &&
1023          "vector.transfer_write cannot be further vectorized");
1024 
1025   if (auto store = dyn_cast<AffineStoreOp>(opInst)) {
1026     OpBuilder b(opInst);
1027     auto memRef = store.getMemRef();
1028     auto value = store.getValueToStore();
1029     auto vectorValue = vectorizeOperand(value, opInst, state);
1030     if (!vectorValue)
1031       return nullptr;
1032 
1033     ValueRange mapOperands = store.getMapOperands();
1034     SmallVector<Value, 8> indices;
1035     indices.reserve(store.getMemRefType().getRank());
1036     if (store.getAffineMap() !=
1037         b.getMultiDimIdentityMap(store.getMemRefType().getRank())) {
1038       computeMemoryOpIndices(opInst, store.getAffineMap(), mapOperands,
1039                              indices);
1040     } else {
1041       indices.append(mapOperands.begin(), mapOperands.end());
1042     }
1043 
1044     auto permutationMap =
1045         makePermutationMap(opInst, indices, state->strategy->loopToVectorDim);
1046     if (!permutationMap)
1047       return nullptr;
1048     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: ");
1049     LLVM_DEBUG(permutationMap.print(dbgs()));
1050     auto transfer = b.create<vector::TransferWriteOp>(
1051         opInst->getLoc(), vectorValue, memRef, indices, permutationMap);
1052     auto *res = transfer.getOperation();
1053     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << *res);
1054     // "Terminals" (i.e. AffineStoreOps) are erased on the spot.
1055     opInst->erase();
1056     return res;
1057   }
1058   if (opInst->getNumRegions() != 0)
1059     return nullptr;
1060 
1061   SmallVector<Type, 8> vectorTypes;
1062   for (auto v : opInst->getResults()) {
1063     vectorTypes.push_back(
1064         VectorType::get(state->strategy->vectorSizes, v.getType()));
1065   }
1066   SmallVector<Value, 8> vectorOperands;
1067   for (auto v : opInst->getOperands()) {
1068     vectorOperands.push_back(vectorizeOperand(v, opInst, state));
1069   }
1070   // Check whether a single operand is null. If so, vectorization failed.
1071   bool success = llvm::all_of(vectorOperands, [](Value op) { return op; });
1072   if (!success) {
1073     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ an operand failed vectorize");
1074     return nullptr;
1075   }
1076 
1077   // Create a clone of the op with the proper operands and return types.
1078   // TODO: The following assumes there is always an op with a fixed
1079   // name that works both in scalar mode and vector mode.
1080   // TODO: Is it worth considering an Operation.clone operation which
1081   // changes the type so we can promote an Operation with less boilerplate?
1082   OpBuilder b(opInst);
1083   OperationState newOp(opInst->getLoc(), opInst->getName().getStringRef(),
1084                        vectorOperands, vectorTypes, opInst->getAttrs(),
1085                        /*successors=*/{}, /*regions=*/{});
1086   return b.createOperation(newOp);
1087 }
1088 
1089 /// Iterates over the forward slice from the loads in the vectorization pattern
1090 /// and rewrites them using their vectorized counterpart by:
1091 ///   1. Create the forward slice starting from the loads in the vectorization
1092 ///   pattern.
1093 ///   2. Topologically sorts the forward slice.
1094 ///   3. For each operation in the slice, create the vector form of this
1095 ///   operation, replacing each operand by a replacement operands retrieved from
1096 ///   replacementMap. If any such replacement is missing, vectorization fails.
vectorizeNonTerminals(VectorizationState * state)1097 static LogicalResult vectorizeNonTerminals(VectorizationState *state) {
1098   // 1. create initial worklist with the uses of the roots.
1099   SetVector<Operation *> worklist;
1100   // Note: state->roots have already been vectorized and must not be vectorized
1101   // again. This fits `getForwardSlice` which does not insert `op` in the
1102   // result.
1103   // Note: we have to exclude terminals because some of their defs may not be
1104   // nested under the vectorization pattern (e.g. constants defined in an
1105   // encompassing scope).
1106   // TODO: Use a backward slice for terminals, avoid special casing and
1107   // merge implementations.
1108   for (auto *op : state->roots) {
1109     getForwardSlice(op, &worklist, [state](Operation *op) {
1110       return state->terminals.count(op) == 0; // propagate if not terminal
1111     });
1112   }
1113   // We merged multiple slices, topological order may not hold anymore.
1114   worklist = topologicalSort(worklist);
1115 
1116   for (unsigned i = 0; i < worklist.size(); ++i) {
1117     auto *op = worklist[i];
1118     LLVM_DEBUG(dbgs() << "\n[early-vect] vectorize use: ");
1119     LLVM_DEBUG(op->print(dbgs()));
1120 
1121     // Create vector form of the operation.
1122     // Insert it just before op, on success register op as replaced.
1123     auto *vectorizedInst = vectorizeOneOperation(op, state);
1124     if (!vectorizedInst) {
1125       return failure();
1126     }
1127 
1128     // 3. Register replacement for future uses in the scope.
1129     //    Note that we cannot just call replaceAllUsesWith because it may
1130     //    result in ops with mixed types, for ops whose operands have not all
1131     //    yet been vectorized. This would be invalid IR.
1132     state->registerReplacement(op, vectorizedInst);
1133   }
1134   return success();
1135 }
1136 
1137 /// Recursive implementation to convert all the nested loops in 'match' to a 2D
1138 /// vector container that preserves the relative nesting level of each loop with
1139 /// respect to the others in 'match'. 'currentLevel' is the nesting level that
1140 /// will be assigned to the loop in the current 'match'.
1141 static void
getMatchedAffineLoopsRec(NestedMatch match,unsigned currentLevel,std::vector<SmallVector<AffineForOp,2>> & loops)1142 getMatchedAffineLoopsRec(NestedMatch match, unsigned currentLevel,
1143                          std::vector<SmallVector<AffineForOp, 2>> &loops) {
1144   // Add a new empty level to the output if it doesn't exist already.
1145   assert(currentLevel <= loops.size() && "Unexpected currentLevel");
1146   if (currentLevel == loops.size())
1147     loops.push_back(SmallVector<AffineForOp, 2>());
1148 
1149   // Add current match and recursively visit its children.
1150   loops[currentLevel].push_back(cast<AffineForOp>(match.getMatchedOperation()));
1151   for (auto childMatch : match.getMatchedChildren()) {
1152     getMatchedAffineLoopsRec(childMatch, currentLevel + 1, loops);
1153   }
1154 }
1155 
1156 /// Converts all the nested loops in 'match' to a 2D vector container that
1157 /// preserves the relative nesting level of each loop with respect to the others
1158 /// in 'match'. This means that every loop in 'loops[i]' will have a parent loop
1159 /// in 'loops[i-1]'. A loop in 'loops[i]' may or may not have a child loop in
1160 /// 'loops[i+1]'.
1161 static void
getMatchedAffineLoops(NestedMatch match,std::vector<SmallVector<AffineForOp,2>> & loops)1162 getMatchedAffineLoops(NestedMatch match,
1163                       std::vector<SmallVector<AffineForOp, 2>> &loops) {
1164   getMatchedAffineLoopsRec(match, /*currLoopDepth=*/0, loops);
1165 }
1166 
1167 /// Internal implementation to vectorize affine loops from a single loop nest
1168 /// using an n-D vectorization strategy.
1169 static LogicalResult
vectorizeLoopNest(std::vector<SmallVector<AffineForOp,2>> & loops,const VectorizationStrategy & strategy)1170 vectorizeLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
1171                   const VectorizationStrategy &strategy) {
1172   assert(loops[0].size() == 1 && "Expected single root loop");
1173   AffineForOp rootLoop = loops[0][0];
1174   OperationFolder folder(rootLoop.getContext());
1175   VectorizationState state;
1176   state.strategy = &strategy;
1177   state.folder = &folder;
1178 
1179   // Since patterns are recursive, they can very well intersect.
1180   // Since we do not want a fully greedy strategy in general, we decouple
1181   // pattern matching, from profitability analysis, from application.
1182   // As a consequence we must check that each root pattern is still
1183   // vectorizable. If a pattern is not vectorizable anymore, we just skip it.
1184   // TODO: implement a non-greedy profitability analysis that keeps only
1185   // non-intersecting patterns.
1186   if (!isVectorizableLoopBody(rootLoop, vectorTransferPattern())) {
1187     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ loop is not vectorizable");
1188     return failure();
1189   }
1190 
1191   /// Sets up error handling for this root loop. This is how the root match
1192   /// maintains a clone for handling failure and restores the proper state via
1193   /// RAII.
1194   auto *loopInst = rootLoop.getOperation();
1195   OpBuilder builder(loopInst);
1196   auto clonedLoop = cast<AffineForOp>(builder.clone(*loopInst));
1197   struct Guard {
1198     LogicalResult failure() {
1199       loop.getInductionVar().replaceAllUsesWith(clonedLoop.getInductionVar());
1200       loop.erase();
1201       return mlir::failure();
1202     }
1203     LogicalResult success() {
1204       clonedLoop.erase();
1205       return mlir::success();
1206     }
1207     AffineForOp loop;
1208     AffineForOp clonedLoop;
1209   } guard{rootLoop, clonedLoop};
1210 
1211   //////////////////////////////////////////////////////////////////////////////
1212   // Start vectorizing.
1213   // From now on, any error triggers the scope guard above.
1214   //////////////////////////////////////////////////////////////////////////////
1215   // 1. Vectorize all the loop candidates, in inner-to-outer order.
1216   // This also vectorizes the roots (AffineLoadOp) as well as registers the
1217   // terminals (AffineStoreOp) for post-processing vectorization (we need to
1218   // wait for all use-def chains into them to be vectorized first).
1219   if (failed(vectorizeLoopsAndLoads(loops, &state))) {
1220     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed root vectorizeLoop");
1221     return guard.failure();
1222   }
1223 
1224   // 2. Vectorize operations reached by use-def chains from root except the
1225   // terminals (store operations) that need to be post-processed separately.
1226   // TODO: add more as we expand.
1227   if (failed(vectorizeNonTerminals(&state))) {
1228     LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed vectorizeNonTerminals");
1229     return guard.failure();
1230   }
1231 
1232   // 3. Post-process terminals.
1233   // Note: we have to post-process terminals because some of their defs may not
1234   // be nested under the vectorization pattern (e.g. constants defined in an
1235   // encompassing scope).
1236   // TODO: Use a backward slice for terminals, avoid special casing and
1237   // merge implementations.
1238   for (auto *op : state.terminals) {
1239     if (!vectorizeOneOperation(op, &state)) { // nullptr == failure
1240       LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ failed to vectorize terminals");
1241       return guard.failure();
1242     }
1243   }
1244 
1245   // 4. Finish this vectorization pattern.
1246   LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ success vectorizing pattern");
1247   state.finishVectorizationPattern();
1248   return guard.success();
1249 }
1250 
1251 /// Vectorization is a recursive procedure where anything below can fail. The
1252 /// root match thus needs to maintain a clone for handling failure. Each root
1253 /// may succeed independently but will otherwise clean after itself if anything
1254 /// below it fails.
vectorizeRootMatch(NestedMatch m,const VectorizationStrategy & strategy)1255 static LogicalResult vectorizeRootMatch(NestedMatch m,
1256                                         const VectorizationStrategy &strategy) {
1257   std::vector<SmallVector<AffineForOp, 2>> loopsToVectorize;
1258   getMatchedAffineLoops(m, loopsToVectorize);
1259   return vectorizeLoopNest(loopsToVectorize, strategy);
1260 }
1261 
1262 /// Internal implementation to vectorize affine loops in 'loops' using the n-D
1263 /// vectorization factors in 'vectorSizes'. By default, each vectorization
1264 /// factor is applied inner-to-outer to the loops of each loop nest.
1265 /// 'fastestVaryingPattern' can be optionally used to provide a different loop
1266 /// vectorization order.
vectorizeLoops(Operation * parentOp,DenseSet<Operation * > & loops,ArrayRef<int64_t> vectorSizes,ArrayRef<int64_t> fastestVaryingPattern)1267 static void vectorizeLoops(Operation *parentOp, DenseSet<Operation *> &loops,
1268                            ArrayRef<int64_t> vectorSizes,
1269                            ArrayRef<int64_t> fastestVaryingPattern) {
1270   for (auto &pat :
1271        makePatterns(loops, vectorSizes.size(), fastestVaryingPattern)) {
1272     LLVM_DEBUG(dbgs() << "\n******************************************");
1273     LLVM_DEBUG(dbgs() << "\n******************************************");
1274     LLVM_DEBUG(dbgs() << "\n[early-vect] new pattern on parent op\n");
1275     LLVM_DEBUG(parentOp->print(dbgs()));
1276 
1277     unsigned patternDepth = pat.getDepth();
1278 
1279     SmallVector<NestedMatch, 8> matches;
1280     pat.match(parentOp, &matches);
1281     // Iterate over all the top-level matches and vectorize eagerly.
1282     // This automatically prunes intersecting matches.
1283     for (auto m : matches) {
1284       VectorizationStrategy strategy;
1285       // TODO: depending on profitability, elect to reduce the vector size.
1286       strategy.vectorSizes.assign(vectorSizes.begin(), vectorSizes.end());
1287       if (failed(analyzeProfitability(m.getMatchedChildren(), 1, patternDepth,
1288                                       &strategy))) {
1289         continue;
1290       }
1291       vectorizeLoopIfProfitable(m.getMatchedOperation(), 0, patternDepth,
1292                                 &strategy);
1293       // TODO: if pattern does not apply, report it; alter the
1294       // cost/benefit.
1295       vectorizeRootMatch(m, strategy);
1296       // TODO: some diagnostics if failure to vectorize occurs.
1297     }
1298   }
1299   LLVM_DEBUG(dbgs() << "\n");
1300 }
1301 
1302 std::unique_ptr<OperationPass<FuncOp>>
createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize)1303 createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize) {
1304   return std::make_unique<Vectorize>(virtualVectorSize);
1305 }
createSuperVectorizePass()1306 std::unique_ptr<OperationPass<FuncOp>> createSuperVectorizePass() {
1307   return std::make_unique<Vectorize>();
1308 }
1309 
1310 /// Applies vectorization to the current function by searching over a bunch of
1311 /// predetermined patterns.
runOnFunction()1312 void Vectorize::runOnFunction() {
1313   FuncOp f = getFunction();
1314   if (!fastestVaryingPattern.empty() &&
1315       fastestVaryingPattern.size() != vectorSizes.size()) {
1316     f.emitRemark("Fastest varying pattern specified with different size than "
1317                  "the vector size.");
1318     return signalPassFailure();
1319   }
1320 
1321   DenseSet<Operation *> parallelLoops;
1322   f.walk([&parallelLoops](AffineForOp loop) {
1323     if (isLoopParallel(loop))
1324       parallelLoops.insert(loop);
1325   });
1326 
1327   // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
1328   NestedPatternContext mlContext;
1329   vectorizeLoops(f, parallelLoops, vectorSizes, fastestVaryingPattern);
1330 }
1331 
1332 /// Verify that affine loops in 'loops' meet the nesting criteria expected by
1333 /// SuperVectorizer:
1334 ///   * There must be at least one loop.
1335 ///   * There must be a single root loop (nesting level 0).
1336 ///   * Each loop at a given nesting level must be nested in a loop from a
1337 ///     previous nesting level.
1338 static void
verifyLoopNesting(const std::vector<SmallVector<AffineForOp,2>> & loops)1339 verifyLoopNesting(const std::vector<SmallVector<AffineForOp, 2>> &loops) {
1340   assert(!loops.empty() && "Expected at least one loop");
1341   assert(loops[0].size() == 1 && "Expected only one root loop");
1342 
1343   // Traverse loops outer-to-inner to check some invariants.
1344   for (int i = 1, end = loops.size(); i < end; ++i) {
1345     for (AffineForOp loop : loops[i]) {
1346       //  Check that each loop at this level is nested in one of the loops from
1347       //  the previous level.
1348       bool parentFound = false;
1349       for (AffineForOp maybeParent : loops[i - 1]) {
1350         if (maybeParent->isProperAncestor(loop)) {
1351           parentFound = true;
1352           break;
1353         }
1354       }
1355       assert(parentFound && "Child loop not nested in any parent loop");
1356 
1357       //  Check that each loop at this level is not nested in another loop from
1358       //  this level.
1359 #ifndef NDEBUG
1360       for (AffineForOp sibling : loops[i])
1361         assert(!sibling->isProperAncestor(loop) &&
1362                "Loops at the same level are nested");
1363 #endif
1364     }
1365   }
1366 }
1367 
1368 namespace mlir {
1369 
1370 /// External utility to vectorize affine loops in 'loops' using the n-D
1371 /// vectorization factors in 'vectorSizes'. By default, each vectorization
1372 /// factor is applied inner-to-outer to the loops of each loop nest.
1373 /// 'fastestVaryingPattern' can be optionally used to provide a different loop
1374 /// vectorization order.
vectorizeAffineLoops(Operation * parentOp,DenseSet<Operation * > & loops,ArrayRef<int64_t> vectorSizes,ArrayRef<int64_t> fastestVaryingPattern)1375 void vectorizeAffineLoops(Operation *parentOp, DenseSet<Operation *> &loops,
1376                           ArrayRef<int64_t> vectorSizes,
1377                           ArrayRef<int64_t> fastestVaryingPattern) {
1378   // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
1379   NestedPatternContext mlContext;
1380   vectorizeLoops(parentOp, loops, vectorSizes, fastestVaryingPattern);
1381 }
1382 
1383 /// External utility to vectorize affine loops from a single loop nest using an
1384 /// n-D vectorization strategy (see doc in VectorizationStrategy definition).
1385 /// Loops are provided in a 2D vector container. The first dimension represents
1386 /// the nesting level relative to the loops to be vectorized. The second
1387 /// dimension contains the loops. This means that:
1388 ///   a) every loop in 'loops[i]' must have a parent loop in 'loops[i-1]',
1389 ///   b) a loop in 'loops[i]' may or may not have a child loop in 'loops[i+1]'.
1390 ///
1391 /// For example, for the following loop nest:
1392 ///
1393 ///   func @vec2d(%in0: memref<64x128x512xf32>, %in1: memref<64x128x128xf32>,
1394 ///               %out0: memref<64x128x512xf32>,
1395 ///               %out1: memref<64x128x128xf32>) {
1396 ///     affine.for %i0 = 0 to 64 {
1397 ///       affine.for %i1 = 0 to 128 {
1398 ///         affine.for %i2 = 0 to 512 {
1399 ///           %ld = affine.load %in0[%i0, %i1, %i2] : memref<64x128x512xf32>
1400 ///           affine.store %ld, %out0[%i0, %i1, %i2] : memref<64x128x512xf32>
1401 ///         }
1402 ///         affine.for %i3 = 0 to 128 {
1403 ///           %ld = affine.load %in1[%i0, %i1, %i3] : memref<64x128x128xf32>
1404 ///           affine.store %ld, %out1[%i0, %i1, %i3] : memref<64x128x128xf32>
1405 ///         }
1406 ///       }
1407 ///     }
1408 ///     return
1409 ///   }
1410 ///
1411 /// loops = {{%i0}, {%i2, %i3}}, to vectorize the outermost and the two
1412 /// innermost loops;
1413 /// loops = {{%i1}, {%i2, %i3}}, to vectorize the middle and the two innermost
1414 /// loops;
1415 /// loops = {{%i2}}, to vectorize only the first innermost loop;
1416 /// loops = {{%i3}}, to vectorize only the second innermost loop;
1417 /// loops = {{%i1}}, to vectorize only the middle loop.
1418 LogicalResult
vectorizeAffineLoopNest(std::vector<SmallVector<AffineForOp,2>> & loops,const VectorizationStrategy & strategy)1419 vectorizeAffineLoopNest(std::vector<SmallVector<AffineForOp, 2>> &loops,
1420                         const VectorizationStrategy &strategy) {
1421   // Thread-safe RAII local context, BumpPtrAllocator freed on exit.
1422   NestedPatternContext mlContext;
1423   verifyLoopNesting(loops);
1424   return vectorizeLoopNest(loops, strategy);
1425 }
1426 
1427 std::unique_ptr<OperationPass<FuncOp>>
createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize)1428 createSuperVectorizePass(ArrayRef<int64_t> virtualVectorSize) {
1429   return std::make_unique<Vectorize>(virtualVectorSize);
1430 }
createSuperVectorizePass()1431 std::unique_ptr<OperationPass<FuncOp>> createSuperVectorizePass() {
1432   return std::make_unique<Vectorize>();
1433 }
1434 
1435 } // namespace mlir
1436