/*
 * Copyright (C) 2016 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
#define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_

#include "base/scoped_arena_allocator.h"
#include "base/scoped_arena_containers.h"
#include "induction_var_range.h"
#include "loop_analysis.h"
#include "nodes.h"
#include "optimization.h"
#include "superblock_cloner.h"

namespace art {

class CompilerOptions;
class ArchNoOptsLoopHelper;

/**
 * Loop optimizations. Builds a loop hierarchy and applies optimizations to
 * the detected nested loops, such as removal of dead induction, elimination
 * of empty loops, and inner loop vectorization.
 */
class HLoopOptimization : public HOptimization {
 public:
  HLoopOptimization(HGraph* graph,
                    const CodeGenerator& codegen,    // Needs info about the target.
                    HInductionVarAnalysis* induction_analysis,
                    OptimizingCompilerStats* stats,
                    const char* name = kLoopOptimizationPassName);

  bool Run() override;

  static constexpr const char* kLoopOptimizationPassName = "loop_optimization";

 private:
  /**
   * A single loop inside the loop hierarchy representation.
   */
  struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> {
    explicit LoopNode(HLoopInformation* lp_info)
        : loop_info(lp_info),
          outer(nullptr),
          inner(nullptr),
          previous(nullptr),
          next(nullptr) {}
    HLoopInformation* loop_info;
    LoopNode* outer;
    LoopNode* inner;
    LoopNode* previous;
    LoopNode* next;
  };
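  // Illustrative sketch (not part of the original header): for source such as
  //   while (a) { while (b) { ... } }  // loop A encloses loop B
  //   while (c) { ... }                // loop C follows loop A
  // the nodes would link up roughly as A.inner == B, B.outer == A,
  // A.next == C and C.previous == A, with top_loop_ (declared further below)
  // pointing at the first outermost node.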

  /*
   * Vectorization restrictions (bit mask).
   */
  enum VectorRestrictions {
    kNone            = 0,        // no restrictions
    kNoMul           = 1 << 0,   // no multiplication
    kNoDiv           = 1 << 1,   // no division
    kNoShift         = 1 << 2,   // no shift
    kNoShr           = 1 << 3,   // no arithmetic shift right
    kNoHiBits        = 1 << 4,   // "wider" operations cannot bring in higher order bits
    kNoSignedHAdd    = 1 << 5,   // no signed halving add
    kNoUnsignedHAdd  = 1 << 6,   // no unsigned halving add
    kNoUnroundedHAdd = 1 << 7,   // no unrounded halving add
    kNoAbs           = 1 << 8,   // no absolute value
    kNoStringCharAt  = 1 << 9,   // no StringCharAt
    kNoReduction     = 1 << 10,  // no reduction
    kNoSAD           = 1 << 11,  // no sum of absolute differences (SAD)
    kNoWideSAD       = 1 << 12,  // no sum of absolute differences (SAD) with operand widening
    kNoDotProd       = 1 << 13,  // no dot product
  };
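  // Illustrative sketch (not part of the original header): restrictions form a
  // plain bit mask, combined with '|' and queried with '&'. A target without,
  // say, SIMD multiply or halving adds could report
  //   *restrictions = kNoMul | kNoSignedHAdd | kNoUnsignedHAdd;
  // and analysis would then reject candidates with a check along the lines of
  //   if ((restrictions & kNoMul) != 0) return false;  // multiply unsupported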

  /*
   * Vectorization mode during synthesis
   * (sequential peeling/cleanup loop or vector loop).
   */
  enum VectorMode {
    kSequential,
    kVector
  };

  /*
   * Representation of a unit-stride array reference.
   */
  struct ArrayReference {
    ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false)
        : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { }
    bool operator<(const ArrayReference& other) const {
      return
          (base < other.base) ||
          (base == other.base &&
           (offset < other.offset || (offset == other.offset &&
                                      (type < other.type ||
                                       (type == other.type &&
                                        (lhs < other.lhs ||
                                         (lhs == other.lhs &&
                                          is_string_char_at < other.is_string_char_at)))))));
    }
    HInstruction* base;      // base address
    HInstruction* offset;    // offset + i
    DataType::Type type;     // component type
    bool lhs;                // def/use
    bool is_string_char_at;  // compressed string read
  };
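  // Illustrative sketch (not part of the original header): for a unit-stride
  // copy loop such as
  //   for (int i = 0; i < n; i++) { a[i] = b[i]; }
  // analysis would record roughly { base = b, offset = i, lhs = false } for
  // the read and { base = a, offset = i, lhs = true } for the write.
  // operator< above defines a strict lexicographic order over
  // (base, offset, type, lhs, is_string_char_at), which is what allows
  // ArrayReference to serve as the key of the ScopedArenaSet vector_refs_
  // declared further below.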

  //
  // Loop setup and traversal.
  //

  bool LocalRun();
  void AddLoop(HLoopInformation* loop_info);
  void RemoveLoop(LoopNode* node);

  // Traverses all loops inner to outer to perform simplifications and optimizations.
  // Returns true if loops nested inside the current loop (node) have changed.
  bool TraverseLoopsInnerToOuter(LoopNode* node);

  //
  // Optimization.
  //

  void SimplifyInduction(LoopNode* node);
  void SimplifyBlocks(LoopNode* node);

  // Performs optimizations specific to an inner loop with finite header logic (empty loop
  // removal, unrolling, vectorization). Returns true if anything changed.
  bool TryOptimizeInnerLoopFinite(LoopNode* node);

  // Performs optimizations specific to an inner loop. Returns true if anything changed.
  bool OptimizeInnerLoop(LoopNode* node);

  // Tries to apply loop unrolling for branch penalty reduction and better instruction scheduling
  // opportunities. Returns whether the transformation happened. 'generate_code' determines
  // whether the optimization should actually be applied.
  bool TryUnrollingForBranchPenaltyReduction(LoopAnalysisInfo* analysis_info,
                                             bool generate_code = true);

  // Tries to apply loop peeling to eliminate loop-invariant exits. Returns whether the
  // transformation happened. 'generate_code' determines whether the optimization should
  // actually be applied.
  bool TryPeelingForLoopInvariantExitsElimination(LoopAnalysisInfo* analysis_info,
                                                  bool generate_code = true);

  // Tries to perform whole loop unrolling for a small loop with a small trip count to eliminate
  // the loop-check overhead and to expose more opportunities for inter-iteration optimizations.
  // Returns whether the transformation happened. 'generate_code' determines whether the
  // optimization should actually be applied.
  bool TryFullUnrolling(LoopAnalysisInfo* analysis_info, bool generate_code = true);

  // Tries to apply scalar loop peeling and unrolling.
  bool TryPeelingAndUnrolling(LoopNode* node);

  //
  // Vectorization analysis and synthesis.
  //

  bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
  void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
  void GenerateNewLoop(LoopNode* node,
                       HBasicBlock* block,
                       HBasicBlock* new_preheader,
                       HInstruction* lo,
                       HInstruction* hi,
                       HInstruction* step,
                       uint32_t unroll);
  bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
  bool VectorizeUse(LoopNode* node,
                    HInstruction* instruction,
                    bool generate_code,
                    DataType::Type type,
                    uint64_t restrictions);
  uint32_t GetVectorSizeInBytes();
  bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions);
  bool TrySetVectorLengthImpl(uint32_t length);

  bool TrySetVectorLength(DataType::Type type, uint32_t length) {
    bool res = TrySetVectorLengthImpl(length);
    // Currently the vectorizer supports only the mode in which full SIMD registers are used.
    DCHECK(!res || (DataType::Size(type) * length == GetVectorSizeInBytes()));
    return res;
  }
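  // For example (illustration only): with 16-byte SIMD registers,
  // TrySetVectorLength(DataType::Type::kInt32, 4) satisfies the invariant
  // above, since DataType::Size(kInt32) * 4 == 4 * 4 == 16 bytes fills a
  // full register; a partially filled register would trip the DCHECK.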

  void GenerateVecInv(HInstruction* org, DataType::Type type);
  void GenerateVecSub(HInstruction* org, HInstruction* offset);
  void GenerateVecMem(HInstruction* org,
                      HInstruction* opa,
                      HInstruction* opb,
                      HInstruction* offset,
                      DataType::Type type);
  void GenerateVecReductionPhi(HPhi* phi);
  void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction);
  HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction);
  void GenerateVecOp(HInstruction* org,
                     HInstruction* opa,
                     HInstruction* opb,
                     DataType::Type type);

  // Vectorization idioms.
  bool VectorizeSaturationIdiom(LoopNode* node,
                                HInstruction* instruction,
                                bool generate_code,
                                DataType::Type type,
                                uint64_t restrictions);
  bool VectorizeHalvingAddIdiom(LoopNode* node,
                                HInstruction* instruction,
                                bool generate_code,
                                DataType::Type type,
                                uint64_t restrictions);
  bool VectorizeSADIdiom(LoopNode* node,
                         HInstruction* instruction,
                         bool generate_code,
                         DataType::Type type,
                         uint64_t restrictions);
  bool VectorizeDotProdIdiom(LoopNode* node,
                             HInstruction* instruction,
                             bool generate_code,
                             DataType::Type type,
                             uint64_t restrictions);

  // Vectorization heuristics.
  Alignment ComputeAlignment(HInstruction* offset,
                             DataType::Type type,
                             bool is_string_char_at,
                             uint32_t peeling = 0);
  void SetAlignmentStrategy(const ScopedArenaVector<uint32_t>& peeling_votes,
                            const ArrayReference* peeling_candidate);
  uint32_t MaxNumberPeeled();
  bool IsVectorizationProfitable(int64_t trip_count);

  //
  // Helpers.
  //

  bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
  bool TrySetPhiReduction(HPhi* phi);

  // Detects a loop header with a single induction (returned in main_phi), possibly
  // other phis for reductions, but no other side effects. Returns true on success.
  bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi);

  bool IsEmptyBody(HBasicBlock* block);
  bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
                           HInstruction* instruction,
                           bool collect_loop_uses,
                           /*out*/ uint32_t* use_count);
  bool IsUsedOutsideLoop(HLoopInformation* loop_info,
                         HInstruction* instruction);
  bool TryReplaceWithLastValue(HLoopInformation* loop_info,
                               HInstruction* instruction,
                               HBasicBlock* block);
  bool TryAssignLastValue(HLoopInformation* loop_info,
                          HInstruction* instruction,
                          HBasicBlock* block,
                          bool collect_loop_uses);
  void RemoveDeadInstructions(const HInstructionList& list);
  bool CanRemoveCycle();  // Whether the current 'iset_' is removable.

  bool IsInPredicatedVectorizationMode() const { return predicated_vectorization_mode_; }

  // Compiler options (to query ISA features).
  const CompilerOptions* compiler_options_;

  // Cached target SIMD vector register size in bytes.
  const size_t simd_register_size_;

  // Range information based on prior induction variable analysis.
  InductionVarRange induction_range_;

  // Phase-local heap memory allocator for the loop optimizer. Storage obtained
  // through this allocator is immediately released when the loop optimizer is done.
  ScopedArenaAllocator* loop_allocator_;

  // Global heap memory allocator. Used to build HIR.
  ArenaAllocator* global_allocator_;

  // Entries into the loop hierarchy representation. The hierarchy resides
  // in phase-local heap memory.
  LoopNode* top_loop_;
  LoopNode* last_loop_;

  // Temporary bookkeeping of a set of instructions.
  // Contents reside in phase-local heap memory.
  ScopedArenaSet<HInstruction*>* iset_;

  // Temporary bookkeeping of reduction instructions. Mapping is two-fold:
  // (1) reductions in the loop-body are mapped back to their phi definition,
  // (2) phi definitions are mapped to their initial value (updated during
  //     code generation to feed the proper values into the new chain).
  // Contents reside in phase-local heap memory.
  ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_;
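  // Illustrative sketch (not part of the original header): for a reduction like
  //   int sum = 0; for (int i = 0; i < n; i++) { sum += a[i]; }
  // the map would roughly hold (1) the body add mapped to the loop phi of
  // 'sum' and (2) that phi mapped to the initial value 0, the latter being
  // rewritten during synthesis so the new vector chain is seeded correctly.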

  // Flag that tracks if any simplifications have occurred.
  bool simplified_;

  // Whether to use predicated loop vectorization (e.g. for arm64 SVE target).
  bool predicated_vectorization_mode_;

  // Number of "lanes" for selected packed type.
  uint32_t vector_length_;

  // Set of array references in the vector loop.
  // Contents reside in phase-local heap memory.
  ScopedArenaSet<ArrayReference>* vector_refs_;

  // Static or dynamic loop peeling for alignment.
  uint32_t vector_static_peeling_factor_;
  const ArrayReference* vector_dynamic_peeling_candidate_;

  // Dynamic data dependence test of the form a != b.
  HInstruction* vector_runtime_test_a_;
  HInstruction* vector_runtime_test_b_;

  // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
  // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data
  // structure maps original instructions into the new instructions.
  // Contents reside in phase-local heap memory.
  ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;
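  // Illustrative sketch (not part of the original header): while synthesizing
  // the kVector loop, a scalar HAdd in the original body would map to a newly
  // created HVecAdd here; in kSequential mode it would map to its scalar clone
  // in the peeling/cleanup loop instead.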

  // Permanent mapping used during vectorization synthesis.
  // Contents reside in phase-local heap memory.
  ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_;

  // Temporary vectorization bookkeeping.
  VectorMode vector_mode_;  // synthesis mode
  HBasicBlock* vector_preheader_;  // preheader of the new loop
  HBasicBlock* vector_header_;  // header of the new loop
  HBasicBlock* vector_body_;  // body of the new loop
  HInstruction* vector_index_;  // normalized index of the new loop

  // Helper for target-specific behaviour for loop optimizations.
  ArchNoOptsLoopHelper* arch_loop_helper_;

  friend class LoopOptimizationTest;

  DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
};

}  // namespace art

#endif  // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_