1 /*
2  * Copyright (C) 2016 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
18 #define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
19 
20 #include "base/scoped_arena_allocator.h"
21 #include "base/scoped_arena_containers.h"
22 #include "induction_var_range.h"
23 #include "nodes.h"
24 #include "optimization.h"
25 
26 namespace art {
27 
28 class CompilerDriver;
29 
30 /**
31  * Loop optimizations. Builds a loop hierarchy and applies optimizations to
32  * the detected nested loops, such as removal of dead induction and empty loops
33  * and inner loop vectorization.
34  */
35 class HLoopOptimization : public HOptimization {
36  public:
37   HLoopOptimization(HGraph* graph,
38                     CompilerDriver* compiler_driver,
39                     HInductionVarAnalysis* induction_analysis,
40                     OptimizingCompilerStats* stats,
41                     const char* name = kLoopOptimizationPassName);
42 
43   void Run() OVERRIDE;
44 
45   static constexpr const char* kLoopOptimizationPassName = "loop_optimization";
46 
47  private:
48   /**
49    * A single loop inside the loop hierarchy representation.
50    */
51   struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> {
LoopNodeLoopNode52     explicit LoopNode(HLoopInformation* lp_info)
53         : loop_info(lp_info),
54           outer(nullptr),
55           inner(nullptr),
56           previous(nullptr),
57           next(nullptr) {}
58     HLoopInformation* loop_info;
59     LoopNode* outer;
60     LoopNode* inner;
61     LoopNode* previous;
62     LoopNode* next;
63   };
64 
65   /*
66    * Vectorization restrictions (bit mask).
67    */
68   enum VectorRestrictions {
69     kNone            = 0,        // no restrictions
70     kNoMul           = 1 << 0,   // no multiplication
71     kNoDiv           = 1 << 1,   // no division
72     kNoShift         = 1 << 2,   // no shift
73     kNoShr           = 1 << 3,   // no arithmetic shift right
74     kNoHiBits        = 1 << 4,   // "wider" operations cannot bring in higher order bits
75     kNoSignedHAdd    = 1 << 5,   // no signed halving add
76     kNoUnroundedHAdd = 1 << 6,   // no unrounded halving add
77     kNoAbs           = 1 << 7,   // no absolute value
78     kNoMinMax        = 1 << 8,   // no min/max
79     kNoStringCharAt  = 1 << 9,   // no StringCharAt
80     kNoReduction     = 1 << 10,  // no reduction
81     kNoSAD           = 1 << 11,  // no sum of absolute differences (SAD)
82     kNoWideSAD       = 1 << 12,  // no sum of absolute differences (SAD) with operand widening
83   };
84 
85   /*
86    * Vectorization mode during synthesis
87    * (sequential peeling/cleanup loop or vector loop).
88    */
89   enum VectorMode {
90     kSequential,
91     kVector
92   };
93 
94   /*
95    * Representation of a unit-stride array reference.
96    */
97   struct ArrayReference {
98     ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false)
baseArrayReference99         : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { }
100     bool operator<(const ArrayReference& other) const {
101       return
102           (base < other.base) ||
103           (base == other.base &&
104            (offset < other.offset || (offset == other.offset &&
105                                       (type < other.type ||
106                                        (type == other.type &&
107                                         (lhs < other.lhs ||
108                                          (lhs == other.lhs &&
109                                           is_string_char_at < other.is_string_char_at)))))));
110     }
111     HInstruction* base;      // base address
112     HInstruction* offset;    // offset + i
113     DataType::Type type;     // component type
114     bool lhs;                // def/use
115     bool is_string_char_at;  // compressed string read
116   };
117 
118   //
119   // Loop setup and traversal.
120   //
121 
122   void LocalRun();
123   void AddLoop(HLoopInformation* loop_info);
124   void RemoveLoop(LoopNode* node);
125 
126   // Traverses all loops inner to outer to perform simplifications and optimizations.
127   // Returns true if loops nested inside current loop (node) have changed.
128   bool TraverseLoopsInnerToOuter(LoopNode* node);
129 
130   //
131   // Optimization.
132   //
133 
134   void SimplifyInduction(LoopNode* node);
135   void SimplifyBlocks(LoopNode* node);
136 
137   // Performs optimizations specific to inner loop (empty loop removal,
138   // unrolling, vectorization). Returns true if anything changed.
139   bool OptimizeInnerLoop(LoopNode* node);
140 
141   //
142   // Vectorization analysis and synthesis.
143   //
144 
145   bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count);
146   void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count);
147   void GenerateNewLoop(LoopNode* node,
148                        HBasicBlock* block,
149                        HBasicBlock* new_preheader,
150                        HInstruction* lo,
151                        HInstruction* hi,
152                        HInstruction* step,
153                        uint32_t unroll);
154   bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code);
155   bool VectorizeUse(LoopNode* node,
156                     HInstruction* instruction,
157                     bool generate_code,
158                     DataType::Type type,
159                     uint64_t restrictions);
160   uint32_t GetVectorSizeInBytes();
161   bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions);
162   bool TrySetVectorLength(uint32_t length);
163   void GenerateVecInv(HInstruction* org, DataType::Type type);
164   void GenerateVecSub(HInstruction* org, HInstruction* offset);
165   void GenerateVecMem(HInstruction* org,
166                       HInstruction* opa,
167                       HInstruction* opb,
168                       HInstruction* offset,
169                       DataType::Type type);
170   void GenerateVecReductionPhi(HPhi* phi);
171   void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction);
172   HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction);
173   void GenerateVecOp(HInstruction* org,
174                      HInstruction* opa,
175                      HInstruction* opb,
176                      DataType::Type type,
177                      bool is_unsigned = false);
178 
179   // Vectorization idioms.
180   bool VectorizeHalvingAddIdiom(LoopNode* node,
181                                 HInstruction* instruction,
182                                 bool generate_code,
183                                 DataType::Type type,
184                                 uint64_t restrictions);
185   bool VectorizeSADIdiom(LoopNode* node,
186                          HInstruction* instruction,
187                          bool generate_code,
188                          DataType::Type type,
189                          uint64_t restrictions);
190 
191   // Vectorization heuristics.
192   Alignment ComputeAlignment(HInstruction* offset,
193                              DataType::Type type,
194                              bool is_string_char_at,
195                              uint32_t peeling = 0);
196   void SetAlignmentStrategy(uint32_t peeling_votes[],
197                             const ArrayReference* peeling_candidate);
198   uint32_t MaxNumberPeeled();
199   bool IsVectorizationProfitable(int64_t trip_count);
200   uint32_t GetUnrollingFactor(HBasicBlock* block, int64_t trip_count);
201 
202   //
203   // Helpers.
204   //
205 
206   bool TrySetPhiInduction(HPhi* phi, bool restrict_uses);
207   bool TrySetPhiReduction(HPhi* phi);
208 
209   // Detects loop header with a single induction (returned in main_phi), possibly
210   // other phis for reductions, but no other side effects. Returns true on success.
211   bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi);
212 
213   bool IsEmptyBody(HBasicBlock* block);
214   bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info,
215                            HInstruction* instruction,
216                            bool collect_loop_uses,
217                            /*out*/ uint32_t* use_count);
218   bool IsUsedOutsideLoop(HLoopInformation* loop_info,
219                          HInstruction* instruction);
220   bool TryReplaceWithLastValue(HLoopInformation* loop_info,
221                                HInstruction* instruction,
222                                HBasicBlock* block);
223   bool TryAssignLastValue(HLoopInformation* loop_info,
224                           HInstruction* instruction,
225                           HBasicBlock* block,
226                           bool collect_loop_uses);
227   void RemoveDeadInstructions(const HInstructionList& list);
228   bool CanRemoveCycle();  // Whether the current 'iset_' is removable.
229 
230   // Compiler driver (to query ISA features).
231   const CompilerDriver* compiler_driver_;
232 
233   // Range information based on prior induction variable analysis.
234   InductionVarRange induction_range_;
235 
236   // Phase-local heap memory allocator for the loop optimizer. Storage obtained
237   // through this allocator is immediately released when the loop optimizer is done.
238   ScopedArenaAllocator* loop_allocator_;
239 
240   // Global heap memory allocator. Used to build HIR.
241   ArenaAllocator* global_allocator_;
242 
243   // Entries into the loop hierarchy representation. The hierarchy resides
244   // in phase-local heap memory.
245   LoopNode* top_loop_;
246   LoopNode* last_loop_;
247 
248   // Temporary bookkeeping of a set of instructions.
249   // Contents reside in phase-local heap memory.
250   ScopedArenaSet<HInstruction*>* iset_;
251 
252   // Temporary bookkeeping of reduction instructions. Mapping is two-fold:
253   // (1) reductions in the loop-body are mapped back to their phi definition,
254   // (2) phi definitions are mapped to their initial value (updated during
255   //     code generation to feed the proper values into the new chain).
256   // Contents reside in phase-local heap memory.
257   ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_;
258 
259   // Flag that tracks if any simplifications have occurred.
260   bool simplified_;
261 
262   // Number of "lanes" for selected packed type.
263   uint32_t vector_length_;
264 
265   // Set of array references in the vector loop.
266   // Contents reside in phase-local heap memory.
267   ScopedArenaSet<ArrayReference>* vector_refs_;
268 
269   // Static or dynamic loop peeling for alignment.
270   uint32_t vector_static_peeling_factor_;
271   const ArrayReference* vector_dynamic_peeling_candidate_;
272 
273   // Dynamic data dependence test of the form a != b.
274   HInstruction* vector_runtime_test_a_;
275   HInstruction* vector_runtime_test_b_;
276 
277   // Mapping used during vectorization synthesis for both the scalar peeling/cleanup
278   // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data
279   // structure maps original instructions into the new instructions.
280   // Contents reside in phase-local heap memory.
281   ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_;
282 
283   // Permanent mapping used during vectorization synthesis.
284   // Contents reside in phase-local heap memory.
285   ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_;
286 
287   // Temporary vectorization bookkeeping.
288   VectorMode vector_mode_;  // synthesis mode
289   HBasicBlock* vector_preheader_;  // preheader of the new loop
290   HBasicBlock* vector_header_;  // header of the new loop
291   HBasicBlock* vector_body_;  // body of the new loop
292   HInstruction* vector_index_;  // normalized index of the new loop
293 
294   friend class LoopOptimizationTest;
295 
296   DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
297 };
298 
299 }  // namespace art
300 
301 #endif  // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_
302