1 /* 2 * Copyright (C) 2016 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 18 #define ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 19 20 #include "base/scoped_arena_allocator.h" 21 #include "base/scoped_arena_containers.h" 22 #include "induction_var_range.h" 23 #include "nodes.h" 24 #include "optimization.h" 25 26 namespace art { 27 28 class CompilerDriver; 29 30 /** 31 * Loop optimizations. Builds a loop hierarchy and applies optimizations to 32 * the detected nested loops, such as removal of dead induction and empty loops 33 * and inner loop vectorization. 34 */ 35 class HLoopOptimization : public HOptimization { 36 public: 37 HLoopOptimization(HGraph* graph, 38 CompilerDriver* compiler_driver, 39 HInductionVarAnalysis* induction_analysis, 40 OptimizingCompilerStats* stats, 41 const char* name = kLoopOptimizationPassName); 42 43 void Run() OVERRIDE; 44 45 static constexpr const char* kLoopOptimizationPassName = "loop_optimization"; 46 47 private: 48 /** 49 * A single loop inside the loop hierarchy representation. 50 */ 51 struct LoopNode : public ArenaObject<kArenaAllocLoopOptimization> { LoopNodeLoopNode52 explicit LoopNode(HLoopInformation* lp_info) 53 : loop_info(lp_info), 54 outer(nullptr), 55 inner(nullptr), 56 previous(nullptr), 57 next(nullptr) {} 58 HLoopInformation* loop_info; 59 LoopNode* outer; 60 LoopNode* inner; 61 LoopNode* previous; 62 LoopNode* next; 63 }; 64 65 /* 66 * Vectorization restrictions (bit mask). 67 */ 68 enum VectorRestrictions { 69 kNone = 0, // no restrictions 70 kNoMul = 1 << 0, // no multiplication 71 kNoDiv = 1 << 1, // no division 72 kNoShift = 1 << 2, // no shift 73 kNoShr = 1 << 3, // no arithmetic shift right 74 kNoHiBits = 1 << 4, // "wider" operations cannot bring in higher order bits 75 kNoSignedHAdd = 1 << 5, // no signed halving add 76 kNoUnroundedHAdd = 1 << 6, // no unrounded halving add 77 kNoAbs = 1 << 7, // no absolute value 78 kNoMinMax = 1 << 8, // no min/max 79 kNoStringCharAt = 1 << 9, // no StringCharAt 80 kNoReduction = 1 << 10, // no reduction 81 kNoSAD = 1 << 11, // no sum of absolute differences (SAD) 82 kNoWideSAD = 1 << 12, // no sum of absolute differences (SAD) with operand widening 83 }; 84 85 /* 86 * Vectorization mode during synthesis 87 * (sequential peeling/cleanup loop or vector loop). 88 */ 89 enum VectorMode { 90 kSequential, 91 kVector 92 }; 93 94 /* 95 * Representation of a unit-stride array reference. 96 */ 97 struct ArrayReference { 98 ArrayReference(HInstruction* b, HInstruction* o, DataType::Type t, bool l, bool c = false) baseArrayReference99 : base(b), offset(o), type(t), lhs(l), is_string_char_at(c) { } 100 bool operator<(const ArrayReference& other) const { 101 return 102 (base < other.base) || 103 (base == other.base && 104 (offset < other.offset || (offset == other.offset && 105 (type < other.type || 106 (type == other.type && 107 (lhs < other.lhs || 108 (lhs == other.lhs && 109 is_string_char_at < other.is_string_char_at))))))); 110 } 111 HInstruction* base; // base address 112 HInstruction* offset; // offset + i 113 DataType::Type type; // component type 114 bool lhs; // def/use 115 bool is_string_char_at; // compressed string read 116 }; 117 118 // 119 // Loop setup and traversal. 120 // 121 122 void LocalRun(); 123 void AddLoop(HLoopInformation* loop_info); 124 void RemoveLoop(LoopNode* node); 125 126 // Traverses all loops inner to outer to perform simplifications and optimizations. 127 // Returns true if loops nested inside current loop (node) have changed. 128 bool TraverseLoopsInnerToOuter(LoopNode* node); 129 130 // 131 // Optimization. 132 // 133 134 void SimplifyInduction(LoopNode* node); 135 void SimplifyBlocks(LoopNode* node); 136 137 // Performs optimizations specific to inner loop (empty loop removal, 138 // unrolling, vectorization). Returns true if anything changed. 139 bool OptimizeInnerLoop(LoopNode* node); 140 141 // 142 // Vectorization analysis and synthesis. 143 // 144 145 bool ShouldVectorize(LoopNode* node, HBasicBlock* block, int64_t trip_count); 146 void Vectorize(LoopNode* node, HBasicBlock* block, HBasicBlock* exit, int64_t trip_count); 147 void GenerateNewLoop(LoopNode* node, 148 HBasicBlock* block, 149 HBasicBlock* new_preheader, 150 HInstruction* lo, 151 HInstruction* hi, 152 HInstruction* step, 153 uint32_t unroll); 154 bool VectorizeDef(LoopNode* node, HInstruction* instruction, bool generate_code); 155 bool VectorizeUse(LoopNode* node, 156 HInstruction* instruction, 157 bool generate_code, 158 DataType::Type type, 159 uint64_t restrictions); 160 uint32_t GetVectorSizeInBytes(); 161 bool TrySetVectorType(DataType::Type type, /*out*/ uint64_t* restrictions); 162 bool TrySetVectorLength(uint32_t length); 163 void GenerateVecInv(HInstruction* org, DataType::Type type); 164 void GenerateVecSub(HInstruction* org, HInstruction* offset); 165 void GenerateVecMem(HInstruction* org, 166 HInstruction* opa, 167 HInstruction* opb, 168 HInstruction* offset, 169 DataType::Type type); 170 void GenerateVecReductionPhi(HPhi* phi); 171 void GenerateVecReductionPhiInputs(HPhi* phi, HInstruction* reduction); 172 HInstruction* ReduceAndExtractIfNeeded(HInstruction* instruction); 173 void GenerateVecOp(HInstruction* org, 174 HInstruction* opa, 175 HInstruction* opb, 176 DataType::Type type, 177 bool is_unsigned = false); 178 179 // Vectorization idioms. 180 bool VectorizeHalvingAddIdiom(LoopNode* node, 181 HInstruction* instruction, 182 bool generate_code, 183 DataType::Type type, 184 uint64_t restrictions); 185 bool VectorizeSADIdiom(LoopNode* node, 186 HInstruction* instruction, 187 bool generate_code, 188 DataType::Type type, 189 uint64_t restrictions); 190 191 // Vectorization heuristics. 192 Alignment ComputeAlignment(HInstruction* offset, 193 DataType::Type type, 194 bool is_string_char_at, 195 uint32_t peeling = 0); 196 void SetAlignmentStrategy(uint32_t peeling_votes[], 197 const ArrayReference* peeling_candidate); 198 uint32_t MaxNumberPeeled(); 199 bool IsVectorizationProfitable(int64_t trip_count); 200 uint32_t GetUnrollingFactor(HBasicBlock* block, int64_t trip_count); 201 202 // 203 // Helpers. 204 // 205 206 bool TrySetPhiInduction(HPhi* phi, bool restrict_uses); 207 bool TrySetPhiReduction(HPhi* phi); 208 209 // Detects loop header with a single induction (returned in main_phi), possibly 210 // other phis for reductions, but no other side effects. Returns true on success. 211 bool TrySetSimpleLoopHeader(HBasicBlock* block, /*out*/ HPhi** main_phi); 212 213 bool IsEmptyBody(HBasicBlock* block); 214 bool IsOnlyUsedAfterLoop(HLoopInformation* loop_info, 215 HInstruction* instruction, 216 bool collect_loop_uses, 217 /*out*/ uint32_t* use_count); 218 bool IsUsedOutsideLoop(HLoopInformation* loop_info, 219 HInstruction* instruction); 220 bool TryReplaceWithLastValue(HLoopInformation* loop_info, 221 HInstruction* instruction, 222 HBasicBlock* block); 223 bool TryAssignLastValue(HLoopInformation* loop_info, 224 HInstruction* instruction, 225 HBasicBlock* block, 226 bool collect_loop_uses); 227 void RemoveDeadInstructions(const HInstructionList& list); 228 bool CanRemoveCycle(); // Whether the current 'iset_' is removable. 229 230 // Compiler driver (to query ISA features). 231 const CompilerDriver* compiler_driver_; 232 233 // Range information based on prior induction variable analysis. 234 InductionVarRange induction_range_; 235 236 // Phase-local heap memory allocator for the loop optimizer. Storage obtained 237 // through this allocator is immediately released when the loop optimizer is done. 238 ScopedArenaAllocator* loop_allocator_; 239 240 // Global heap memory allocator. Used to build HIR. 241 ArenaAllocator* global_allocator_; 242 243 // Entries into the loop hierarchy representation. The hierarchy resides 244 // in phase-local heap memory. 245 LoopNode* top_loop_; 246 LoopNode* last_loop_; 247 248 // Temporary bookkeeping of a set of instructions. 249 // Contents reside in phase-local heap memory. 250 ScopedArenaSet<HInstruction*>* iset_; 251 252 // Temporary bookkeeping of reduction instructions. Mapping is two-fold: 253 // (1) reductions in the loop-body are mapped back to their phi definition, 254 // (2) phi definitions are mapped to their initial value (updated during 255 // code generation to feed the proper values into the new chain). 256 // Contents reside in phase-local heap memory. 257 ScopedArenaSafeMap<HInstruction*, HInstruction*>* reductions_; 258 259 // Flag that tracks if any simplifications have occurred. 260 bool simplified_; 261 262 // Number of "lanes" for selected packed type. 263 uint32_t vector_length_; 264 265 // Set of array references in the vector loop. 266 // Contents reside in phase-local heap memory. 267 ScopedArenaSet<ArrayReference>* vector_refs_; 268 269 // Static or dynamic loop peeling for alignment. 270 uint32_t vector_static_peeling_factor_; 271 const ArrayReference* vector_dynamic_peeling_candidate_; 272 273 // Dynamic data dependence test of the form a != b. 274 HInstruction* vector_runtime_test_a_; 275 HInstruction* vector_runtime_test_b_; 276 277 // Mapping used during vectorization synthesis for both the scalar peeling/cleanup 278 // loop (mode is kSequential) and the actual vector loop (mode is kVector). The data 279 // structure maps original instructions into the new instructions. 280 // Contents reside in phase-local heap memory. 281 ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_map_; 282 283 // Permanent mapping used during vectorization synthesis. 284 // Contents reside in phase-local heap memory. 285 ScopedArenaSafeMap<HInstruction*, HInstruction*>* vector_permanent_map_; 286 287 // Temporary vectorization bookkeeping. 288 VectorMode vector_mode_; // synthesis mode 289 HBasicBlock* vector_preheader_; // preheader of the new loop 290 HBasicBlock* vector_header_; // header of the new loop 291 HBasicBlock* vector_body_; // body of the new loop 292 HInstruction* vector_index_; // normalized index of the new loop 293 294 friend class LoopOptimizationTest; 295 296 DISALLOW_COPY_AND_ASSIGN(HLoopOptimization); 297 }; 298 299 } // namespace art 300 301 #endif // ART_COMPILER_OPTIMIZING_LOOP_OPTIMIZATION_H_ 302