1 //===- polly/ScheduleOptimizer.h - The Schedule Optimizer -------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #ifndef POLLY_SCHEDULEOPTIMIZER_H 10 #define POLLY_SCHEDULEOPTIMIZER_H 11 12 #include "llvm/ADT/ArrayRef.h" 13 #include "isl/isl-noexceptions.h" 14 15 namespace llvm { 16 17 class TargetTransformInfo; 18 } // namespace llvm 19 20 struct isl_schedule_node; 21 22 /// Parameters of the micro kernel. 23 /// 24 /// Parameters, which determine sizes of rank-1 (i.e., outer product) update 25 /// used in the optimized matrix multiplication. 26 struct MicroKernelParamsTy { 27 int Mr; 28 int Nr; 29 }; 30 31 /// Parameters of the macro kernel. 32 /// 33 /// Parameters, which determine sizes of blocks of partitioned matrices 34 /// used in the optimized matrix multiplication. 35 struct MacroKernelParamsTy { 36 int Mc; 37 int Nc; 38 int Kc; 39 }; 40 41 namespace polly { 42 43 struct Dependences; 44 class MemoryAccess; 45 class Scop; 46 47 /// Additional parameters of the schedule optimizer. 48 /// 49 /// Target Transform Info and the SCoP dependencies used by the schedule 50 /// optimizer. 51 struct OptimizerAdditionalInfoTy { 52 const llvm::TargetTransformInfo *TTI; 53 const Dependences *D; 54 }; 55 56 /// Parameters of the matrix multiplication operands. 57 /// 58 /// Parameters, which describe access relations that represent operands of the 59 /// matrix multiplication. 60 struct MatMulInfoTy { 61 MemoryAccess *A = nullptr; 62 MemoryAccess *B = nullptr; 63 MemoryAccess *ReadFromC = nullptr; 64 MemoryAccess *WriteToC = nullptr; 65 int i = -1; 66 int j = -1; 67 int k = -1; 68 }; 69 70 extern bool DisablePollyTiling; 71 } // namespace polly 72 73 class ScheduleTreeOptimizer { 74 public: 75 /// Apply schedule tree transformations. 76 /// 77 /// This function takes an (possibly already optimized) schedule tree and 78 /// applies a set of additional optimizations on the schedule tree. The 79 /// transformations applied include: 80 /// 81 /// - Tiling 82 /// - Prevectorization 83 /// 84 /// @param Schedule The schedule object the transformations will be applied 85 /// to. 86 /// @param OAI Target Transform Info and the SCoP dependencies. 87 /// @returns The transformed schedule. 88 static isl::schedule 89 optimizeSchedule(isl::schedule Schedule, 90 const polly::OptimizerAdditionalInfoTy *OAI = nullptr); 91 92 /// Apply schedule tree transformations. 93 /// 94 /// This function takes a node in an (possibly already optimized) schedule 95 /// tree and applies a set of additional optimizations on this schedule tree 96 /// node and its descendants. The transformations applied include: 97 /// 98 /// - Tiling 99 /// - Prevectorization 100 /// 101 /// @param Node The schedule object post-transformations will be applied to. 102 /// @param OAI Target Transform Info and the SCoP dependencies. 103 /// @returns The transformed schedule. 104 static isl::schedule_node 105 optimizeScheduleNode(isl::schedule_node Node, 106 const polly::OptimizerAdditionalInfoTy *OAI = nullptr); 107 108 /// Decide if the @p NewSchedule is profitable for @p S. 109 /// 110 /// @param S The SCoP we optimize. 111 /// @param NewSchedule The new schedule we computed. 112 /// 113 /// @return True, if we believe @p NewSchedule is an improvement for @p S. 114 static bool isProfitableSchedule(polly::Scop &S, isl::schedule NewSchedule); 115 116 /// Isolate a set of partial tile prefixes. 117 /// 118 /// This set should ensure that it contains only partial tile prefixes that 119 /// have exactly VectorWidth iterations. 120 /// 121 /// @param Node A schedule node band, which is a parent of a band node, 122 /// that contains a vector loop. 123 /// @return Modified isl_schedule_node. 124 static isl::schedule_node isolateFullPartialTiles(isl::schedule_node Node, 125 int VectorWidth); 126 127 private: 128 /// Tile a schedule node. 129 /// 130 /// @param Node The node to tile. 131 /// @param Identifier An name that identifies this kind of tiling and 132 /// that is used to mark the tiled loops in the 133 /// generated AST. 134 /// @param TileSizes A vector of tile sizes that should be used for 135 /// tiling. 136 /// @param DefaultTileSize A default tile size that is used for dimensions 137 /// that are not covered by the TileSizes vector. 138 static isl::schedule_node tileNode(isl::schedule_node Node, 139 const char *Identifier, 140 llvm::ArrayRef<int> TileSizes, 141 int DefaultTileSize); 142 143 /// Tile a schedule node and unroll point loops. 144 /// 145 /// @param Node The node to register tile. 146 /// @param TileSizes A vector of tile sizes that should be used for 147 /// tiling. 148 /// @param DefaultTileSize A default tile size that is used for dimensions 149 static isl::schedule_node applyRegisterTiling(isl::schedule_node Node, 150 llvm::ArrayRef<int> TileSizes, 151 int DefaultTileSize); 152 153 /// Apply the BLIS matmul optimization pattern. 154 /// 155 /// Make the loops containing the matrix multiplication be the innermost 156 /// loops and apply the BLIS matmul optimization pattern. BLIS implements 157 /// gemm as three nested loops around a macro-kernel, plus two packing 158 /// routines. The macro-kernel is implemented in terms of two additional 159 /// loops around a micro-kernel. The micro-kernel is a loop around a rank-1 160 /// (i.e., outer product) update. 161 /// 162 /// For a detailed description please see [1]. 163 /// 164 /// The order of the loops defines the data reused in the BLIS implementation 165 /// of gemm ([1]). In particular, elements of the matrix B, the second 166 /// operand of matrix multiplication, are reused between iterations of the 167 /// innermost loop. To keep the reused data in cache, only elements of matrix 168 /// A, the first operand of matrix multiplication, should be evicted during 169 /// an iteration of the innermost loop. To provide such a cache replacement 170 /// policy, elements of the matrix A can, in particular, be loaded first and, 171 /// consequently, be least-recently-used. 172 /// 173 /// In our case matrices are stored in row-major order instead of 174 /// column-major order used in the BLIS implementation ([1]). It affects only 175 /// on the form of the BLIS micro kernel and the computation of its 176 /// parameters. In particular, reused elements of the matrix B are 177 /// successively multiplied by specific elements of the matrix A. 178 /// 179 /// Refs.: 180 /// [1] - Analytical Modeling is Enough for High Performance BLIS 181 /// Tze Meng Low, Francisco D Igual, Tyler M Smith, Enrique S Quintana-Orti 182 /// Technical Report, 2014 183 /// http://www.cs.utexas.edu/users/flame/pubs/TOMS-BLIS-Analytical.pdf 184 /// 185 /// @see ScheduleTreeOptimizer::createMicroKernel 186 /// @see ScheduleTreeOptimizer::createMacroKernel 187 /// @see getMicroKernelParams 188 /// @see getMacroKernelParams 189 /// 190 /// TODO: Implement the packing transformation. 191 /// 192 /// @param Node The node that contains a band to be optimized. The node 193 /// is required to successfully pass 194 /// ScheduleTreeOptimizer::isMatrMultPattern. 195 /// @param TTI Target Transform Info. 196 /// @param MMI Parameters of the matrix multiplication operands. 197 /// @returns The transformed schedule. 198 static isl::schedule_node 199 optimizeMatMulPattern(isl::schedule_node Node, 200 const llvm::TargetTransformInfo *TTI, 201 polly::MatMulInfoTy &MMI); 202 203 /// Check if this node is a band node we want to tile. 204 /// 205 /// We look for innermost band nodes where individual dimensions are marked as 206 /// permutable. 207 /// 208 /// @param Node The node to check. 209 static bool isTileableBandNode(isl::schedule_node Node); 210 211 /// Pre-vectorizes one scheduling dimension of a schedule band. 212 /// 213 /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and 214 /// sinks the resulting point loop. 215 /// 216 /// Example (DimToVectorize=0, VectorWidth=4): 217 /// 218 /// | Before transformation: 219 /// | 220 /// | A[i,j] -> [i,j] 221 /// | 222 /// | for (i = 0; i < 128; i++) 223 /// | for (j = 0; j < 128; j++) 224 /// | A(i,j); 225 /// 226 /// | After transformation: 227 /// | 228 /// | for (it = 0; it < 32; it+=1) 229 /// | for (j = 0; j < 128; j++) 230 /// | for (ip = 0; ip <= 3; ip++) 231 /// | A(4 * it + ip,j); 232 /// 233 /// The goal of this transformation is to create a trivially vectorizable 234 /// loop. This means a parallel loop at the innermost level that has a 235 /// constant number of iterations corresponding to the target vector width. 236 /// 237 /// This transformation creates a loop at the innermost level. The loop has 238 /// a constant number of iterations, if the number of loop iterations at 239 /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is 240 /// currently constant and not yet target specific. This function does not 241 /// reason about parallelism. 242 static isl::schedule_node prevectSchedBand(isl::schedule_node Node, 243 unsigned DimToVectorize, 244 int VectorWidth); 245 246 /// Apply additional optimizations on the bands in the schedule tree. 247 /// 248 /// We are looking for an innermost band node and apply the following 249 /// transformations: 250 /// 251 /// - Tile the band 252 /// - if the band is tileable 253 /// - if the band has more than one loop dimension 254 /// 255 /// - Prevectorize the schedule of the band (or the point loop in case of 256 /// tiling). 257 /// - if vectorization is enabled 258 /// 259 /// @param Node The schedule node to (possibly) optimize. 260 /// @param User A pointer to forward some use information 261 /// (currently unused). 262 static isl_schedule_node *optimizeBand(isl_schedule_node *Node, void *User); 263 264 /// Apply additional optimizations on the bands in the schedule tree. 265 /// 266 /// We apply the following 267 /// transformations: 268 /// 269 /// - Tile the band 270 /// - Prevectorize the schedule of the band (or the point loop in case of 271 /// tiling). 272 /// - if vectorization is enabled 273 /// 274 /// @param Node The schedule node to (possibly) optimize. 275 /// @param User A pointer to forward some use information 276 /// (currently unused). 277 static isl::schedule_node standardBandOpts(isl::schedule_node Node, 278 void *User); 279 280 /// Check if this node contains a partial schedule that could 281 /// probably be optimized with analytical modeling. 282 /// 283 /// isMatrMultPattern tries to determine whether the following conditions 284 /// are true: 285 /// 1. the partial schedule contains only one statement. 286 /// 2. there are exactly three input dimensions. 287 /// 3. all memory accesses of the statement will have stride 0 or 1, if we 288 /// interchange loops (switch the variable used in the inner loop to 289 /// the outer loop). 290 /// 4. all memory accesses of the statement except from the last one, are 291 /// read memory access and the last one is write memory access. 292 /// 5. all subscripts of the last memory access of the statement don't 293 /// contain the variable used in the inner loop. 294 /// If this is the case, we could try to use an approach that is similar to 295 /// the one used to get close-to-peak performance of matrix multiplications. 296 /// 297 /// @param Node The node to check. 298 /// @param D The SCoP dependencies. 299 /// @param MMI Parameters of the matrix multiplication operands. 300 static bool isMatrMultPattern(isl::schedule_node Node, 301 const polly::Dependences *D, 302 polly::MatMulInfoTy &MMI); 303 304 /// Create the BLIS macro-kernel. 305 /// 306 /// We create the BLIS macro-kernel by applying a combination of tiling 307 /// of dimensions of the band node and interchanging of two innermost 308 /// modified dimensions. The values of of MacroKernelParams's fields are used 309 /// as tile sizes. 310 /// 311 /// @param Node The schedule node to be modified. 312 /// @param MacroKernelParams Parameters of the macro kernel 313 /// to be used as tile sizes. 314 static isl::schedule_node 315 createMacroKernel(isl::schedule_node Node, 316 MacroKernelParamsTy MacroKernelParams); 317 318 /// Create the BLIS macro-kernel. 319 /// 320 /// We create the BLIS macro-kernel by applying a combination of tiling 321 /// of dimensions of the band node and interchanging of two innermost 322 /// modified dimensions. The values passed in MicroKernelParam are used 323 /// as tile sizes. 324 /// 325 /// @param Node The schedule node to be modified. 326 /// @param MicroKernelParams Parameters of the micro kernel 327 /// to be used as tile sizes. 328 /// @see MicroKernelParamsTy 329 static isl::schedule_node 330 createMicroKernel(isl::schedule_node Node, 331 MicroKernelParamsTy MicroKernelParams); 332 }; 333 334 /// Build the desired set of partial tile prefixes. 335 /// 336 /// We build a set of partial tile prefixes, which are prefixes of the vector 337 /// loop that have exactly VectorWidth iterations. 338 /// 339 /// 1. Drop all constraints involving the dimension that represents the 340 /// vector loop. 341 /// 2. Constrain the last dimension to get a set, which has exactly VectorWidth 342 /// iterations. 343 /// 3. Subtract loop domain from it, project out the vector loop dimension and 344 /// get a set that contains prefixes, which do not have exactly VectorWidth 345 /// iterations. 346 /// 4. Project out the vector loop dimension of the set that was build on the 347 /// first step and subtract the set built on the previous step to get the 348 /// desired set of prefixes. 349 /// 350 /// @param ScheduleRange A range of a map, which describes a prefix schedule 351 /// relation. 352 isl::set getPartialTilePrefixes(isl::set ScheduleRange, int VectorWidth); 353 #endif // POLLY_SCHEDULEOPTIMIZER_H 354