//===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the itinerary class data for the POWER7 processor.
//
//===----------------------------------------------------------------------===//

// Primary reference:
// IBM POWER7 multicore server processor
// B. Sinharoy, et al.
// IBM J. Res. & Dev. (55) 3. May/June 2011.

// Scheduling for the P7 involves tracking two types of resources:
//  1. The dispatch bundle slots
//  2. The functional unit resources

// Dispatch units (one FuncUnit per dispatch bundle slot):
def P7_DU1    : FuncUnit;
def P7_DU2    : FuncUnit;
def P7_DU3    : FuncUnit;
def P7_DU4    : FuncUnit;
def P7_DU5    : FuncUnit;
def P7_DU6    : FuncUnit;

// Functional unit resources:
def P7_LS1    : FuncUnit; // Load/Store pipeline 1
def P7_LS2    : FuncUnit; // Load/Store pipeline 2

def P7_FX1    : FuncUnit; // FX pipeline 1
def P7_FX2    : FuncUnit; // FX pipeline 2

// VS pipeline 1 (vector integer ops. always here)
def P7_VS1    : FuncUnit; // VS pipeline 1
// VS pipeline 2 (128-bit stores and perms. here)
def P7_VS2    : FuncUnit; // VS pipeline 2

def P7_CRU    : FuncUnit; // CR unit (CR logicals and move-from-SPRs)
def P7_BRU    : FuncUnit; // BR unit

// Notes:
// Each LSU pipeline can also execute FX add and logical instructions.
// Each LSU pipeline can complete a load or store in one cycle.
//
// Each store is broken into two parts, AGEN goes to the LSU while a
// "data steering" op. goes to the FXU or VSU.
//
// FX loads have a two cycle load-to-use latency (so one "bubble" cycle).
// VSU loads have a three cycle load-to-use latency (so two "bubble" cycles).
//
// Frequent FX ops. take only one cycle and results can be used again in the
// next cycle (there is a self-bypass). Getting results from the other FX
// pipeline takes an additional cycle.
//
// The VSU XS is similar to the POWER6, but with a pipeline length of 2 cycles
// (instead of 3 cycles on the POWER6). VSU XS handles vector FX-style ops.
// Dispatch of an instruction to VS1 that uses four single prec. inputs
// (either to a float or XC op.) prevents dispatch in that cycle to VS2 of any
// floating point instruction.
//
// The VSU PM is similar to the POWER6, but with a pipeline length of 3 cycles
// (instead of 4 cycles on the POWER6). vsel is handled by the PM pipeline
// (unlike on the POWER6).
//
// FMA from the VSUs can forward results in 6 cycles. VS1 XS and vector FP
// share the same write-back, and have a 5-cycle latency difference, so the
// IFU/IDU will not dispatch an XS instruction 5 cycles after a vector FP
// op. has been dispatched to VS1.
//
// Three cycles after an L1 cache hit, a dependent VSU instruction can issue.
//
// Instruction dispatch groups have (at most) four non-branch instructions, and
// two branches. Unlike on the POWER4/5, a branch does not automatically
// end the dispatch group, but a second branch must be the last in the group.
// ===---------------------------------------------------------------------===//
// P7 itinerary table.
//
// Each InstrItinData entry gives, for one itinerary class:
//   - the list of InstrStage<cycles, [units], timeinc> pipeline stages, and
//   - the operand cycle list (first value is for the result, the rest for
//     the source operands).
// A trailing 0 on an InstrStage is its TimeInc argument (see
// TargetItinerary.td): the next stage starts in the same cycle, which is how
// a dispatch slot and a functional unit are reserved together.

def P7Itineraries : ProcessorItineraries<
  [P7_DU1, P7_DU2, P7_DU3, P7_DU4, P7_DU5, P7_DU6,
   P7_LS1, P7_LS2, P7_FX1, P7_FX2, P7_VS1, P7_VS2, P7_CRU, P7_BRU], [], [

  // ---- Integer classes -------------------------------------------------
  InstrItinData<IIC_IntSimple,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2, P7_LS1, P7_LS2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_IntGeneral,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_IntISEL,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2], 0>,
                 InstrStage<1, [P7_BRU]>],
                [1, 1, 1, 1]>,
  InstrItinData<IIC_IntCompare,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  // FIXME: Add record-form itinerary data.
  InstrItinData<IIC_IntDivW,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<36, [P7_FX1, P7_FX2]>],
                [36, 1, 1]>,
  InstrItinData<IIC_IntDivD,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<68, [P7_FX1, P7_FX2]>],
                [68, 1, 1]>,
  InstrItinData<IIC_IntMulHW,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [4, 1, 1]>,
  InstrItinData<IIC_IntMulHWU,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [4, 1, 1]>,
  InstrItinData<IIC_IntMulLI,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [4, 1, 1]>,
  InstrItinData<IIC_IntRotate,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_IntRotateD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_IntShift,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_IntTrapW,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1]>,
  InstrItinData<IIC_IntTrapD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1]>,

  // ---- Branch / CR classes ---------------------------------------------
  InstrItinData<IIC_BrB,
                [InstrStage<1, [P7_DU5, P7_DU6], 0>,
                 InstrStage<1, [P7_BRU]>],
                [3, 1, 1]>,
  InstrItinData<IIC_BrCR,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_CRU]>],
                [3, 1, 1]>,
  InstrItinData<IIC_BrMCR,
                [InstrStage<1, [P7_DU5, P7_DU6], 0>,
                 InstrStage<1, [P7_BRU]>],
                [3, 1, 1]>,
  InstrItinData<IIC_BrMCRX,
                [InstrStage<1, [P7_DU5, P7_DU6], 0>,
                 InstrStage<1, [P7_BRU]>],
                [3, 1, 1]>,

  // ---- Load classes ----------------------------------------------------
  InstrItinData<IIC_LdStLoad,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [2, 1, 1]>,
  InstrItinData<IIC_LdStLoadUpd,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [2, 2, 1, 1]>,
  InstrItinData<IIC_LdStLoadUpdX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 3, 1, 1]>,
  InstrItinData<IIC_LdStLD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [2, 1, 1]>,
  InstrItinData<IIC_LdStLDU,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [2, 2, 1, 1]>,
  InstrItinData<IIC_LdStLDUX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 3, 1, 1]>,
  InstrItinData<IIC_LdStLFD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [3, 1, 1]>,
  InstrItinData<IIC_LdStLVecX,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [3, 1, 1]>,
  InstrItinData<IIC_LdStLFDU,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 3, 1, 1]>,
  InstrItinData<IIC_LdStLFDUX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 3, 1, 1]>,
  InstrItinData<IIC_LdStLHA,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 1, 1]>,
  InstrItinData<IIC_LdStLHAU,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [4, 4, 1, 1]>,
  InstrItinData<IIC_LdStLHAUX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [4, 4, 1, 1]>,
  InstrItinData<IIC_LdStLWA,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 1, 1]>,
  InstrItinData<IIC_LdStLWARX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [3, 1, 1]>,
  InstrItinData<IIC_LdStLDARX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [3, 1, 1]>,
  InstrItinData<IIC_LdStLMW,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [2, 1, 1]>,

  // ---- Store classes ---------------------------------------------------
  InstrItinData<IIC_LdStStore,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_LdStSTD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_LdStSTDU,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [2, 1, 1, 1]>,
  InstrItinData<IIC_LdStSTDUX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [2, 1, 1, 1]>,
  InstrItinData<IIC_LdStSTFD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_LdStSTFDU,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_FX1, P7_FX2], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [2, 1, 1, 1]>,
  InstrItinData<IIC_LdStSTVEBX,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2], 0>,
                 InstrStage<1, [P7_VS2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_LdStSTDCX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [1, 1, 1]>,
  InstrItinData<IIC_LdStSTWCX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_LS1, P7_LS2]>],
                [1, 1, 1]>,

  // ---- CR / SPR move classes -------------------------------------------
  // NOTE(review): IIC_BrMCRX already has an entry earlier in this table
  // (DU5/DU6 + BRU, operand cycles [3, 1, 1]). The two entries disagree;
  // presumably only one of them takes effect -- confirm which one TableGen
  // keeps. The trailing "mtcr" comment suggests this entry is the one meant
  // for moves to the CR.
  InstrItinData<IIC_BrMCRX,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_DU2], 0>,
                 InstrStage<1, [P7_DU3], 0>,
                 InstrStage<1, [P7_DU4], 0>,
                 InstrStage<1, [P7_CRU]>,
                 InstrStage<1, [P7_FX1, P7_FX2]>],
                [3, 1]>, // mtcr
  InstrItinData<IIC_SprMFCR,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_CRU]>],
                [6, 1]>,
  InstrItinData<IIC_SprMFCRF,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_CRU]>],
                [3, 1]>,
  InstrItinData<IIC_SprMTSPR,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_FX1]>],
                [4, 1]>, // mtctr

  // ---- Floating-point classes ------------------------------------------
  InstrItinData<IIC_FPGeneral,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [5, 1, 1]>,
  InstrItinData<IIC_FPCompare,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [8, 1, 1]>,
  InstrItinData<IIC_FPDivD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [33, 1, 1]>,
  InstrItinData<IIC_FPDivS,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [27, 1, 1]>,
  InstrItinData<IIC_FPSqrtD,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [44, 1, 1]>,
  InstrItinData<IIC_FPSqrtS,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [32, 1, 1]>,
  InstrItinData<IIC_FPFused,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [5, 1, 1, 1]>,
  InstrItinData<IIC_FPRes,
                [InstrStage<1, [P7_DU1, P7_DU2, P7_DU3, P7_DU4], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [5, 1, 1]>,

  // ---- Vector classes --------------------------------------------------
  InstrItinData<IIC_VecGeneral,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1]>],
                [2, 1, 1]>,
  InstrItinData<IIC_VecVSL,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1]>],
                [2, 1, 1]>,
  InstrItinData<IIC_VecVSR,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1]>],
                [2, 1, 1]>,
  InstrItinData<IIC_VecFP,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [6, 1, 1]>,
  InstrItinData<IIC_VecFPCompare,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [6, 1, 1]>,
  InstrItinData<IIC_VecFPRound,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1, P7_VS2]>],
                [6, 1, 1]>,
  InstrItinData<IIC_VecComplex,
                [InstrStage<1, [P7_DU1], 0>,
                 InstrStage<1, [P7_VS1]>],
                [7, 1, 1]>,
  InstrItinData<IIC_VecPerm,
                [InstrStage<1, [P7_DU1, P7_DU2], 0>,
                 InstrStage<1, [P7_VS2]>],
                [3, 1, 1]>
]>;

// ===---------------------------------------------------------------------===//
// P7 machine model for scheduling and other instruction cost heuristics.

def P7Model : SchedMachineModel {
  let IssueWidth = 6;  // 4 (non-branch) instructions are dispatched per cycle.
                       // Note that the dispatch bundle size is 6 (including
                       // branches), but the total internal issue bandwidth per
                       // cycle (from all queues) is 8.

  let MinLatency = 0;  // Out-of-order dispatch.
  let LoadLatency = 3; // Optimistic load latency assuming bypass.
                       // This is overridden by OperandCycles if the
                       // Itineraries are queried instead.
  let MispredictPenalty = 16;

  // Try to make sure we have at least 10 dispatch groups in a loop.
  let LoopMicroOpBufferSize = 40;

  // Per-class stage and operand-cycle data comes from the table above.
  let Itineraries = P7Itineraries;
}