//===-- PPCScheduleP9.td - PPC P9 Scheduling Definitions ---*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the scheduling machine model for the POWER9 processor:
// the SchedMachineModel itself, the processor resources, and the
// SchedWriteRes / WriteSequence definitions built on top of them.
//
//===----------------------------------------------------------------------===//
include "PPCInstrInfo.td"

def P9Model : SchedMachineModel {
  // The maximum number of instructions to be issued at the same time.
  // A value of 8 is technically correct since 8 instructions can be fetched
  // from the instruction cache. Note, however, that only 6 instructions may
  // actually be dispatched at a time.
  let IssueWidth = 8;

  // Load latency is 4 or 5 cycles depending on the load. This latency assumes
  // that we have a cache hit. For a cache miss the load latency will be more.
  // There are two instructions (lxvl, lxvll) that have a latency of 6 cycles.
  // However it is not worth bumping this value up to 6 when the vast majority
  // of instructions are 4 or 5 cycles.
  let LoadLatency = 5;

  // A total of 16 cycles to recover from a branch mispredict.
  let MispredictPenalty = 16;

  // Try to make sure we have at least 10 dispatch groups in a loop.
  // A dispatch group is 6 instructions.
  let LoopMicroOpBufferSize = 60;

  let CompleteModel = 1;

  // Do not support QPX (Quad Processing eXtension) or SPE (Signal Processing
  // Engine) on Power 9.
  let UnsupportedFeatures = [HasQPX, HasSPE];

}

let SchedModel = P9Model in {

  // ***************** Processor Resources *****************

  // Dispatcher:
  def DISPATCHER : ProcResource<12>;

  // Issue Ports
  // An instruction can go down one of two issue queues.
  // Address Generation (AGEN) mainly for loads and stores.
  // Execution (EXEC) for most other instructions.
  // Some instructions cannot be run on just any issue queue and may require an
  // Even or an Odd queue. The EXECE represents the even queues and the EXECO
  // represents the odd queues.
  def IP_AGEN : ProcResource<4>;
  def IP_EXEC : ProcResource<4>;
  def IP_EXECE : ProcResource<2> {
    // Even Exec Ports
    let Super = IP_EXEC;
  }
  def IP_EXECO : ProcResource<2> {
    // Odd Exec Ports
    let Super = IP_EXEC;
  }

  // Pipeline Groups
  // Four ALU (Fixed Point Arithmetic) units in total. Two even, two odd.
  def ALU : ProcResource<4>;
  def ALUE : ProcResource<2> {
    // Even ALU pipelines
    let Super = ALU;
  }
  def ALUO : ProcResource<2> {
    // Odd ALU pipelines
    let Super = ALU;
  }

  // Two DIV (Fixed Point Divide) units.
  def DIV : ProcResource<2>;

  // Four DP (Floating Point) units in total. Two even, two odd.
  def DP : ProcResource<4>;
  def DPE : ProcResource<2> {
    // Even DP pipelines
    let Super = DP;
  }
  def DPO : ProcResource<2> {
    // Odd DP pipelines
    let Super = DP;
  }

  // Four LS (Load or Store) units.
  def LS : ProcResource<4>;

  // Two PM (Permute) units.
  def PM : ProcResource<2>;

  // Only one DFU (Decimal Floating Point and Quad Precision) unit.
  def DFU : ProcResource<1>;

  // Only one Branch unit.
  def BR : ProcResource<1> {
    let BufferSize = 16;
  }

  // Only one CY (Crypto) unit.
  def CY : ProcResource<1>;

  // ***************** SchedWriteRes Definitions *****************
  //
  // Naming convention used below: <UNIT>_<N>C[_<M>], where N is the Latency
  // in cycles and the optional M is the number of ResourceCycles for which
  // the unit stays busy. Keep the name and the values in sync.

  // Dispatcher
  // Dispatch and issue-port writes only reserve a resource; they contribute
  // no micro-ops of their own (NumMicroOps = 0).
  def DISP_1C : SchedWriteRes<[DISPATCHER]> {
    let NumMicroOps = 0;
    let Latency = 1;
  }

  // Issue Ports
  def IP_AGEN_1C : SchedWriteRes<[IP_AGEN]> {
    let NumMicroOps = 0;
    let Latency = 1;
  }

  def IP_EXEC_1C : SchedWriteRes<[IP_EXEC]> {
    let NumMicroOps = 0;
    let Latency = 1;
  }

  def IP_EXECE_1C : SchedWriteRes<[IP_EXECE]> {
    let NumMicroOps = 0;
    let Latency = 1;
  }

  def IP_EXECO_1C : SchedWriteRes<[IP_EXECO]> {
    let NumMicroOps = 0;
    let Latency = 1;
  }

  // Pipeline Groups

  // ALU Units
  // An ALU may take either 2 or 3 cycles to complete the operation.
  // However, the ALU unit is only ever busy for 1 cycle at a time and may
  // receive new instructions each cycle.
  def P9_ALU_2C : SchedWriteRes<[ALU]> {
    let Latency = 2;
  }

  def P9_ALUE_2C : SchedWriteRes<[ALUE]> {
    let Latency = 2;
  }

  def P9_ALUO_2C : SchedWriteRes<[ALUO]> {
    let Latency = 2;
  }

  def P9_ALU_3C : SchedWriteRes<[ALU]> {
    let Latency = 3;
  }

  def P9_ALUE_3C : SchedWriteRes<[ALUE]> {
    let Latency = 3;
  }

  def P9_ALUO_3C : SchedWriteRes<[ALUO]> {
    let Latency = 3;
  }

  // DIV Unit
  // A DIV unit may take from 5 to 40 cycles to complete.
  // Some DIV operations may keep the unit busy for up to 8 cycles.
  def P9_DIV_5C : SchedWriteRes<[DIV]> {
    let Latency = 5;
  }

  def P9_DIV_12C : SchedWriteRes<[DIV]> {
    let Latency = 12;
  }

  def P9_DIV_16C_8 : SchedWriteRes<[DIV]> {
    let ResourceCycles = [8];
    let Latency = 16;
  }

  def P9_DIV_24C_8 : SchedWriteRes<[DIV]> {
    let ResourceCycles = [8];
    let Latency = 24;
  }

  def P9_DIV_40C_8 : SchedWriteRes<[DIV]> {
    let ResourceCycles = [8];
    let Latency = 40;
  }

  // DP Unit
  // A DP unit may take from 2 to 36 cycles to complete.
  // Some DP operations keep the unit busy for up to 10 cycles.
  // DPE / DPO variants are restricted to the even / odd DP pipelines.
  def P9_DP_2C : SchedWriteRes<[DP]> {
    let Latency = 2;
  }

  def P9_DP_5C : SchedWriteRes<[DP]> {
    let Latency = 5;
  }

  def P9_DP_7C : SchedWriteRes<[DP]> {
    let Latency = 7;
  }

  def P9_DPE_7C : SchedWriteRes<[DPE]> {
    let Latency = 7;
  }

  def P9_DPO_7C : SchedWriteRes<[DPO]> {
    let Latency = 7;
  }

  def P9_DP_22C_5 : SchedWriteRes<[DP]> {
    let ResourceCycles = [5];
    let Latency = 22;
  }

  def P9_DP_24C_8 : SchedWriteRes<[DP]> {
    let ResourceCycles = [8];
    let Latency = 24;
  }

  def P9_DPO_24C_8 : SchedWriteRes<[DPO]> {
    let ResourceCycles = [8];
    let Latency = 24;
  }

  def P9_DPE_24C_8 : SchedWriteRes<[DPE]> {
    let ResourceCycles = [8];
    let Latency = 24;
  }

  // Latency was previously 22, contradicting both the name and
  // P9_DPOpAndALU2Op_29C_5 below (26 + 3 = 29).
  def P9_DP_26C_5 : SchedWriteRes<[DP]> {
    let ResourceCycles = [5];
    let Latency = 26;
  }

  def P9_DP_27C_7 : SchedWriteRes<[DP]> {
    let ResourceCycles = [7];
    let Latency = 27;
  }

  // The 27C_10 and 36C_10 even/odd writes were previously bound to the
  // generic DP group; they now use DPE / DPO like the other even/odd writes.
  // NOTE(review): inferred from the naming pattern - confirm against the
  // POWER9 pipeline documentation.
  def P9_DPE_27C_10 : SchedWriteRes<[DPE]> {
    let ResourceCycles = [10];
    let Latency = 27;
  }

  def P9_DPO_27C_10 : SchedWriteRes<[DPO]> {
    let ResourceCycles = [10];
    let Latency = 27;
  }

  def P9_DP_33C_8 : SchedWriteRes<[DP]> {
    let ResourceCycles = [8];
    let Latency = 33;
  }

  def P9_DPE_33C_8 : SchedWriteRes<[DPE]> {
    let ResourceCycles = [8];
    let Latency = 33;
  }

  def P9_DPO_33C_8 : SchedWriteRes<[DPO]> {
    let ResourceCycles = [8];
    let Latency = 33;
  }

  def P9_DP_36C_10 : SchedWriteRes<[DP]> {
    let ResourceCycles = [10];
    let Latency = 36;
  }

  def P9_DPE_36C_10 : SchedWriteRes<[DPE]> {
    let ResourceCycles = [10];
    let Latency = 36;
  }

  def P9_DPO_36C_10 : SchedWriteRes<[DPO]> {
    let ResourceCycles = [10];
    let Latency = 36;
  }

  // PM Unit
  // Three cycle permute operations.
  def P9_PM_3C : SchedWriteRes<[PM]> {
    let Latency = 3;
  }

  // Load and Store Units
  // Loads can have 4, 5 or 6 cycles of latency.
  // Stores are listed as having a single cycle of latency. This is not
  // completely accurate since it takes more than 1 cycle to actually store
  // the value. However, since the store does not produce a result it can be
  // considered complete after one cycle.
  def P9_LS_1C : SchedWriteRes<[LS]> {
    let Latency = 1;
  }

  def P9_LS_4C : SchedWriteRes<[LS]> {
    let Latency = 4;
  }

  def P9_LS_5C : SchedWriteRes<[LS]> {
    let Latency = 5;
  }

  def P9_LS_6C : SchedWriteRes<[LS]> {
    let Latency = 6;
  }

  // DFU Unit
  // Some of the most expensive ops use the DFU.
  // Can take from 12 cycles to 76 cycles to obtain a result.
  // The unit may be busy for up to 62 cycles.
  // Unlike the writes above, the busy time is not encoded in these names.
  def P9_DFU_12C : SchedWriteRes<[DFU]> {
    let Latency = 12;
  }

  def P9_DFU_23C : SchedWriteRes<[DFU]> {
    let Latency = 23;
    let ResourceCycles = [11];
  }

  def P9_DFU_24C : SchedWriteRes<[DFU]> {
    let Latency = 24;
    let ResourceCycles = [12];
  }

  def P9_DFU_37C : SchedWriteRes<[DFU]> {
    let Latency = 37;
    let ResourceCycles = [25];
  }

  def P9_DFU_58C : SchedWriteRes<[DFU]> {
    let Latency = 58;
    let ResourceCycles = [44];
  }

  def P9_DFU_76C : SchedWriteRes<[DFU]> {
    let Latency = 76;
    let ResourceCycles = [62];
  }

  // 2 or 5 cycle latencies for the branch unit.
  def P9_BR_2C : SchedWriteRes<[BR]> {
    let Latency = 2;
  }

  def P9_BR_5C : SchedWriteRes<[BR]> {
    let Latency = 5;
  }

  // 6 cycle latency for the crypto unit.
  def P9_CY_6C : SchedWriteRes<[CY]> {
    let Latency = 6;
  }

  // ***************** WriteSeq Definitions *****************

  // These are combinations of the resources listed above.
  // The idea is that some cracked instructions cannot be done in parallel and
  // so the latencies for their resources must be added. The <N>C in each name
  // is the sum of the latencies of the writes in the sequence.
  def P9_LoadAndALUOp_6C : WriteSequence<[P9_LS_4C, P9_ALU_2C]>;
  def P9_LoadAndALUOp_7C : WriteSequence<[P9_LS_5C, P9_ALU_2C]>;
  def P9_LoadAndALU2Op_7C : WriteSequence<[P9_LS_4C, P9_ALU_3C]>;
  def P9_LoadAndALU2Op_8C : WriteSequence<[P9_LS_5C, P9_ALU_3C]>;
  def P9_LoadAndPMOp_8C : WriteSequence<[P9_LS_5C, P9_PM_3C]>;
  def P9_LoadAndLoadOp_8C : WriteSequence<[P9_LS_4C, P9_LS_4C]>;
  def P9_IntDivAndALUOp_18C_8 : WriteSequence<[P9_DIV_16C_8, P9_ALU_2C]>;
  def P9_IntDivAndALUOp_26C_8 : WriteSequence<[P9_DIV_24C_8, P9_ALU_2C]>;
  def P9_IntDivAndALUOp_42C_8 : WriteSequence<[P9_DIV_40C_8, P9_ALU_2C]>;
  def P9_StoreAndALUOp_3C : WriteSequence<[P9_LS_1C, P9_ALU_2C]>;
  def P9_StoreAndALUOp_4C : WriteSequence<[P9_LS_1C, P9_ALU_3C]>;
  def P9_ALUOpAndALUOp_4C : WriteSequence<[P9_ALU_2C, P9_ALU_2C]>;
  def P9_ALU2OpAndALU2Op_6C : WriteSequence<[P9_ALU_3C, P9_ALU_3C]>;
  def P9_ALUOpAndALUOpAndALUOp_6C :
    WriteSequence<[P9_ALU_2C, P9_ALU_2C, P9_ALU_2C]>;
  def P9_DPOpAndALUOp_7C : WriteSequence<[P9_DP_5C, P9_ALU_2C]>;
  def P9_DPOpAndALUOp_9C : WriteSequence<[P9_DP_7C, P9_ALU_2C]>;
  def P9_DPOpAndALU2Op_10C : WriteSequence<[P9_DP_7C, P9_ALU_3C]>;
  def P9_DPOpAndALUOp_24C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_2C]>;
  def P9_DPOpAndALUOp_35C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_2C]>;
  def P9_DPOpAndALU2Op_25C_5 : WriteSequence<[P9_DP_22C_5, P9_ALU_3C]>;
  def P9_DPOpAndALU2Op_29C_5 : WriteSequence<[P9_DP_26C_5, P9_ALU_3C]>;
  def P9_DPOpAndALU2Op_36C_8 : WriteSequence<[P9_DP_33C_8, P9_ALU_3C]>;
  def P9_DPOpAndALU2Op_39C_10 : WriteSequence<[P9_DP_36C_10, P9_ALU_3C]>;
  def P9_BROpAndALUOp_7C : WriteSequence<[P9_BR_5C, P9_ALU_2C]>;

  // Include the resource requirements of individual instructions.
  include "P9InstrResources.td"

}