//=- AArch64SchedCyclone.td - Cyclone Scheduling Definitions -*- tablegen -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for AArch64 Cyclone to support
// instruction scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//

// Top-level machine model record for Cyclone. Every resource and latency
// definition in this file is attached to this model.
def CycloneModel : SchedMachineModel {
  // Up to 6 micro-ops are dispatched per cycle.
  let IssueWidth = 6;
  // Sized from the reorder buffer.
  let MicroOpBufferSize = 192;
  // Optimistic load latency.
  let LoadLatency = 4;
  // Typical penalties are in the 14-19 cycle range.
  let MispredictPenalty = 16;
  let CompleteModel = 1;

  list<Predicate> UnsupportedFeatures = [HasSVE];
}

//===----------------------------------------------------------------------===//
// Processor resources: one record per kind of unit, parameterized by the
// number of such units available on Cyclone.

// Integer pipes: 4 in total.
def CyUnitI : ProcResource<4> { let BufferSize = 48; }

// Branch units: 2, on integer pipes I[0..1].
def CyUnitB : ProcResource<2> {
  let Super = CyUnitI;
  let BufferSize = 24;
}

// Indirect-branch unit: 1, on integer pipe I[0].
def CyUnitBR : ProcResource<1> { let Super = CyUnitB; }

// Shifter pipes: 2, on integer pipes I[2..3].
// An instruction that consumes a CyUnitIS also consumes a CyUnitI.
def CyUnitIS : ProcResource<2> {
  let Super = CyUnitI;
  let BufferSize = 24;
}

// Multiply pipe: 1, on integer pipe I[0].
def CyUnitIM : ProcResource<1> {
  let Super = CyUnitBR;
  let BufferSize = 32;
}

// Divide pipe: 1, on integer pipe I[1].
def CyUnitID : ProcResource<1> {
  let Super = CyUnitB;
  let BufferSize = 16;
}

// 1 integer division unit. This is driven by the ID pipe, but only
// consumes the pipe for one cycle at issue and another cycle at writeback.
def CyUnitIntDiv : ProcResource<1>;

// Load/store pipes: 2.
def CyUnitLS : ProcResource<2> { let BufferSize = 28; }

// FP/vector pipes: 3.
def CyUnitV : ProcResource<3> { let BufferSize = 48; }

// FP/vector arithmetic and multiply pipes: 2, on vector pipes V[0-1].
def CyUnitVM : ProcResource<2> {
  let Super = CyUnitV;
  let BufferSize = 32;
}

// FP/vector division and square-root pipe: 1, on vector pipe V[2].
def CyUnitVD : ProcResource<1> {
  let Super = CyUnitV;
  let BufferSize = 16;
}

// FP compare pipe: 1, on vector pipe V[0].
def CyUnitVC : ProcResource<1> {
  let Super = CyUnitVM;
  let BufferSize = 16;
}

// FP division/square-root units: 2. They are driven by the VD pipe, but
// occupy that pipe only for one cycle at issue and one cycle at writeback.
def CyUnitFloatDiv : ProcResource<2>;

//===----------------------------------------------------------------------===//
// Scheduler read/write resources and latencies for Cyclone.
// The layout mirrors sections 7.7-7.9 of the Tuning Guide v1.0.1.

let SchedModel = CycloneModel in {

//---
// 7.8.1. Moves
//---

// A single nop micro-op (uX): no resources, zero latency.
def WriteX : SchedWriteRes<[]> {
  let Latency = 0;
}

// A move of zero is a register rename (to machine register zero), so the
// move is replaced by a single nop micro-op.
//   MOVZ Rd, #0
//   AND Rd, Rzr, #imm
def WriteZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
def WriteImmZ : SchedWriteVariant<[
  SchedVar<WriteZPred,  [WriteX]>,
  SchedVar<NoSchedPred, [WriteImm]>
]>;
def : InstRW<[WriteImmZ], (instrs MOVZWi,MOVZXi,ANDWri,ANDXri)>;

// Move GPR is a register rename and single nop micro-op.
//   ORR Xd, XZR, Xm
//   ADD Xd, Xn, #0
// NOTE(review): the example above is the immediate form of ADD, while the
// instruction list below names ADDXrr -- confirm the intended opcode.
def WriteIMovPred : SchedPredicate<[{TII->isGPRCopy(*MI)}]>;
def WriteVMovPred : SchedPredicate<[{TII->isFPRCopy(*MI)}]>;
def WriteMov : SchedWriteVariant<[
  SchedVar<WriteIMovPred, [WriteX]>,
  SchedVar<WriteVMovPred, [WriteX]>,
  SchedVar<NoSchedPred,   [WriteI]>
]>;
def : InstRW<[WriteMov], (instrs COPY,ORRXrr,ADDXrr)>;

// A move of a non-zero immediate is an integer ALU op.
//   MOVN,MOVZ,MOVK
def : WriteRes<WriteImm, [CyUnitI]>;

//---
// 7.8.2-7.8.5. Arithmetic and Logical, Comparison, Conditional,
//              Shifts and Bitfield Operations
//---

// Plain integer ALU operations:
//   ADR,ADRP
//   ADD(S)ri,SUB(S)ri,AND(S)ri,EORri,ORRri
//   ADD(S)rr,SUB(S)rr,AND(S)rr,BIC(S)rr,EONrr,EORrr,ORNrr,ORRrr
//   ADC(S),SBC(S)
//   Aliases: CMN, CMP, TST
//
// Conditional operations:
//   CCMNi,CCMPi,CCMNr,CCMPr,
//   CSEL,CSINC,CSINV,CSNEG
//
// Bit counting and reversal operations:
//   CLS,CLZ,RBIT,REV,REV16,REV32
def : WriteRes<WriteI, [CyUnitI]>;

// ADD with a shifted register operand is a single micro-op that occupies a
// shift pipeline for two cycles.
//   ADD(S)rs,SUB(S)rs,AND(S)rs,BIC(S)rs,EONrs,EORrs,ORNrs,ORRrs
//   EXAMPLE: ADDrs Xn, Xm LSL #imm
def : WriteRes<WriteISReg, [CyUnitIS]> {
  let Latency = 2;
  let ResourceCycles = [2];
}

// ADD with an extended register operand behaves like the shifted-register
// form.
//   ADD(S)re,SUB(S)re
//   EXAMPLE: ADDXre Xn, Xm, UXTB #1
def : WriteRes<WriteIEReg, [CyUnitIS]> {
  let Latency = 2;
  let ResourceCycles = [2];
}

// Variable shift and bitfield operations.
//   ASRV,LSLV,LSRV,RORV,BFM,SBFM,UBFM
def : WriteRes<WriteIS, [CyUnitIS]>;

// EXTR shifts a pair of registers and requires two micro-ops.
// The second micro-op is delayed, as modeled by ReadExtrHi.
//   EXTR Xn, Xm, #imm
def : WriteRes<WriteExtr, [CyUnitIS, CyUnitIS]> {
  let Latency = 2;
  let NumMicroOps = 2;
}

// The first register read of EXTR is delayed by one cycle, which effectively
// shortens the latency of whatever wrote that register.
//   EXTR Xn, Xm, #imm
def : ReadAdvance<ReadExtrHi, 1>;

//---
// 7.8.6. Multiplies
//---

// MUL/MNEG are aliases for MADD/MSUB.
//   MADDW,MSUBW,SMADDL,SMSUBL,UMADDL,UMSUBL
def : WriteRes<WriteIM32, [CyUnitIM]> { let Latency = 4; }
//   MADDX,MSUBX,SMULH,UMULH
def : WriteRes<WriteIM64, [CyUnitIM]> { let Latency = 5; }

//---
// 7.8.7. Divide
//---

// A 32-bit divide takes 7-13 cycles; 10 cycles covers a 20-bit quotient.
// The ID pipe is consumed for 2 cycles: issue and writeback.
//   SDIVW,UDIVW
def : WriteRes<WriteID32, [CyUnitID, CyUnitIntDiv]> {
  let Latency = 10;
  let ResourceCycles = [2, 10];
}
// A 64-bit divide takes 7-21 cycles; 13 cycles covers a 32-bit quotient.
// The ID pipe is consumed for 2 cycles: issue and writeback.
//   SDIVX,UDIVX
def : WriteRes<WriteID64, [CyUnitID, CyUnitIntDiv]> {
  let Latency = 13;
  let ResourceCycles = [2, 13];
}

//---
// 7.8.8,7.8.10. Load/Store, single element
//---

// Integer loads take 4 cycles and use one LS unit for one cycle.
def : WriteRes<WriteLD, [CyUnitLS]> { let Latency = 4; }

// Store-load forwarding is 4 cycles.
//
// Note: The store-exclusive sequence incorporates this latency. However,
// general heuristics should not model the dependence between a store and a
// subsequent may-alias load, because hardware speculation works.
def : WriteRes<WriteST, [CyUnitLS]> { let Latency = 4; }

// Load from base address plus an optionally scaled register offset.
// Rt latency is latency WriteIS + WriteLD.
//   EXAMPLE: LDR Xn, Xm [, lsl 3]
def CyWriteLDIdx : SchedWriteVariant<[
  // Load from scaled register.
  SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>,
  // Load from register offset.
  SchedVar<NoSchedPred,   [WriteLD]>
]>;
// Map AArch64->Cyclone type.
def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;

//   EXAMPLE: STR Xn, Xm [, lsl 3]
def CyWriteSTIdx : SchedWriteVariant<[
  // Store to scaled register.
  SchedVar<ScaledIdxPred, [WriteIS, WriteST]>,
  // Store to register offset.
  SchedVar<NoSchedPred,   [WriteST]>
]>;
// Map AArch64->Cyclone type.
def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;

// The (unshifted) base register Xn is read in the second micro-op, one cycle
// later.
//   EXAMPLE: LDR Xn, Xm [, lsl 3]
def ReadBaseRS : SchedReadAdvance<1>;
def CyReadAdrBase : SchedReadVariant<[
  // Read base reg after shifting offset.
  SchedVar<ScaledIdxPred, [ReadBaseRS]>,
  // Read base reg with no shift.
  SchedVar<NoSchedPred,   [ReadDefault]>
]>;
// Map AArch64->Cyclone type.
def : SchedAlias<ReadAdrBase, CyReadAdrBase>;

//---
// 7.8.9,7.8.11. Load/Store, paired
//---

// Address pre/post increment is a simple ALU op with one cycle latency.
def : WriteRes<WriteAdr, [CyUnitI]>;

// The LDP high register write is fused with the load, but a nop micro-op
// remains.
def : WriteRes<WriteLDHi, []> { let Latency = 4; }

// STP is a vector op and store, except for QQ, which is just two stores.
def : SchedAlias<WriteSTP, WriteVSTShuffle>;
def : InstRW<[WriteST, WriteST], (instrs STPQi)>;

//---
// 7.8.13. Branches
//---

// Branches take a single micro-op.
// The misprediction penalty is defined as a SchedMachineModel property.
def : WriteRes<WriteBr, [CyUnitB]> {
  let Latency = 0;
}
def : WriteRes<WriteBrReg, [CyUnitBR]> {
  let Latency = 0;
}

//---
// 7.8.14. Never-issued Instructions, Barrier and Hint Operations
//---

// NOP,SEV,SEVL,WFE,WFI,YIELD
def : WriteRes<WriteHint, []> {
  let Latency = 0;
}
// ISB
def : InstRW<[WriteI], (instrs ISB)>;
// CLREX,DMB,DSB
def : WriteRes<WriteBarrier, [CyUnitLS]>;

// System instructions get an invalid latency because the latency of
// other operations across them is meaningless.
def : WriteRes<WriteSys, []> {
  let Latency = -1;
}

//===----------------------------------------------------------------------===//
// 7.9 Vector Unit Instructions

// Simple vector operations take 2 cycles.
def : WriteRes<WriteV, [CyUnitV]> {
  let Latency = 2;
}

// Longer-latency vector write types for Cyclone (3 to 6 cycles).
def CyWriteV3 : SchedWriteRes<[CyUnitV]> { let Latency = 3; }
def CyWriteV4 : SchedWriteRes<[CyUnitV]> { let Latency = 4; }
def CyWriteV5 : SchedWriteRes<[CyUnitV]> { let Latency = 5; }
def CyWriteV6 : SchedWriteRes<[CyUnitV]> { let Latency = 6; }

// Simple floating-point operations take 2 cycles.
def : WriteRes<WriteF, [CyUnitV]> {
  let Latency = 2;
}

//---
// 7.9.1 Vector Moves
//---

// TODO: Add Cyclone-specific zero-cycle zeros. LLVM currently
// generates expensive int-float conversion instead:
//   FMOVDi Dd, #0.0
//   FMOVv2f64ns Vd.2d, #0.0

// FMOVSi,FMOVDi
def : WriteRes<WriteFImm, [CyUnitV]> {
  let Latency = 2;
}

// MOVI,MVNI are WriteV
// FMOVv2f32ns,FMOVv2f64ns,FMOVv4f32ns are WriteV

// Move FPR is a register rename and single nop micro-op.
//   ORR.16b Vd,Vn,Vn
// COPY is handled above in the WriteMov Variant.
def WriteVMov : SchedWriteVariant<[
  SchedVar<WriteVMovPred, [WriteX]>,
  SchedVar<NoSchedPred,   [WriteV]>
]>;
def : InstRW<[WriteVMov], (instrs ORRv16i8)>;

// FMOVSr,FMOVDr are WriteF.

// MOV V,V is a WriteV.

// CPY D,V[x] is a WriteV

// INS V[x],V[y] is a WriteV.

// FMOVWSr,FMOVXDr,FMOVXDHighr
def : WriteRes<WriteFCopy, [CyUnitLS]> { let Latency = 5; }

// FMOVSWr,FMOVDXr,FMOVDXHighr
def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;

// INS V[x],R
// NOTE(review): the "INSv" and "DUPv" patterns below look broad enough to
// also cover the lane forms that nearby comments describe as plain WriteV --
// confirm this is intended.
def CyWriteCopyToFPR : WriteSequence<[WriteVLD, WriteV]>;
def : InstRW<[CyWriteCopyToFPR], (instregex "INSv")>;

// SMOV,UMOV R,V[x]
def CyWriteCopyToGPR : WriteSequence<[WriteLD, WriteI]>;
def : InstRW<[CyWriteCopyToGPR], (instregex "SMOVv","UMOVv")>;

// DUP V,R
def : InstRW<[CyWriteCopyToFPR], (instregex "DUPv")>;

// DUP V,V[x] is a WriteV.

//---
// 7.9.2 Integer Arithmetic, Logical, and Comparisons
//---

// BIC,ORR V,#imm are WriteV

def : InstRW<[CyWriteV3], (instregex "ABSv")>;

// MVN,NEG,NOT are WriteV

def : InstRW<[CyWriteV3], (instregex "SQABSv","SQNEGv")>;

// ADDP is a WriteV.
def CyWriteVADDLP : SchedWriteRes<[CyUnitV]> {
  let Latency = 2;
}
def : InstRW<[CyWriteVADDLP], (instregex "SADDLPv","UADDLPv")>;

def : InstRW<[CyWriteV3], (instregex "ADDVv",
                                     "SMAXVv","UMAXVv",
                                     "SMINVv","UMINVv")>;

def : InstRW<[CyWriteV3], (instregex "SADDLV","UADDLV")>;

// ADD,SUB are WriteV

// Declared ahead of its InstRW mapping so that read-advance definitions can
// name it.
def CyWriteVABD : SchedWriteRes<[CyUnitV]> {
  let Latency = 3;
}

// Add/Diff and accumulate uses the vector multiply unit.
def CyWriteVAccum : SchedWriteRes<[CyUnitVM]> {
  let Latency = 3;
}
// Accumulator read: advanced by one cycle relative to the listed writers.
def CyReadVAccum : SchedReadAdvance<1, [CyWriteVAccum,
                                        CyWriteVADDLP,
                                        CyWriteVABD]>;

def : InstRW<[CyWriteVAccum, CyReadVAccum], (instregex "SADALP","UADALP")>;

def : InstRW<[CyWriteVAccum, CyReadVAccum], (instregex "SABAv","UABAv",
                                                       "SABALv","UABALv")>;

def : InstRW<[CyWriteV3], (instregex "SQADDv","SQSUBv",
                                     "UQADDv","UQSUBv")>;

def : InstRW<[CyWriteV3], (instregex "SUQADDv","USQADDv")>;

def : InstRW<[CyWriteV4], (instregex "ADDHNv","RADDHNv",
                                     "RSUBHNv", "SUBHNv")>;

// WriteV includes:
//   AND,BIC,CMTST,EOR,ORN,ORR
//   ADDP
//   SHADD,SHSUB,SRHADD,UHADD,UHSUB,URHADD
//   SADDL,SSUBL,UADDL,USUBL
//   SADDW,SSUBW,UADDW,USUBW

def : InstRW<[CyWriteV3],
             (instregex "CMEQv","CMGEv","CMGTv",
                        "CMLEv","CMLTv",
                        "CMHIv","CMHSv")>;

def : InstRW<[CyWriteV3],
             (instregex "SMAXv","SMINv","UMAXv","UMINv",
                        "SMAXPv","SMINPv","UMAXPv","UMINPv")>;

def : InstRW<[CyWriteVABD],
             (instregex "SABDv","UABDv",
                        "SABDLv","UABDLv")>;

//---
// 7.9.3 Floating Point Arithmetic and Comparisons
//---

// FABS,FNEG are WriteF

def : InstRW<[CyWriteV4], (instrs FADDPv2i32p)>;
def : InstRW<[CyWriteV5], (instrs FADDPv2i64p)>;

def : InstRW<[CyWriteV3],
             (instregex "FMAXPv2i","FMAXNMPv2i",
                        "FMINPv2i","FMINNMPv2i")>;

def : InstRW<[CyWriteV4],
             (instregex "FMAXVv","FMAXNMVv","FMINVv","FMINNMVv")>;

// 4-cycle add/sub/abd forms.
def : InstRW<[CyWriteV4],
             (instrs FADDSrr,FADDv2f32,FADDv4f32,
                     FSUBSrr,FSUBv2f32,FSUBv4f32,
                     FADDPv2f32,FADDPv4f32,
                     FABD32,FABDv2f32,FABDv4f32)>;
// 5-cycle add/sub/abd forms.
def : InstRW<[CyWriteV5],
             (instrs FADDDrr,FADDv2f64,
                     FSUBDrr,FSUBv2f64,
                     FADDPv2f64,
                     FABD64,FABDv2f64)>;

def : InstRW<[CyWriteV3], (instregex "FCMEQ","FCMGT","FCMLE","FCMLT")>;

def : InstRW<[CyWriteV3],
             (instregex "FACGE","FACGT",
                        "FMAXS","FMAXD","FMAXv",
                        "FMINS","FMIND","FMINv",
                        "FMAXNMS","FMAXNMD","FMAXNMv",
                        "FMINNMS","FMINNMD","FMINNMv",
                        "FMAXPv2f","FMAXPv4f",
                        "FMINPv2f","FMINPv4f",
                        "FMAXNMPv2f","FMAXNMPv4f",
                        "FMINNMPv2f","FMINNMPv4f")>;

// FCMP,FCMPE,FCCMP,FCCMPE
def : WriteRes<WriteFCmp, [CyUnitVC]> {
  let Latency = 4;
}

// FCSEL is a WriteF.

//---
// 7.9.4 Shifts and Bitfield Operations
//---

// SHL is a WriteV

def CyWriteVSHR : SchedWriteRes<[CyUnitV]> {
  let Latency = 2;
}
def : InstRW<[CyWriteVSHR], (instregex "SSHRv","USHRv")>;

def CyWriteVSRSHR : SchedWriteRes<[CyUnitV]> {
  let Latency = 3;
}
def : InstRW<[CyWriteVSRSHR], (instregex "SRSHRv","URSHRv")>;

// Shift and accumulate uses the vector multiply unit.
def CyWriteVShiftAcc : SchedWriteRes<[CyUnitVM]> {
  let Latency = 3;
}
def CyReadVShiftAcc : SchedReadAdvance<1, [CyWriteVShiftAcc,
                                           CyWriteVSHR,
                                           CyWriteVSRSHR]>;
def : InstRW<[CyWriteVShiftAcc, CyReadVShiftAcc],
             (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;

// SSHL,USHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SRSHLv","URSHLv")>;

// SQSHL,SQSHLU,UQSHL are WriteV.

def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;

// WriteV includes:
//   SHLL,SSHLL,USHLL
//   SLI,SRI
//   BIF,BIT,BSL
//   EXT
//   CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
//   XTN2

def : InstRW<[CyWriteV4], (instregex "RSHRNv","SHRNv",
                                     "SQRSHRNv","SQRSHRUNv",
                                     "SQSHRNv","SQSHRUNv",
                                     "UQRSHRNv","UQSHRNv",
                                     "SQXTNv","SQXTUNv","UQXTNv")>;

//---
// 7.9.5 Multiplication
//---

def CyWriteVMul : SchedWriteRes<[CyUnitVM]> {
  let Latency = 4;
}
def : InstRW<[CyWriteVMul], (instregex "MULv","SMULLv","UMULLv",
                                       "SQDMULLv","SQDMULHv","SQRDMULHv")>;

// FMUL,FMULX,FNMUL default to WriteFMul.
def : WriteRes<WriteFMul, [CyUnitVM]> {
  let Latency = 4;
}

// 64-bit FP multiplies: one cycle longer than the WriteFMul default.
def CyWriteV64Mul : SchedWriteRes<[CyUnitVM]> {
  let Latency = 5;
}
def : InstRW<[CyWriteV64Mul],
             (instrs FMULDrr,FMULv2f64,FMULv2i64_indexed,
                     FNMULDrr,FMULX64,FMULXv2f64,FMULXv2i64_indexed)>;

// Integer multiply-accumulate: the accumulator read is advanced by one cycle
// relative to the listed multiply writers.
def CyReadVMulAcc : SchedReadAdvance<1, [CyWriteVMul, CyWriteV64Mul]>;
def : InstRW<[CyWriteVMul, CyReadVMulAcc],
             (instregex "MLA","MLS",
                        "SMLAL","SMLSL","UMLAL","UMLSL",
                        "SQDMLAL","SQDMLSL")>;

// Fused FP multiply-add: 8 cycles (S) and 10 cycles (D), with the read
// advanced by 4 and 5 cycles respectively for back-to-back accumulation.
def CyWriteSMul : SchedWriteRes<[CyUnitVM]> {
  let Latency = 8;
}
def CyWriteDMul : SchedWriteRes<[CyUnitVM]> {
  let Latency = 10;
}
def CyReadSMul : SchedReadAdvance<4, [CyWriteSMul]>;
def CyReadDMul : SchedReadAdvance<5, [CyWriteDMul]>;

// NOTE(review): FMLAv1i64_indexed appears in the single-precision list and
// the 32-bit FMLS forms are not listed -- confirm this is intended.
def : InstRW<[CyWriteSMul, CyReadSMul],
             (instrs FMADDSrrr,FMSUBSrrr,FNMADDSrrr,FNMSUBSrrr,
                     FMLAv2f32,FMLAv4f32,
                     FMLAv1i32_indexed,FMLAv1i64_indexed,FMLAv2i32_indexed)>;
def : InstRW<[CyWriteDMul, CyReadDMul],
             (instrs FMADDDrrr,FMSUBDrrr,FNMADDDrrr,FNMSUBDrrr,
                     FMLAv2f64,FMLAv2i64_indexed,
                     FMLSv2f64,FMLSv2i64_indexed)>;

// Polynomial multiplies are issued to the VD pipe.
def CyWritePMUL : SchedWriteRes<[CyUnitVD]> {
  let Latency = 3;
}
def : InstRW<[CyWritePMUL], (instregex "PMULv", "PMULLv")>;

//---
// 7.9.6 Divide and Square Root
//---

// FDIV,FSQRT
// TODO: Add 64-bit variant with 19 cycle latency.
// TODO: Specialize FSQRT for longer latency.
def : WriteRes<WriteFDiv, [CyUnitVD, CyUnitFloatDiv]> {
  let Latency = 17;
  let ResourceCycles = [2, 17];
}

def : InstRW<[CyWriteV4], (instregex "FRECPEv","FRECPXv","URECPEv","URSQRTEv")>;

def WriteFRSQRTE : SchedWriteRes<[CyUnitVM]> { let Latency = 4; }
def : InstRW<[WriteFRSQRTE], (instregex "FRSQRTEv")>;

def WriteFRECPS  : SchedWriteRes<[CyUnitVM]> { let Latency = 8; }
def WriteFRSQRTS : SchedWriteRes<[CyUnitVM]> { let Latency = 10; }
def : InstRW<[WriteFRECPS],  (instregex "FRECPSv")>;
def : InstRW<[WriteFRSQRTS], (instregex "FRSQRTSv")>;

//---
// 7.9.7 Integer-FP Conversions
//---

// FCVT lengthen f16/s32
def : InstRW<[WriteV], (instrs FCVTSHr,FCVTDHr,FCVTDSr)>;

// FCVT,FCVTN,FCVTXN
// SCVTF,UCVTF V,V
// FRINT(AIMNPXZ) V,V
def : WriteRes<WriteFCvt, [CyUnitV]> {let Latency = 4;}

// SCVTF/UCVTF S/D, Rd = VLD5+V4: 9 cycles.
// The source is a general-purpose register, so the value is first moved into
// the vector unit (WriteVLD) and then converted (CyWriteV4).
def CyWriteCvtToFPR : WriteSequence<[WriteVLD, CyWriteV4]>;
def : InstRW<[CyWriteCvtToFPR], (instregex "[SU]CVTF[SU][WX][SD]r")>;

// FCVT Rd, S/D = V6+LD4: 10 cycles
// The destination is a general-purpose register, so the value is converted
// in the vector unit (CyWriteV6) and then moved out (WriteLD).
def CyWriteCvtToGPR : WriteSequence<[CyWriteV6, WriteLD]>;
def : InstRW<[CyWriteCvtToGPR], (instregex "FCVT[AMNPZ][SU][SU][WX][SD]r")>;

// FCVTL is a WriteV

//---
// 7.9.8-7.9.10 Cryptography, Data Transposition, Table Lookup
//---

// Crypto operations are issued to the VD pipe, grouped by latency.
def CyWriteCrypto2 : SchedWriteRes<[CyUnitVD]> {let Latency = 2;}
def : InstRW<[CyWriteCrypto2], (instrs AESIMCrr, AESMCrr, SHA1Hrr,
                                       AESDrr, AESErr, SHA1SU1rr, SHA256SU0rr,
                                       SHA1SU0rrr)>;

def CyWriteCrypto3 : SchedWriteRes<[CyUnitVD]> {let Latency = 3;}
def : InstRW<[CyWriteCrypto3], (instrs SHA256SU1rrr)>;

def CyWriteCrypto6 : SchedWriteRes<[CyUnitVD]> {let Latency = 6;}
def : InstRW<[CyWriteCrypto6], (instrs SHA1Crrr, SHA1Mrrr, SHA1Prrr,
                                       SHA256Hrrr,SHA256H2rrr)>;

// TRN,UZP,ZIP are WriteV.

// TBL,TBX are WriteV.

//---
// 7.9.11-7.9.14 Load/Store, single element and paired
//---

// A load into the vector unit takes 5 cycles, versus 4 for an integer load.
def : WriteRes<WriteVLD, [CyUnitLS]> { let Latency = 5; }

// Store-load forwarding is 4 cycles.
def : WriteRes<WriteVST, [CyUnitLS]> { let Latency = 4; }

// WriteVLDPair/VSTPair sequences are expanded by the target description.

//---
// 7.9.15 Load, element operations
//---

// Only the first WriteVLD and the WriteAdr for writeback match def operands.
// Subsequent WriteVLDs just consume resources. Since all loaded values have
// the same latency, this is acceptable.

// Vd is read 5 cycles after issuing the vector load.
def : ReadAdvance<ReadVLD, 5>;

def : InstRW<[WriteVLD],
  (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr],
  (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// Register writes from the load's high half are fused micro-ops.
// LD1 (multiple structures), two to four registers.
def : InstRW<[WriteVLD],
             (instregex "LD1Twov(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr],
             (instregex "LD1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Twov(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD],
             (instregex "LD1Threev(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVLD, WriteVLD],
             (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD],
             (instregex "LD1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVLD, WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLD, WriteAdr, WriteVLD, WriteVLD, WriteVLD],
             (instregex "LD1Fourv(16b|8h|4s|2d)_POST")>;

// LD1 (single structure to one lane). ReadVLD models the read of the
// destination register that the lane is inserted into.
def : InstRW<[WriteVLDShuffle, ReadVLD],
             (instregex "LD1i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],
             (instregex "LD1i(8|16|32)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD],          (instrs LD1i64)>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr],(instrs LD1i64_POST)>;

// LD1R (load and replicate).
def : InstRW<[WriteVLDShuffle],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr],
             (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;

// LD2
def : InstRW<[WriteVLDShuffle, WriteV],
             (instregex "LD2Twov(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
             (instregex "LD2Twov(8b|4h|2s)_POST$")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle],
             (instregex "LD2Twov(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
             (instregex "LD2i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
             (instregex "LD2i(8|16|32)_POST")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV],
             (instregex "LD2i64$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV],
             (instregex "LD2i64_POST")>;

def : InstRW<[WriteVLDShuffle, WriteV],
             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV],
             (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

// LD3
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
             (instregex "LD3Threev(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
             (instregex "LD3Threev(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD3Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteVLDShuffle],
             (instregex "LD3Threev(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV],
             (instregex "LD3i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV],
             (instregex "LD3i(8|16|32)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV],
             (instregex "LD3i64$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV],
             (instregex "LD3i64_POST")>;

def : InstRW<[WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV],
             (instregex "LD3Rv(8b|4h|2s|16b|8h|4s)_POST")>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV],
             (instrs LD3Rv1d,LD3Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV],
             (instrs LD3Rv1d_POST,LD3Rv2d_POST)>;

// LD4
def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instregex "LD4Fourv(8b|4h|2s)_POST")>;
def : InstRW<[WriteVLDPairShuffle, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteVLDPairShuffle, WriteAdr, WriteVLDPairShuffle,
              WriteVLDPairShuffle, WriteVLDPairShuffle],
             (instregex "LD4Fourv(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVLDShuffle, ReadVLD, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)$")>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4i(8|16|32)_POST")>;

// The post-indexed form carries the same writes as LD4i64 plus WriteAdr for
// the base-register writeback, matching every other base/_POST pair here.
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4i64)>;
def : InstRW<[WriteVLDShuffle, ReadVLD, WriteAdr, WriteVLDShuffle, WriteV,
              WriteV],
             (instrs LD4i64_POST)>;

def : InstRW<[WriteVLDShuffle, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)$")>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteV, WriteV, WriteV],
             (instregex "LD4Rv(8b|4h|2s|16b|8h|4s)_POST")>;

def : InstRW<[WriteVLDShuffle, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d,LD4Rv2d)>;
def : InstRW<[WriteVLDShuffle, WriteAdr, WriteVLDShuffle, WriteV, WriteV],
             (instrs LD4Rv1d_POST,LD4Rv2d_POST)>;

//---
// 7.9.16 Store, element operations
//---

// Only the WriteAdr for writeback matches a def operand.
// Subsequent WriteVSTs only consume resources.

// ST1 (multiple structures), one to four registers. For post-indexed forms
// WriteAdr comes first in the list.
def : InstRW<[WriteVST],
  (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST],
  (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVSTShuffle],
  (instregex "ST1Twov(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
  (instregex "ST1Twov(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST],
  (instregex "ST1Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST],
  (instregex "ST1Twov(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVST],
  (instregex "ST1Threev(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVST],
  (instregex "ST1Threev(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST, WriteVST],
  (instregex "ST1Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST],
  (instregex "ST1Threev(16b|8h|4s|2d)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST1Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],
  (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVST, WriteVST, WriteVST, WriteVST],
  (instregex "ST1Fourv(16b|8h|4s|2d)_POST")>;

// ST1 (single structure from one lane).
def : InstRW<[WriteVSTShuffle],
  (instregex "ST1i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
  (instregex "ST1i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle],
  (instrs ST1i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
  (instrs ST1i64_POST)>;

// ST2 (multiple structures).
def : InstRW<[WriteVSTShuffle],
  (instregex "ST2Twov(8b|4h|2s)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
  (instregex "ST2Twov(8b|4h|2s)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST2Twov(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST2Twov(16b|8h|4s|2d)_POST")>;

// ST2 (single structure from one lane).
def : InstRW<[WriteVSTShuffle],
  (instregex "ST2i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
  (instregex "ST2i(8|16|32)_POST")>;
def : InstRW<[WriteVSTShuffle],
  (instrs ST2i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
  (instrs ST2i64_POST)>;

// ST3 (multiple structures).
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST3Threev(8b|4h|2s)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST3Threev(8b|4h|2s)_POST")>;
def : InstRW<[WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST3Threev(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle, WriteVSTShuffle],
  (instregex "ST3Threev(16b|8h|4s|2d)_POST")>;

// ST3 (single structure from one lane).
def : InstRW<[WriteVSTShuffle],
  (instregex "ST3i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTShuffle],
  (instregex "ST3i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
  (instrs ST3i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
  (instrs ST3i64_POST)>;

// ST4 (multiple structures).
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle],
  (instregex "ST4Fourv(8b|4h|2s|1d)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle],
  (instregex "ST4Fourv(8b|4h|2s|1d)_POST")>;
def : InstRW<[WriteVSTPairShuffle, WriteVSTPairShuffle,
              WriteVSTPairShuffle, WriteVSTPairShuffle],
  (instregex "ST4Fourv(16b|8h|4s|2d)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle, WriteVSTPairShuffle,
              WriteVSTPairShuffle, WriteVSTPairShuffle],
  (instregex "ST4Fourv(16b|8h|4s|2d)_POST")>;

// ST4 (single structure from one lane).
def : InstRW<[WriteVSTPairShuffle],
  (instregex "ST4i(8|16|32)$")>;
def : InstRW<[WriteAdr, WriteVSTPairShuffle],
  (instregex "ST4i(8|16|32)_POST")>;

def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],
  (instrs ST4i64)>;
def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],
  (instrs ST4i64_POST)>;

// Atomic operations are not supported.
def : WriteRes<WriteAtomic, []> {
  let Unsupported = 1;
}

//---
// Unused SchedRead types
//---

// Each of these read types is given a ReadAdvance of 0 cycles.
def : ReadAdvance<ReadI,     0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
def : ReadAdvance<ReadIM,    0>;
def : ReadAdvance<ReadIMA,   0>;
def : ReadAdvance<ReadID,    0>;

} // SchedModel = CycloneModel
