1//=- AArch64SchedM1.td - Samsung Exynos-M1 Scheduling Defs ---*- tablegen -*-=//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the machine model for Samsung Exynos-M1 to support
11// instruction scheduling and other instruction cost heuristics.
12//
13//===----------------------------------------------------------------------===//
14
15//===----------------------------------------------------------------------===//
16// The Exynos-M1 is a traditional superscalar microprocessor with a
17// 4-wide in-order stage for decode and dispatch and a wider issue stage.
18// The execution units and loads and stores are out-of-order.
19
// Machine model record for the Exynos-M1. The fields are overridden through
// an enclosing `let`, so the record itself needs no body.
let IssueWidth            =  4,  // Up to 4 uops per cycle.
    MicroOpBufferSize     = 96,  // ROB size.
    LoopMicroOpBufferSize = 24,  // Based on the instruction queue size.
    LoadLatency           =  4,  // Optimistic load cases.
    MispredictPenalty     = 14,  // Minimum branch misprediction penalty.
    CompleteModel         =  0   // Use the default model otherwise.
in
def ExynosM1Model : SchedMachineModel;
28
29//===----------------------------------------------------------------------===//
30// Define each kind of processor resource and number available on the Exynos-M1,
31// which has 9 pipelines, each with its own queue with out-of-order dispatch.
32
// Simple integer.
def M1UnitA : ProcResource<2>;
// Simple and complex integer.
def M1UnitC : ProcResource<1>;
// Branch.
def M1UnitB : ProcResource<2>;
// Load.
def M1UnitL : ProcResource<1>;
// Store.
def M1UnitS : ProcResource<1>;
// FP #0.
def M1PipeF0 : ProcResource<1>;
// FP #1.
def M1PipeF1 : ProcResource<1>;
40
// Sub-units of FP pipe #0: each record names M1PipeF0 as its super-resource.

// FP multiplication.
def M1UnitFMAC   : ProcResource<1> { let Super = M1PipeF0; }
// FP conversion.
def M1UnitFCVT   : ProcResource<1> { let Super = M1PipeF0; }
// Simple vector.
def M1UnitNAL0   : ProcResource<1> { let Super = M1PipeF0; }
// Miscellanea.
def M1UnitNMISC  : ProcResource<1> { let Super = M1PipeF0; }
// Cryptographic.
def M1UnitNCRYPT : ProcResource<1> { let Super = M1PipeF0; }
48
// Sub-units of FP pipe #1: each record names M1PipeF1 as its super-resource.

// Simple FP.
def M1UnitFADD : ProcResource<1> { let Super = M1PipeF1; }
// FP division & square root (serialized); the only unit here that also
// overrides BufferSize.
def M1UnitFVAR : ProcResource<1> {
  let Super      = M1PipeF1;
  let BufferSize = 1;
}
// Simple vector.
def M1UnitNAL1 : ProcResource<1> { let Super = M1PipeF1; }
// FP store.
def M1UnitFST  : ProcResource<1> { let Super = M1PipeF1; }
56
// Resource groups, each bound to the Exynos-M1 model.

// All simple integer.
def M1UnitALU : ProcResGroup<[M1UnitA, M1UnitC]> {
  let SchedModel = ExynosM1Model;
}
// All simple vector.
def M1UnitNALU : ProcResGroup<[M1UnitNAL0, M1UnitNAL1]> {
  let SchedModel = ExynosM1Model;
}
63
64let SchedModel = ExynosM1Model in {
65
66//===----------------------------------------------------------------------===//
67// Coarse scheduling model for the Exynos-M1.
68
// Branch instructions: both write types are modelled with a 1-cycle latency.
// TODO: Non-conditional direct branches take zero cycles and units.
// TODO: Branch and link is much different.
let Latency = 1 in {
  def : WriteRes<WriteBr,    [M1UnitB]>;
  def : WriteRes<WriteBrReg, [M1UnitC]>;
}
74
// Arithmetic and logical integer instructions, followed by move instructions:
// every write type here takes 1 cycle on the M1UnitALU group.
// TODO: Shift over 3 and some extensions take 2 cycles.
let Latency = 1 in {
  def : WriteRes<WriteI,     [M1UnitALU]>;
  def : WriteRes<WriteISReg, [M1UnitALU]>;
  def : WriteRes<WriteIEReg, [M1UnitALU]>;
  def : WriteRes<WriteIS,    [M1UnitALU]>;

  // Move instructions.
  def : WriteRes<WriteImm,   [M1UnitALU]>;
}
84
// Divide and multiply instructions, all issued to M1UnitC.
// TODO: Division blocks the divider inside C.
let Latency = 13 in def : WriteRes<WriteID32, [M1UnitC]>;
let Latency = 21 in def : WriteRes<WriteID64, [M1UnitC]>;
// TODO: Long multiplication takes 5 cycles and also uses the ALU.
// TODO: Multiplication with accumulation can be advanced.
let Latency =  3 in def : WriteRes<WriteIM32, [M1UnitC]>;
// TODO: 64-bit multiplication has a throughput of 1/2.
let Latency =  4 in def : WriteRes<WriteIM64, [M1UnitC]>;
94
// Miscellaneous instructions.
// WriteExtr lists the M1UnitALU group twice and takes 2 cycles.
let Latency = 2 in def : WriteRes<WriteExtr, [M1UnitALU, M1UnitALU]>;

// Address write-back: no resources and zero latency for now.
// TODO: The latency for the post or pre register is 1 cycle.
let Latency = 0 in def : WriteRes<WriteAdr, []>;
101
// Load instructions.
let Latency = 4 in def : WriteRes<WriteLD,    [M1UnitL]>;
// TODO: Extended address also requires the ALU.
let Latency = 5 in def : WriteRes<WriteLDIdx, [M1UnitL]>;
let Latency = 4 in def : WriteRes<WriteLDHi,  [M1UnitALU]>;

// Store instructions: every store write type takes 1 cycle on M1UnitS.
let Latency = 1 in {
  def : WriteRes<WriteST,    [M1UnitS]>;
  // TODO: Extended address also requires the ALU.
  def : WriteRes<WriteSTIdx, [M1UnitS]>;
  def : WriteRes<WriteSTP,   [M1UnitS]>;
  def : WriteRes<WriteSTX,   [M1UnitS]>;
}
114
// FP data instructions.
let Latency =  3 in def : WriteRes<WriteF,    [M1UnitFADD]>;
// TODO: FCCMP is much different.
let Latency =  4 in def : WriteRes<WriteFCmp, [M1UnitNMISC]>;
// TODO: DP takes longer.
let Latency = 15 in def : WriteRes<WriteFDiv, [M1UnitFVAR]>;
// TODO: MACC takes longer.
let Latency =  4 in def : WriteRes<WriteFMul, [M1UnitFMAC]>;

// FP miscellaneous instructions.
// TODO: Conversion between register files is much different.
let Latency = 3 in def : WriteRes<WriteFCvt,  [M1UnitFCVT]>;
let Latency = 1 in def : WriteRes<WriteFImm,  [M1UnitNALU]>;
// TODO: Copy from FPR to GPR is much different.
let Latency = 4 in def : WriteRes<WriteFCopy, [M1UnitS]>;
130
// FP load instructions.
// TODO: ASIMD loads are much different.
let Latency = 5 in def : WriteRes<WriteVLD, [M1UnitL]>;

// FP store instructions.
// TODO: ASIMD stores are much different.
let Latency = 1 in def : WriteRes<WriteVST, [M1UnitS, M1UnitFST]>;

// ASIMD FP instructions.
// TODO: Other operations are much different.
let Latency = 3 in def : WriteRes<WriteV, [M1UnitFADD]>;

// Other miscellaneous instructions. None of them names a resource;
// WriteAtomic is flagged as unsupported rather than given a latency.
let Unsupported = 1 in def : WriteRes<WriteAtomic, []>;
let Latency = 1 in {
  def : WriteRes<WriteBarrier, []>;
  def : WriteRes<WriteHint,    []>;
  def : WriteRes<WriteSys,     []>;
}
148
//===----------------------------------------------------------------------===//
// Generic fast forwarding.
//
// Every read type below is given an advance of 0 cycles, except ReadIMA.
// TODO: Add FP register forwarding rules.

def : ReadAdvance<ReadI, 0>;
def : ReadAdvance<ReadISReg, 0>;
def : ReadAdvance<ReadIEReg, 0>;
def : ReadAdvance<ReadIM, 0>;

// Integer multiply-accumulate: advanced by 2 cycles, restricted to the
// WriteIM32 and WriteIM64 writes.
// TODO: The forwarding for WriteIM64 actually saves 3 cycles.
def : ReadAdvance<ReadIMA, 2, [WriteIM32, WriteIM64]>;

def : ReadAdvance<ReadID, 0>;
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;
165
//===----------------------------------------------------------------------===//
// Finer scheduling model for the Exynos-M1.
//
// Each write below lists the resources it occupies; its latency is supplied
// by the enclosing `let`.

// Multi-resource NEON writes.
let Latency =  9 in
def M1WriteNEONA : SchedWriteRes<[M1UnitNALU, M1UnitNALU, M1UnitFADD]>;
let Latency =  5 in
def M1WriteNEONB : SchedWriteRes<[M1UnitNALU, M1UnitFST]>;
let Latency =  6 in
def M1WriteNEONC : SchedWriteRes<[M1UnitNALU, M1UnitFST]>;
let Latency = 10 in
def M1WriteNEOND : SchedWriteRes<[M1UnitNALU, M1UnitFST, M1UnitL]>;
let Latency =  8 in
def M1WriteNEONE : SchedWriteRes<[M1UnitFCVT, M1UnitFST]>;
let Latency = 13 in
def M1WriteNEONF : SchedWriteRes<[M1UnitFCVT, M1UnitFST, M1UnitL]>;
let Latency =  6 in
def M1WriteNEONG : SchedWriteRes<[M1UnitNMISC, M1UnitFST]>;
let Latency =  3 in
def M1WriteNEONH : SchedWriteRes<[M1UnitNALU, M1UnitFST]>;
let Latency =  9 in
def M1WriteNEONI : SchedWriteRes<[M1UnitFST, M1UnitL]>;
let Latency =  6 in
def M1WriteNEONJ : SchedWriteRes<[M1UnitNMISC, M1UnitFMAC]>;
let Latency =  7 in
def M1WriteNEONK : SchedWriteRes<[M1UnitNMISC, M1UnitFMAC]>;

// Integer and branch writes.
let Latency = 1 in
def M1WriteALU1 : SchedWriteRes<[M1UnitALU]>;
let Latency = 1 in
def M1WriteB : SchedWriteRes<[M1UnitB]>;
// FIXME: This is the worst case, conditional branch and link.
let Latency = 1 in
def M1WriteBL : SchedWriteRes<[M1UnitB, M1UnitALU]>;
// FIXME: This is the worst case, when using LR.
let Latency = 2 in
def M1WriteBLR : SchedWriteRes<[M1UnitB, M1UnitALU, M1UnitALU]>;
let Latency = 1 in
def M1WriteC1 : SchedWriteRes<[M1UnitC]>;
let Latency = 2 in
def M1WriteC2 : SchedWriteRes<[M1UnitC]>;

// Single-resource FP and vector writes; the numeric suffix of each name
// matches its latency.
let Latency =  3 in
def M1WriteFADD3 : SchedWriteRes<[M1UnitFADD]>;
let Latency =  3 in
def M1WriteFCVT3 : SchedWriteRes<[M1UnitFCVT]>;
let Latency =  4 in
def M1WriteFCVT4 : SchedWriteRes<[M1UnitFCVT]>;
let Latency =  4 in
def M1WriteFMAC4 : SchedWriteRes<[M1UnitFMAC]>;
let Latency =  5 in
def M1WriteFMAC5 : SchedWriteRes<[M1UnitFMAC]>;
let Latency = 15 in
def M1WriteFVAR15 : SchedWriteRes<[M1UnitFVAR]>;
let Latency = 23 in
def M1WriteFVAR23 : SchedWriteRes<[M1UnitFVAR]>;
let Latency =  1 in
def M1WriteNALU1 : SchedWriteRes<[M1UnitNALU]>;
let Latency =  2 in
def M1WriteNALU2 : SchedWriteRes<[M1UnitNALU]>;
// The M1WriteNAL1<n> writes use M1UnitNAL1 with a latency of <n> cycles.
let Latency =  1 in
def M1WriteNAL11 : SchedWriteRes<[M1UnitNAL1]>;
let Latency =  2 in
def M1WriteNAL12 : SchedWriteRes<[M1UnitNAL1]>;
let Latency =  3 in
def M1WriteNAL13 : SchedWriteRes<[M1UnitNAL1]>;
let Latency =  1 in
def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]>;
let Latency =  5 in
def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]>;
let Latency =  1 in
def M1WriteNMISC1 : SchedWriteRes<[M1UnitNMISC]>;
let Latency =  2 in
def M1WriteNMISC2 : SchedWriteRes<[M1UnitNMISC]>;
let Latency =  3 in
def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]>;
let Latency =  4 in
def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]>;

// Store-unit write and the test-and-branch write.
let Latency = 4 in
def M1WriteS4 : SchedWriteRes<[M1UnitS]>;
let Latency = 2 in
def M1WriteTB : SchedWriteRes<[M1UnitC, M1UnitALU]>;

226
// Branch instructions.
// Bcc, BL and BLR are matched by exact name; the compare-and-branch and
// test-and-branch forms are matched by regular expression.
def : InstRW<[M1WriteB], (instrs Bcc)>;
def : InstRW<[M1WriteBL], (instrs BL)>;
def : InstRW<[M1WriteBLR], (instrs BLR)>;
def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>;
def : InstRW<[M1WriteTB], (instregex "^TBN?Z[WX]")>;

// Arithmetic and logical integer instructions.
// Only COPY is refined here.
def : InstRW<[M1WriteALU1], (instrs COPY)>;
236
237// Divide and multiply instructions.
238
239// Miscellaneous instructions.
240
241// Load instructions.
242
243// Store instructions.
244
// FP data instructions.
def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)[DS]r")>;
def : InstRW<[M1WriteFADD3], (instregex "^F(ADD|SUB)[DS]rr")>;
def : InstRW<[M1WriteNEONG], (instregex "^FCCMPE?[DS]rr")>;
def : InstRW<[M1WriteNMISC4], (instregex "^FCMPE?[DS]r")>;
// Division: the S and D forms map to different M1UnitFVAR writes.
def : InstRW<[M1WriteFVAR15], (instrs FDIVSrr)>;
def : InstRW<[M1WriteFVAR23], (instrs FDIVDrr)>;
def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN).+rr")>;
def : InstRW<[M1WriteFMAC4], (instregex "^FN?MUL[DS]rr")>;
def : InstRW<[M1WriteFMAC5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
def : InstRW<[M1WriteFCVT3], (instregex "^FRINT.+r")>;
def : InstRW<[M1WriteNEONH], (instregex "^FCSEL[DS]rrr")>;
// Square root: same write split as division.
def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>;
def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>;
259
// FP miscellaneous instructions.
// Conversions.
def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>;
def : InstRW<[M1WriteNEONF],
             (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>;
// Moves.
def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>;
def : InstRW<[M1WriteS4], (instregex "^FMOV[WX][DS](High)?r")>;
def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>;
267
268// FP load instructions.
269
270// FP store instructions.
271
// ASIMD instructions.

// Absolute difference, absolute value and negation.
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>;
def : InstRW<[M1WriteNMISC1], (instregex "^[SU]ABDL?v")>;
def : InstRW<[M1WriteNMISC1], (instregex "^(SQ)?ABSv")>;
def : InstRW<[M1WriteNMISC1], (instregex "^SQNEGv")>;

// Addition and subtraction variants.
def : InstRW<[M1WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?H(ADD|SUB)v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?AD[AD](L|LP|P|W)V?2?v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?SUB[LW]2?v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^R?(ADD|SUB)HN?2?v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]+Q(ADD|SUB)v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU]RHADDv")>;

// Comparisons and bitwise logic.
def : InstRW<[M1WriteNMISC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
def : InstRW<[M1WriteNALU1], (instregex "^CMTSTv")>;
def : InstRW<[M1WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>;

// Minimum and maximum.
def : InstRW<[M1WriteNMISC1], (instregex "^[SU](MIN|MAX)v")>;
def : InstRW<[M1WriteNMISC2], (instregex "^[SU](MIN|MAX)Pv")>;
def : InstRW<[M1WriteNMISC3], (instregex "^[SU](MIN|MAX)Vv")>;

// Multiplication and multiply-accumulate.
def : InstRW<[M1WriteNMISC4], (instregex "^(MUL|SQR?DMULH)v")>;
def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>;
def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>;
def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>;

// Shifts.
def : InstRW<[M1WriteNAL13], (instregex "^(S|SR|U|UR)SRAv")>;
def : InstRW<[M1WriteNALU1], (instregex "^[SU]?SH(L|LL|R)2?v")>;
def : InstRW<[M1WriteNALU1], (instregex "^S[LR]Iv")>;
def : InstRW<[M1WriteNAL13], (instregex "^[SU]?(Q|QR|R)?SHR(N|U|UN)?2?v")>;
def : InstRW<[M1WriteNAL13], (instregex "^[SU](Q|QR|R)SHLU?v")>;
299
// ASIMD FP instructions.
def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)v")>;
def : InstRW<[M1WriteNMISC3], (instregex "^F(ABD|ADD|SUB)v")>;
def : InstRW<[M1WriteNEONA], (instregex "^FADDP")>;
def : InstRW<[M1WriteNMISC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>;
def : InstRW<[M1WriteFCVT3], (instregex "^[FVSU]CVTX?[AFLMNPZ][SU]?(_Int)?v")>;

// Division and square root: the f32 and 2f64 forms map to different
// M1UnitFVAR writes. These four patterns carry no leading `^`.
def : InstRW<[M1WriteFVAR15], (instregex "FDIVv.f32")>;
def : InstRW<[M1WriteFVAR23], (instregex "FDIVv2f64")>;
def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>;
def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>;

def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>;
def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>;

// Multiplication and multiply-accumulate: the `v.i` and `v.f` name forms
// are assigned different writes.
def : InstRW<[M1WriteNEONJ], (instregex "^FMULX?v.i")>;
def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v.f")>;
def : InstRW<[M1WriteNEONK], (instregex "^FML[AS]v.i")>;
def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v.f")>;

def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>;
317
// ASIMD miscellaneous instructions.
def : InstRW<[M1WriteNALU1], (instregex "^RBITv")>;
def : InstRW<[M1WriteNAL11], (instregex "^(BIF|BIT|BSL)v")>;
def : InstRW<[M1WriteNALU1], (instregex "^CPY")>;
def : InstRW<[M1WriteNEONB], (instregex "^DUPv.+gpr")>;
def : InstRW<[M1WriteNALU1], (instregex "^DUPv.+lane")>;
def : InstRW<[M1WriteNAL13], (instregex "^[SU]?Q?XTU?Nv")>;
def : InstRW<[M1WriteNEONC], (instregex "^INSv.+gpr")>;

// Reciprocal and reciprocal square root estimates, exponents and steps.
def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev")>;
def : InstRW<[M1WriteNMISC1], (instregex "^[FU](RECP|RSQRT)Xv")>;
def : InstRW<[M1WriteFMAC5], (instregex "^F(RECP|RSQRT)Sv")>;

def : InstRW<[M1WriteNALU1], (instregex "^REV(16|32|64)v")>;

// Table lookups on v8i8: the One form uses M1WriteNAL11 once; the Two,
// Three and Four forms repeat it 2, 3 and 4 times via WriteSequence.
def : InstRW<[M1WriteNAL11], (instregex "^TB[LX]v8i8One")>;
def : InstRW<[WriteSequence<[M1WriteNAL11], 2>], (instregex "^TB[LX]v8i8Two")>;
def : InstRW<[WriteSequence<[M1WriteNAL11], 3>],
             (instregex "^TB[LX]v8i8Three")>;
def : InstRW<[WriteSequence<[M1WriteNAL11], 4>],
             (instregex "^TB[LX]v8i8Four")>;

// Table lookups on v16i8: same scheme, built on M1WriteNAL12.
def : InstRW<[M1WriteNAL12], (instregex "^TB[LX]v16i8One")>;
def : InstRW<[WriteSequence<[M1WriteNAL12], 2>],
             (instregex "^TB[LX]v16i8Two")>;
def : InstRW<[WriteSequence<[M1WriteNAL12], 3>],
             (instregex "^TB[LX]v16i8Three")>;
def : InstRW<[WriteSequence<[M1WriteNAL12], 4>],
             (instregex "^TB[LX]v16i8Four")>;

def : InstRW<[M1WriteNEOND], (instregex "^[SU]MOVv")>;
def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>;

// Permutes: TRN and UZP get a different write for the second set of
// arrangements; ZIP uses M1WriteNALU1 for every arrangement.
def : InstRW<[M1WriteNALU1], (instregex "^(TRN|UZP)[12](v8i8|v4i16|v2i32)")>;
def : InstRW<[M1WriteNALU2],
             (instregex "^(TRN|UZP)[12](v16i8|v8i16|v4i32|v2i64)")>;
def : InstRW<[M1WriteNALU1], (instregex "^ZIP[12]v")>;
349
350// ASIMD load instructions.
351
352// ASIMD store instructions.
353
// Cryptography instructions.
// AES gets its own write/read pair: the read is advanced by 1 cycle, but only
// with respect to M1WriteAES.
let Latency = 1 in
def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]>;
def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>;
def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>;

// Polynomial multiply and SHA.
def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA1[CMP]")>;
def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA256SU0")>;
def : InstRW<[M1WriteNCRYPT5], (instregex "^SHA256(H|SU1)")>;

// CRC instructions.
def : InstRW<[M1WriteC2], (instregex "^CRC32")>;
367
368} // SchedModel = ExynosM1Model
369