1; REQUIRES: asserts
2; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s
3; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -force-fast-cluster -verify-misched -debug-only=machine-scheduler -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK-FAST
4
5; CHECK: ********** MI Scheduling **********
6; CHECK-LABEL: stp_i64_scale:%bb.0
7; CHECK:Cluster ld/st SU(3) - SU(4)
8; CHECK:Cluster ld/st SU(2) - SU(5)
9; CHECK:SU(4):   STRXui %1:gpr64, %0:gpr64common, 1
10; CHECK:SU(3):   STRXui %1:gpr64, %0:gpr64common, 2
11; CHECK:SU(2):   STRXui %1:gpr64, %0:gpr64common, 3
12; CHECK:SU(5):   STRXui %1:gpr64, %0:gpr64common, 4
; Stores %v into four i64 slots of %P at element indices 3, 2, 1 and 4 (in that
; program order) and returns %v.
; Per the expectations above, the stores at indices 1 and 2 (SU(4), SU(3)) form
; one cluster and the stores at indices 3 and 4 (SU(2), SU(5)) form another; the
; expected STRXui immediates are equal to the element indices.
13define i64 @stp_i64_scale(i64* nocapture %P, i64 %v) {
14entry:
  ; P[3] = v
15  %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
16  store i64 %v, i64* %arrayidx
  ; P[2] = v
17  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
18  store i64 %v, i64* %arrayidx1
  ; P[1] = v
19  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
20  store i64 %v, i64* %arrayidx2
  ; P[4] = v
21  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
22  store i64 %v, i64* %arrayidx3
23  ret i64 %v
24}
25
26; CHECK: ********** MI Scheduling **********
27; CHECK-LABEL: stp_i32_scale:%bb.0
28; CHECK:Cluster ld/st SU(3) - SU(4)
29; CHECK:Cluster ld/st SU(2) - SU(5)
30; CHECK:SU(4):   STRWui %1:gpr32, %0:gpr64common, 1
31; CHECK:SU(3):   STRWui %1:gpr32, %0:gpr64common, 2
32; CHECK:SU(2):   STRWui %1:gpr32, %0:gpr64common, 3
33; CHECK:SU(5):   STRWui %1:gpr32, %0:gpr64common, 4
; 32-bit variant of the positive-index store test: stores %v into four i32 slots
; of %P at element indices 3, 2, 1 and 4 (in that program order) and returns %v.
; Per the expectations above, the stores at indices 1 and 2 (SU(4), SU(3)) and
; at indices 3 and 4 (SU(2), SU(5)) are clustered; the expected STRWui
; immediates are equal to the element indices.
34define i32 @stp_i32_scale(i32* nocapture %P, i32 %v) {
35entry:
  ; P[3] = v
36  %arrayidx = getelementptr inbounds i32, i32* %P, i32 3
37  store i32 %v, i32* %arrayidx
  ; P[2] = v
38  %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 2
39  store i32 %v, i32* %arrayidx1
  ; P[1] = v
40  %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 1
41  store i32 %v, i32* %arrayidx2
  ; P[4] = v
42  %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 4
43  store i32 %v, i32* %arrayidx3
44  ret i32 %v
45}
46
47; CHECK:********** MI Scheduling **********
48; CHECK-LABEL:stp_i64_unscale:%bb.0 entry
49; CHECK:Cluster ld/st SU(2) - SU(5)
50; CHECK:Cluster ld/st SU(3) - SU(4)
51; CHECK:SU(2):   STURXi %1:gpr64, %0:gpr64common, -24
52; CHECK:SU(3):   STURXi %1:gpr64, %0:gpr64common, -8
53; CHECK:SU(4):   STURXi %1:gpr64, %0:gpr64common, -16
54; CHECK:SU(5):   STURXi %1:gpr64, %0:gpr64common, -32
; Stores %v into four i64 slots of %P at negative element indices -3, -1, -2
; and -4 (in that program order).
; Per the expectations above, these become STURXi stores whose immediates are
; byte offsets (-24, -8, -16, -32, i.e. index * 8); the stores at -24/-32
; (SU(2), SU(5)) and at -8/-16 (SU(3), SU(4)) are expected to be clustered.
; NOTE(review): attribute group #0 is referenced here but is not defined in the
; visible part of this file -- confirm that this is intentional.
55define void @stp_i64_unscale(i64* nocapture %P, i64 %v) #0 {
56entry:
  ; P[-3] = v
57  %arrayidx = getelementptr inbounds i64, i64* %P, i64 -3
58  store i64 %v, i64* %arrayidx
  ; P[-1] = v
59  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 -1
60  store i64 %v, i64* %arrayidx1
  ; P[-2] = v
61  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 -2
62  store i64 %v, i64* %arrayidx2
  ; P[-4] = v
63  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 -4
64  store i64 %v, i64* %arrayidx3
65  ret void
66}
67
68; CHECK:********** MI Scheduling **********
69; CHECK-LABEL:stp_i32_unscale:%bb.0 entry
70; CHECK:Cluster ld/st SU(2) - SU(5)
71; CHECK:Cluster ld/st SU(3) - SU(4)
72; CHECK:SU(2):   STURWi %1:gpr32, %0:gpr64common, -12
73; CHECK:SU(3):   STURWi %1:gpr32, %0:gpr64common, -4
74; CHECK:SU(4):   STURWi %1:gpr32, %0:gpr64common, -8
75; CHECK:SU(5):   STURWi %1:gpr32, %0:gpr64common, -16
; 32-bit variant of the negative-index store test: stores %v into four i32
; slots of %P at element indices -3, -1, -2 and -4 (in that program order).
; Per the expectations above, these become STURWi stores whose immediates are
; byte offsets (-12, -4, -8, -16, i.e. index * 4); the stores at -12/-16
; (SU(2), SU(5)) and at -4/-8 (SU(3), SU(4)) are expected to be clustered.
; NOTE(review): attribute group #0 is referenced here but is not defined in the
; visible part of this file -- confirm that this is intentional.
76define void @stp_i32_unscale(i32* nocapture %P, i32 %v) #0 {
77entry:
  ; P[-3] = v
78  %arrayidx = getelementptr inbounds i32, i32* %P, i32 -3
79  store i32 %v, i32* %arrayidx
  ; P[-1] = v
80  %arrayidx1 = getelementptr inbounds i32, i32* %P, i32 -1
81  store i32 %v, i32* %arrayidx1
  ; P[-2] = v
82  %arrayidx2 = getelementptr inbounds i32, i32* %P, i32 -2
83  store i32 %v, i32* %arrayidx2
  ; P[-4] = v
84  %arrayidx3 = getelementptr inbounds i32, i32* %P, i32 -4
85  store i32 %v, i32* %arrayidx3
86  ret void
87}
88
89; CHECK:********** MI Scheduling **********
90; CHECK-LABEL:stp_double:%bb.0
91; CHECK:Cluster ld/st SU(3) - SU(4)
92; CHECK:Cluster ld/st SU(2) - SU(5)
93; CHECK:SU(3):   STRDui %1:fpr64, %0:gpr64common, 1
94; CHECK:SU(4):   STRDui %1:fpr64, %0:gpr64common, 2
95; CHECK:SU(2):   STRDui %1:fpr64, %0:gpr64common, 3
96; CHECK:SU(5):   STRDui %1:fpr64, %0:gpr64common, 4
; Floating-point (double) variant: stores %v into four double slots of %P at
; element indices 3, 1, 2 and 4 (in that program order).
; Per the expectations above, the stores at indices 1 and 2 (SU(3), SU(4)) and
; at indices 3 and 4 (SU(2), SU(5)) are clustered; the expected STRDui
; immediates are equal to the element indices and the stored value lives in an
; fpr64 register.
97define void @stp_double(double* nocapture %P, double %v)  {
98entry:
  ; P[3] = v
99  %arrayidx = getelementptr inbounds double, double* %P, i64 3
100  store double %v, double* %arrayidx
  ; P[1] = v
101  %arrayidx1 = getelementptr inbounds double, double* %P, i64 1
102  store double %v, double* %arrayidx1
  ; P[2] = v
103  %arrayidx2 = getelementptr inbounds double, double* %P, i64 2
104  store double %v, double* %arrayidx2
  ; P[4] = v
105  %arrayidx3 = getelementptr inbounds double, double* %P, i64 4
106  store double %v, double* %arrayidx3
107  ret void
108}
109
110; CHECK:********** MI Scheduling **********
111; CHECK-LABEL:stp_float:%bb.0
112; CHECK:Cluster ld/st SU(3) - SU(4)
113; CHECK:Cluster ld/st SU(2) - SU(5)
114; CHECK:SU(3):   STRSui %1:fpr32, %0:gpr64common, 1
115; CHECK:SU(4):   STRSui %1:fpr32, %0:gpr64common, 2
116; CHECK:SU(2):   STRSui %1:fpr32, %0:gpr64common, 3
117; CHECK:SU(5):   STRSui %1:fpr32, %0:gpr64common, 4
; Floating-point (float) variant: stores %v into four float slots of %P at
; element indices 3, 1, 2 and 4 (in that program order).
; Per the expectations above, the stores at indices 1 and 2 (SU(3), SU(4)) and
; at indices 3 and 4 (SU(2), SU(5)) are clustered; the expected STRSui
; immediates are equal to the element indices and the stored value lives in an
; fpr32 register.
118define void @stp_float(float* nocapture %P, float %v)  {
119entry:
  ; P[3] = v
120  %arrayidx = getelementptr inbounds float, float* %P, i64 3
121  store float %v, float* %arrayidx
  ; P[1] = v
122  %arrayidx1 = getelementptr inbounds float, float* %P, i64 1
123  store float %v, float* %arrayidx1
  ; P[2] = v
124  %arrayidx2 = getelementptr inbounds float, float* %P, i64 2
125  store float %v, float* %arrayidx2
  ; P[4] = v
126  %arrayidx3 = getelementptr inbounds float, float* %P, i64 4
127  store float %v, float* %arrayidx3
128  ret void
129}
130
131; CHECK: ********** MI Scheduling **********
132; CHECK-LABEL: stp_volatile:%bb.0
133; CHECK-NOT: Cluster ld/st
134; CHECK:SU(2):   STRXui %1:gpr64, %0:gpr64common, 3 :: (volatile
135; CHECK:SU(3):   STRXui %1:gpr64, %0:gpr64common, 2 :: (volatile
136; CHECK:SU(4):   STRXui %1:gpr64, %0:gpr64common, 1 :: (volatile
137; CHECK:SU(5):   STRXui %1:gpr64, %0:gpr64common, 4 :: (volatile
; Same store pattern as the plain i64 positive-index test (element indices 3,
; 2, 1 and 4 of %P, returning %v), but every store is volatile.
; Per the expectations above, no ld/st cluster message may appear for this
; function, and each STRXui is expected to carry a volatile memory operand.
138define i64 @stp_volatile(i64* nocapture %P, i64 %v) {
139entry:
  ; volatile P[3] = v
140  %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
141  store volatile i64 %v, i64* %arrayidx
  ; volatile P[2] = v
142  %arrayidx1 = getelementptr inbounds i64, i64* %P, i64 2
143  store volatile i64 %v, i64* %arrayidx1
  ; volatile P[1] = v
144  %arrayidx2 = getelementptr inbounds i64, i64* %P, i64 1
145  store volatile i64 %v, i64* %arrayidx2
  ; volatile P[4] = v
146  %arrayidx3 = getelementptr inbounds i64, i64* %P, i64 4
147  store volatile i64 %v, i64* %arrayidx3
148  ret i64 %v
149}
150
151; CHECK: ********** MI Scheduling **********
152; CHECK-LABEL: stp_i64_with_ld:%bb.0
153; CHECK:Cluster ld/st SU(5) - SU(10)
154; CHECK:Cluster ld/st SU(15) - SU(20)
155; CHECK:SU(5):   STRXui %7:gpr64, %0:gpr64common, 0 ::
156; CHECK:SU(10):   STRXui %12:gpr64, %0:gpr64common, 1 ::
157; CHECK:SU(15):   STRXui %17:gpr64, %0:gpr64common, 2 ::
158; CHECK:SU(20):   STRXui %22:gpr64, %0:gpr64common, 3 ::
; Four straight-line copies of the pattern  a[k] = a[k] + a[k+16] * a[k+8]
; for k = 0, 1, 2, 3, so the stores are interleaved with loads from the same
; base pointer. %b and %c are not used by the body.
; Per the expectations above, the stores to a[0]/a[1] (SU(5), SU(10)) and to
; a[2]/a[3] (SU(15), SU(20)) are expected to be clustered despite the
; intervening loads.
159define void @stp_i64_with_ld(i64* noalias nocapture %a, i64* noalias nocapture readnone %b, i64* noalias nocapture readnone %c) {
160entry:
  ; k = 0: a[0] += a[16] * a[8]
161  %arrayidx = getelementptr inbounds i64, i64* %a, i64 8
162  %0 = load i64, i64* %arrayidx, align 8
163  %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 16
164  %1 = load i64, i64* %arrayidx3, align 8
165  %mul = mul nsw i64 %1, %0
166  %2 = load i64, i64* %a, align 8
167  %add6 = add nsw i64 %2, %mul
168  store i64 %add6, i64* %a, align 8
  ; k = 1: a[1] += a[17] * a[9]
169  %arrayidx.1 = getelementptr inbounds i64, i64* %a, i64 9
170  %3 = load i64, i64* %arrayidx.1, align 8
171  %arrayidx3.1 = getelementptr inbounds i64, i64* %a, i64 17
172  %4 = load i64, i64* %arrayidx3.1, align 8
173  %mul.1 = mul nsw i64 %4, %3
174  %arrayidx5.1 = getelementptr inbounds i64, i64* %a, i64 1
175  %5 = load i64, i64* %arrayidx5.1, align 8
176  %add6.1 = add nsw i64 %5, %mul.1
177  store i64 %add6.1, i64* %arrayidx5.1, align 8
  ; k = 2: a[2] += a[18] * a[10]
178  %arrayidx.2 = getelementptr inbounds i64, i64* %a, i64 10
179  %6 = load i64, i64* %arrayidx.2, align 8
180  %arrayidx3.2 = getelementptr inbounds i64, i64* %a, i64 18
181  %7 = load i64, i64* %arrayidx3.2, align 8
182  %mul.2 = mul nsw i64 %7, %6
183  %arrayidx5.2 = getelementptr inbounds i64, i64* %a, i64 2
184  %8 = load i64, i64* %arrayidx5.2, align 8
185  %add6.2 = add nsw i64 %8, %mul.2
186  store i64 %add6.2, i64* %arrayidx5.2, align 8
  ; k = 3: a[3] += a[19] * a[11]
187  %arrayidx.3 = getelementptr inbounds i64, i64* %a, i64 11
188  %9 = load i64, i64* %arrayidx.3, align 8
189  %arrayidx3.3 = getelementptr inbounds i64, i64* %a, i64 19
190  %10 = load i64, i64* %arrayidx3.3, align 8
191  %mul.3 = mul nsw i64 %10, %9
192  %arrayidx5.3 = getelementptr inbounds i64, i64* %a, i64 3
193  %11 = load i64, i64* %arrayidx5.3, align 8
194  %add6.3 = add nsw i64 %11, %mul.3
195  store i64 %add6.3, i64* %arrayidx5.3, align 8
196  ret void
197}
198
199; Verify that SU(2) and SU(4) are the preds of SU(3)
200; CHECK: ********** MI Scheduling **********
201; CHECK-LABEL: stp_missing_preds_edges:%bb.0
202; CHECK:Cluster ld/st SU(3) - SU(5)
203; CHECK: Copy Pred SU(4)
204; CHECK: Copy Pred SU(2)
205; CHECK:SU(2):   %0:gpr64common = COPY $x0
206; CHECK:SU(3):   STRWui %1:gpr32, %0:gpr64common, 0
207; CHECK:SU(4):   %3:gpr32common = nsw ADDWri %2:gpr32common, 5, 0
208; CHECK:SU(5):   STRWui %3:gpr32common, %0:gpr64common, 1
; Stores %m to p[0] and (%n + 5) to p[1]. The second store depends on an add
; that the first store does not use.
; Per the expectations above, the two stores (SU(3), SU(5)) are clustered and
; predecessor edges for SU(4) (the add) and SU(2) (the copy of $x0) are copied
; as part of forming the cluster.
209define void @stp_missing_preds_edges(i32* %p, i32 %m, i32 %n) {
210entry:
  ; p[0] = m
211  store i32 %m, i32* %p, align 4
  ; p[1] = n + 5
212  %add = add nsw i32 %n, 5
213  %arrayidx1 = getelementptr inbounds i32, i32* %p, i64 1
214  store i32 %add, i32* %arrayidx1, align 4
215  ret void
216}
217
218; Verify that SU(4) and SU(7) can be clustered even though they have
219; different preds
220; CHECK: ********** MI Scheduling **********
221; CHECK-LABEL: cluster_with_different_preds:%bb.0
222; CHECK:Cluster ld/st SU(4) - SU(7)
223; CHECK:SU(3):   STRWui %2:gpr32, %0:gpr64common, 0 ::
224; CHECK:SU(4):   %3:gpr32 = LDRWui %1:gpr64common, 0 ::
225; CHECK:Predecessors:
226; CHECK: SU(3): Ord  Latency=1 Memory
227; CHECK:SU(6):   STRBBui %4:gpr32, %1:gpr64common, 4 ::
228; CHECK:SU(7):   %5:gpr32 = LDRWui %1:gpr64common, 1 ::
229; CHECK:Predecessors:
230; CHECK:SU(6): Ord  Latency=1 Memory
231; CHECK-FAST: cluster_with_different_preds:%bb.0
232; CHECK-FAST-NOT: Cluster ld/st
233; CHECK-FAST:SU(3):   STRWui %2:gpr32, %0:gpr64common, 0 ::
234; CHECK-FAST:SU(4):   %3:gpr32 = LDRWui %1:gpr64common, 0 ::
; Interleaves stores and loads: stores 3 to *p, loads q[0], stores the byte 5
; to the first byte of q[1] (through an i8* bitcast), loads q[1], and returns
; q[0] + q[1] as loaded.
; Per the expectations above, the two loads (SU(4), SU(7)) are clustered in the
; default run even though each has a different memory-ordering predecessor
; (the store in SU(3) and the byte store in SU(6) respectively). In the run
; with -force-fast-cluster no ld/st cluster message is expected.
235define i32 @cluster_with_different_preds(i32* %p, i32* %q) {
236entry:
  ; *p = 3
237  store i32 3, i32* %p, align 4
  ; first load: q[0]
238  %0 = load i32, i32* %q, align 4
  ; byte store into the first byte of q[1]
239  %add.ptr = getelementptr inbounds i32, i32* %q, i64 1
240  %1 = bitcast i32* %add.ptr to i8*
241  store i8 5, i8* %1, align 1
  ; second load: q[1], issued after the byte store above
242  %2 = load i32, i32* %add.ptr, align 4
243  %add = add nsw i32 %2, %0
244  ret i32 %add
245}
246