; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s
;
; This file tests the look-ahead operand reordering heuristic.
;
;
; This checks that operand reordering takes into consideration instructions
; beyond the immediate predecessors when reordering the operands of the adds.
;
; A[0] B[0] C[0] D[0]  C[1] D[1] A[1] B[1]
;     \  /   \  /          \  /   \  /
;       -     -              -     -
;        \   /                \   /
;          +                    +
;          |                    |
;         S[0]                 S[1]
;
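; A rough C equivalent of the IR below (function and variable names are
; illustrative only). Note that the add feeding S[1] lists its operands in
; the opposite order to the one feeding S[0], which the reordering must undo:
;
;   void lookahead_basic(double *array) {
;     double A0 = array[0], A1 = array[1];
;     double B0 = array[2], B1 = array[3];
;     double C0 = array[4], C1 = array[5];
;     double D0 = array[6], D1 = array[7];
;     array[0] = (A0 - B0) + (C0 - D0); /* S[0] */
;     array[1] = (C1 - D1) + (A1 - B1); /* S[1]: operand order swapped */
;   }
;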
define void @lookahead_basic(double* %array) {
; CHECK-LABEL: @lookahead_basic(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2
; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 3
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP9:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[TMP10:%.*]] = fadd fast <2 x double> [[TMP8]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %idx2 = getelementptr inbounds double, double* %array, i64 2
  %idx3 = getelementptr inbounds double, double* %array, i64 3
  %idx4 = getelementptr inbounds double, double* %array, i64 4
  %idx5 = getelementptr inbounds double, double* %array, i64 5
  %idx6 = getelementptr inbounds double, double* %array, i64 6
  %idx7 = getelementptr inbounds double, double* %array, i64 7

  %A_0 = load double, double *%idx0, align 8
  %A_1 = load double, double *%idx1, align 8
  %B_0 = load double, double *%idx2, align 8
  %B_1 = load double, double *%idx3, align 8
  %C_0 = load double, double *%idx4, align 8
  %C_1 = load double, double *%idx5, align 8
  %D_0 = load double, double *%idx6, align 8
  %D_1 = load double, double *%idx7, align 8

  %subAB_0 = fsub fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %subAB_1 = fsub fast double %A_1, %B_1
  %subCD_1 = fsub fast double %C_1, %D_1

  %addABCD_0 = fadd fast double %subAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %subCD_1, %subAB_1

  store double %addABCD_0, double *%idx0, align 8
  store double %addCDAB_1, double *%idx1, align 8
  ret void
}


; Check whether the look-ahead operand reordering heuristic will avoid
; bundling the alt opcodes. The vectorized code should have no shuffles.
;
; A[0] B[0] A[0] B[0]  A[1] B[1] A[1] B[1]
;     \  /   \  /          \  /   \  /
;       +     -              -     +
;        \   /                \   /
;          +                    +
;          |                    |
;         S[0]                 S[1]
;
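; A rough C equivalent (illustrative names); each lane sums the fadd and
; fsub of the same A/B pair, but in opposite orders:
;
;   void lookahead_alt1(double *array) {
;     double A0 = array[0], A1 = array[1];
;     double B0 = array[2], B1 = array[3];
;     array[0] = (A0 + B0) + (A0 - B0); /* S[0]: add, then sub */
;     array[1] = (A1 - B1) + (A1 + B1); /* S[1]: sub, then add */
;   }
;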
define void @lookahead_alt1(double* %array) {
; CHECK-LABEL: @lookahead_alt1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2
; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 3
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %idx2 = getelementptr inbounds double, double* %array, i64 2
  %idx3 = getelementptr inbounds double, double* %array, i64 3
  %idx4 = getelementptr inbounds double, double* %array, i64 4
  %idx5 = getelementptr inbounds double, double* %array, i64 5
  %idx6 = getelementptr inbounds double, double* %array, i64 6
  %idx7 = getelementptr inbounds double, double* %array, i64 7

  %A_0 = load double, double *%idx0, align 8
  %A_1 = load double, double *%idx1, align 8
  %B_0 = load double, double *%idx2, align 8
  %B_1 = load double, double *%idx3, align 8

  %addAB_0_L = fadd fast double %A_0, %B_0
  %subAB_0_R = fsub fast double %A_0, %B_0

  %subAB_1_L = fsub fast double %A_1, %B_1
  %addAB_1_R = fadd fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0_L, %subAB_0_R
  %addCDAB_1 = fadd fast double %subAB_1_L, %addAB_1_R

  store double %addABCD_0, double *%idx0, align 8
  store double %addCDAB_1, double *%idx1, align 8
  ret void
}


; This code should get vectorized all the way down to the loads, with
; shuffles for the alt opcodes.
;
; A[0] B[0] C[0] D[0]  C[1] D[1] A[1] B[1]
;     \  /   \  /          \  /   \  /
;       +     -              +     -
;        \   /                \   /
;          +                    +
;          |                    |
;         S[0]                 S[1]
;
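; A rough C equivalent (illustrative names); each vector lane mixes an add
; and a sub among its leaves, so the vectorizer forms alternating add/sub
; bundles and blends them with shuffles:
;
;   void lookahead_alt2(double *array) {
;     double A0 = array[0], A1 = array[1];
;     double B0 = array[2], B1 = array[3];
;     double C0 = array[4], C1 = array[5];
;     double D0 = array[6], D1 = array[7];
;     array[0] = (A0 + B0) + (C0 - D0); /* S[0] */
;     array[1] = (C1 + D1) + (A1 - B1); /* S[1] */
;   }
;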
define void @lookahead_alt2(double* %array) {
; CHECK-LABEL: @lookahead_alt2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[IDX2:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 2
; CHECK-NEXT:    [[IDX3:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 3
; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 4
; CHECK-NEXT:    [[IDX5:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 5
; CHECK-NEXT:    [[IDX6:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 6
; CHECK-NEXT:    [[IDX7:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 7
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[IDX2]] to <2 x double>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[IDX4]] to <2 x double>*
; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDX6]] to <2 x double>*
; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8
; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]]
; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP11:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP12:%.*]] = fsub fast <2 x double> [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT:    [[TMP14:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP10]]
; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %idx2 = getelementptr inbounds double, double* %array, i64 2
  %idx3 = getelementptr inbounds double, double* %array, i64 3
  %idx4 = getelementptr inbounds double, double* %array, i64 4
  %idx5 = getelementptr inbounds double, double* %array, i64 5
  %idx6 = getelementptr inbounds double, double* %array, i64 6
  %idx7 = getelementptr inbounds double, double* %array, i64 7

  %A_0 = load double, double *%idx0, align 8
  %A_1 = load double, double *%idx1, align 8
  %B_0 = load double, double *%idx2, align 8
  %B_1 = load double, double *%idx3, align 8
  %C_0 = load double, double *%idx4, align 8
  %C_1 = load double, double *%idx5, align 8
  %D_0 = load double, double *%idx6, align 8
  %D_1 = load double, double *%idx7, align 8

  %addAB_0 = fadd fast double %A_0, %B_0
  %subCD_0 = fsub fast double %C_0, %D_0

  %addCD_1 = fadd fast double %C_1, %D_1
  %subAB_1 = fsub fast double %A_1, %B_1

  %addABCD_0 = fadd fast double %addAB_0, %subCD_0
  %addCDAB_1 = fadd fast double %addCD_1, %subAB_1

  store double %addABCD_0, double *%idx0, align 8
  store double %addCDAB_1, double *%idx1, align 8
  ret void
}


;
; A[0] B[0] C[0] D[0]  A[1] B[2] A[2] B[1]
;     \  /   \  /       /  \  /   \  /
;       -     -        U     -     -
;        \   /                \   /
;          +                    +
;          |                    |
;         S[0]                 S[1]
;
; SLP should reorder the operands of the RHS add, taking into consideration
; the cost of external uses. It is more profitable to reorder the operands of
; the RHS add because A[1] has an external use ('U' above).

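; A rough C equivalent (illustrative names; the unused %Ext2 parameter is
; omitted for brevity):
;
;   void lookahead_external_uses(double *A, double *B, double *C, double *D,
;                                double *S, double *Ext1) {
;     S[0] = (A[0] - B[0]) + (C[0] - D[0]);
;     S[1] = (A[1] - B[2]) + (A[2] - B[1]);
;     *Ext1 = A[1]; /* the external use that biases the reordering */
;   }
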
define void @lookahead_external_uses(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2) {
; CHECK-LABEL: @lookahead_external_uses(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
; CHECK-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
; CHECK-NEXT:    [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double*> undef, double* [[A]], i32 0
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double*> [[TMP0]], double* [[A]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, <2 x double*> [[TMP1]], <2 x i64> <i64 0, i64 2>
; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
; CHECK-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, double* [[IDXB2]], align 8
; CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[TMP2]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double*> [[TMP2]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[IDXB0]] to <2 x double>*
; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8
; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A1]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> undef, double [[D0]], i32 0
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[B2]], i32 1
; CHECK-NEXT:    [[TMP11:%.*]] = fsub fast <2 x double> [[TMP8]], [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP6]]
; CHECK-NEXT:    [[TMP13:%.*]] = fadd fast <2 x double> [[TMP12]], [[TMP11]]
; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
; CHECK-NEXT:    [[TMP14:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8
; CHECK-NEXT:    store double [[A1]], double* [[EXT1:%.*]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %IdxA0 = getelementptr inbounds double, double* %A, i64 0
  %IdxB0 = getelementptr inbounds double, double* %B, i64 0
  %IdxC0 = getelementptr inbounds double, double* %C, i64 0
  %IdxD0 = getelementptr inbounds double, double* %D, i64 0

  %IdxA1 = getelementptr inbounds double, double* %A, i64 1
  %IdxB2 = getelementptr inbounds double, double* %B, i64 2
  %IdxA2 = getelementptr inbounds double, double* %A, i64 2
  %IdxB1 = getelementptr inbounds double, double* %B, i64 1

  %A0 = load double, double *%IdxA0, align 8
  %B0 = load double, double *%IdxB0, align 8
  %C0 = load double, double *%IdxC0, align 8
  %D0 = load double, double *%IdxD0, align 8

  %A1 = load double, double *%IdxA1, align 8
  %B2 = load double, double *%IdxB2, align 8
  %A2 = load double, double *%IdxA2, align 8
  %B1 = load double, double *%IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS0 = getelementptr inbounds double, double* %S, i64 0
  %IdxS1 = getelementptr inbounds double, double* %S, i64 1

  store double %add0, double *%IdxS0, align 8
  store double %add1, double *%IdxS1, align 8

  ; External use
  store double %A1, double *%Ext1, align 8
  ret void
}

; A[0] B[0] C[0] D[0]  A[1] B[2] A[2] B[1]
;     \  /   \  /       /  \  /   \  / \
;       -     -    U1,U2,U3  -     -  U4,U5
;        \   /                \   /
;          +                    +
;          |                    |
;         S[0]                 S[1]
;
;
; If we limit the users budget for the look-ahead heuristic to 2, then the
; look-ahead heuristic has no way of choosing B[1] (with 2 external users)
; over A[1] (with 3 external users).
; The result is that the operands of the Add are not reordered and the loads
; from A get vectorized instead of the loads from B.
;
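; A rough C equivalent (illustrative names):
;
;   void lookahead_limit_users_budget(double *A, double *B, double *C,
;                                     double *D, double *S, double *Ext1,
;                                     double *Ext2, double *Ext3,
;                                     double *Ext4, double *Ext5) {
;     S[0] = (A[0] - B[0]) + (C[0] - D[0]);
;     S[1] = (A[1] - B[2]) + (A[2] - B[1]);
;     *Ext1 = *Ext2 = *Ext3 = A[1]; /* three external users of A[1] */
;     *Ext4 = *Ext5 = B[1];         /* two external users of B[1] */
;   }
;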
define void @lookahead_limit_users_budget(double* %A, double *%B, double *%C, double *%D, double *%S, double *%Ext1, double *%Ext2, double *%Ext3, double *%Ext4, double *%Ext5) {
; CHECK-LABEL: @lookahead_limit_users_budget(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
; CHECK-NEXT:    [[IDXB0:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 0
; CHECK-NEXT:    [[IDXC0:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 0
; CHECK-NEXT:    [[IDXD0:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 0
; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
; CHECK-NEXT:    [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2
; CHECK-NEXT:    [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2
; CHECK-NEXT:    [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1
; CHECK-NEXT:    [[A0:%.*]] = load double, double* [[IDXA0]], align 8
; CHECK-NEXT:    [[B0:%.*]] = load double, double* [[IDXB0]], align 8
; CHECK-NEXT:    [[C0:%.*]] = load double, double* [[IDXC0]], align 8
; CHECK-NEXT:    [[D0:%.*]] = load double, double* [[IDXD0]], align 8
; CHECK-NEXT:    [[A1:%.*]] = load double, double* [[IDXA1]], align 8
; CHECK-NEXT:    [[B2:%.*]] = load double, double* [[IDXB2]], align 8
; CHECK-NEXT:    [[A2:%.*]] = load double, double* [[IDXA2]], align 8
; CHECK-NEXT:    [[B1:%.*]] = load double, double* [[IDXB1]], align 8
; CHECK-NEXT:    [[SUBA0B0:%.*]] = fsub fast double [[A0]], [[B0]]
; CHECK-NEXT:    [[SUBC0D0:%.*]] = fsub fast double [[C0]], [[D0]]
; CHECK-NEXT:    [[SUBA1B2:%.*]] = fsub fast double [[A1]], [[B2]]
; CHECK-NEXT:    [[SUBA2B1:%.*]] = fsub fast double [[A2]], [[B1]]
; CHECK-NEXT:    [[ADD0:%.*]] = fadd fast double [[SUBA0B0]], [[SUBC0D0]]
; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast double [[SUBA1B2]], [[SUBA2B1]]
; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
; CHECK-NEXT:    store double [[ADD0]], double* [[IDXS0]], align 8
; CHECK-NEXT:    store double [[ADD1]], double* [[IDXS1]], align 8
; CHECK-NEXT:    store double [[A1]], double* [[EXT1:%.*]], align 8
; CHECK-NEXT:    store double [[A1]], double* [[EXT2:%.*]], align 8
; CHECK-NEXT:    store double [[A1]], double* [[EXT3:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], double* [[EXT4:%.*]], align 8
; CHECK-NEXT:    store double [[B1]], double* [[EXT5:%.*]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %IdxA0 = getelementptr inbounds double, double* %A, i64 0
  %IdxB0 = getelementptr inbounds double, double* %B, i64 0
  %IdxC0 = getelementptr inbounds double, double* %C, i64 0
  %IdxD0 = getelementptr inbounds double, double* %D, i64 0

  %IdxA1 = getelementptr inbounds double, double* %A, i64 1
  %IdxB2 = getelementptr inbounds double, double* %B, i64 2
  %IdxA2 = getelementptr inbounds double, double* %A, i64 2
  %IdxB1 = getelementptr inbounds double, double* %B, i64 1

  %A0 = load double, double *%IdxA0, align 8
  %B0 = load double, double *%IdxB0, align 8
  %C0 = load double, double *%IdxC0, align 8
  %D0 = load double, double *%IdxD0, align 8

  %A1 = load double, double *%IdxA1, align 8
  %B2 = load double, double *%IdxB2, align 8
  %A2 = load double, double *%IdxA2, align 8
  %B1 = load double, double *%IdxB1, align 8

  %subA0B0 = fsub fast double %A0, %B0
  %subC0D0 = fsub fast double %C0, %D0

  %subA1B2 = fsub fast double %A1, %B2
  %subA2B1 = fsub fast double %A2, %B1

  %add0 = fadd fast double %subA0B0, %subC0D0
  %add1 = fadd fast double %subA1B2, %subA2B1

  %IdxS0 = getelementptr inbounds double, double* %S, i64 0
  %IdxS1 = getelementptr inbounds double, double* %S, i64 1

  store double %add0, double *%IdxS0, align 8
  store double %add1, double *%IdxS1, align 8

  ; External uses of A1
  store double %A1, double *%Ext1, align 8
  store double %A1, double *%Ext2, align 8
  store double %A1, double *%Ext3, align 8

  ; External uses of B1
  store double %B1, double *%Ext4, align 8
  store double %B1, double *%Ext5, align 8

  ret void
}

; This checks that the look-ahead code does not crash when instructions with
; the same opcode have different numbers of operands (in this case the calls).

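; The callees roughly correspond to C along the lines of the following
; (names are illustrative; the point is two calls with different argument
; counts feeding the two lanes):
;
;   struct Class { char c; };
;   double ay(struct Class *);   /* call that takes an argument */
;   double ax(void);             /* call that takes none */
;
;   void lookahead_crash(double *A, double *S, struct Class *Arg0) {
;     S[0] = A[0] + ay(Arg0);
;     S[1] = A[1] + ax();
;   }
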
%Class = type { i8 }
declare double @_ZN1i2ayEv(%Class*)
declare double @_ZN1i2axEv()

define void @lookahead_crash(double* %A, double *%S, %Class *%Arg0) {
; CHECK-LABEL: @lookahead_crash(
; CHECK-NEXT:    [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
; CHECK-NEXT:    [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDXA0]] to <2 x double>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8
; CHECK-NEXT:    [[C0:%.*]] = call double @_ZN1i2ayEv(%Class* [[ARG0:%.*]])
; CHECK-NEXT:    [[C1:%.*]] = call double @_ZN1i2axEv()
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[C1]], i32 1
; CHECK-NEXT:    [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]]
; CHECK-NEXT:    [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0
; CHECK-NEXT:    [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8
; CHECK-NEXT:    ret void
;
  %IdxA0 = getelementptr inbounds double, double* %A, i64 0
  %IdxA1 = getelementptr inbounds double, double* %A, i64 1

  %A0 = load double, double *%IdxA0, align 8
  %A1 = load double, double *%IdxA1, align 8

  %C0 = call double @_ZN1i2ayEv(%Class *%Arg0)
  %C1 = call double @_ZN1i2axEv()

  %add0 = fadd fast double %A0, %C0
  %add1 = fadd fast double %A1, %C1

  %IdxS0 = getelementptr inbounds double, double* %S, i64 0
  %IdxS1 = getelementptr inbounds double, double* %S, i64 1
  store double %add0, double *%IdxS0, align 8
  store double %add1, double *%IdxS1, align 8
  ret void
}

; This checks that we choose to group consecutive extracts from the same vectors.
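; A rough C equivalent, treating each <2 x double> as a two-element array
; (illustrative names). Both elements of v1 and both elements of v2 are used,
; so grouping the consecutive extracts from the same vector is profitable:
;
;   void ChecksExtractScores(double *storeArray, double *array,
;                            const double v1[2], const double v2[2]) {
;     storeArray[0] = v1[0] * array[0] + v2[0] * array[1];
;     storeArray[1] = v1[1] * array[0] + v2[1] * array[1];
;   }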
define void @ChecksExtractScores(double* %storeArray, double* %array, <2 x double> *%vecPtr1, <2 x double>* %vecPtr2) {
; CHECK-LABEL: @ChecksExtractScores(
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[LOADA0:%.*]] = load double, double* [[IDX0]], align 4
; CHECK-NEXT:    [[LOADA1:%.*]] = load double, double* [[IDX1]], align 4
; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4
; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[LOADA0]], i32 0
; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[LOADA0]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> undef, double [[LOADA1]], i32 0
; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[LOADA1]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]]
; CHECK-NEXT:    [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
; CHECK-NEXT:    [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8
; CHECK-NEXT:    ret void
;
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %loadA0 = load double, double* %idx0, align 4
  %loadA1 = load double, double* %idx1, align 4

  %loadVec = load <2 x double>, <2 x double>* %vecPtr1, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec, i32 1
  %loadVec2 = load <2 x double>, <2 x double>* %vecPtr2, align 4
  %extrB0 = extractelement <2 x double> %loadVec2, i32 0
  %extrB1 = extractelement <2 x double> %loadVec2, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx0 = getelementptr inbounds double, double* %storeArray, i64 0
  %sidx1 = getelementptr inbounds double, double* %storeArray, i64 1
  store double %add0, double *%sidx0, align 8
  store double %add1, double *%sidx1, align 8
  ret void
}


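; The next two tests use extractelement indices that are undef or a runtime
; value, so the look-ahead scoring of extracts must not assume the index is a
; ConstantInt. @foo below is the same pattern with constant indices.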
define i1 @ExtractIdxNotConstantInt1(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @ExtractIdxNotConstantInt1(
; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 undef
; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; CHECK-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; CHECK-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; CHECK-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; CHECK-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; CHECK-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; CHECK-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 undef
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}


define i1 @ExtractIdxNotConstantInt2(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @ExtractIdxNotConstantInt2(
; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 1
; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; CHECK-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; CHECK-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 [[IDX2:%.*]]
; CHECK-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; CHECK-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; CHECK-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; CHECK-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 1
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 %idx2
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}


define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) {
; CHECK-LABEL: @foo(
; CHECK-NEXT:    [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0
; CHECK-NEXT:    [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]]
; CHECK-NEXT:    [[FM:%.*]] = fmul float [[A:%.*]], [[SUB14_I167]]
; CHECK-NEXT:    [[SUB25_I168:%.*]] = fsub float [[FM]], [[B:%.*]]
; CHECK-NEXT:    [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1
; CHECK-NEXT:    [[ADD36_I173:%.*]] = fadd float [[SUB25_I168]], 1.000000e+01
; CHECK-NEXT:    [[MUL72_I179:%.*]] = fmul float [[C:%.*]], [[VECEXT_I276_I169]]
; CHECK-NEXT:    [[ADD78_I180:%.*]] = fsub float [[MUL72_I179]], 3.000000e+01
; CHECK-NEXT:    [[ADD79_I181:%.*]] = fadd float 2.000000e+00, [[ADD78_I180]]
; CHECK-NEXT:    [[MUL123_I184:%.*]] = fmul float [[ADD36_I173]], [[ADD79_I181]]
; CHECK-NEXT:    [[CMP_I185:%.*]] = fcmp ogt float [[MUL123_I184]], 0.000000e+00
; CHECK-NEXT:    ret i1 [[CMP_I185]]
;
  %vecext.i291.i166 = extractelement <4 x float> %vec, i64 0
  %sub14.i167 = fsub float undef, %vecext.i291.i166
  %fm = fmul float %a, %sub14.i167
  %sub25.i168 = fsub float %fm, %b
  %vecext.i276.i169 = extractelement <4 x float> %vec, i64 1
  %add36.i173 = fadd float %sub25.i168, 10.0
  %mul72.i179 = fmul float %c, %vecext.i276.i169
  %add78.i180 = fsub float %mul72.i179, 30.0
  %add79.i181 = fadd float 2.0, %add78.i180
  %mul123.i184 = fmul float %add36.i173, %add79.i181
  %cmp.i185 = fcmp ogt float %mul123.i184, 0.000000e+00
  ret i1 %cmp.i185
}


; Same as @ChecksExtractScores, but the extractelement vector operands do not match.
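; A rough C equivalent (illustrative names); the extracts now come from four
; different vectors, so there are no consecutive same-vector extracts to group:
;
;   void ChecksExtractScores_different_vectors(
;       double *storeArray, double *array, const double v1[2],
;       const double v2[2], const double v3[2], const double v4[2]) {
;     storeArray[0] = v1[0] * array[0] + v3[0] * array[1];
;     storeArray[1] = v2[1] * array[0] + v4[1] * array[1];
;   }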
define void @ChecksExtractScores_different_vectors(double* %storeArray, double* %array, <2 x double> *%vecPtr1, <2 x double>* %vecPtr2, <2 x double>* %vecPtr3, <2 x double>* %vecPtr4) {
; CHECK-LABEL: @ChecksExtractScores_different_vectors(
; CHECK-NEXT:    [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0
; CHECK-NEXT:    [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1
; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>*
; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
; CHECK-NEXT:    [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4
; CHECK-NEXT:    [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4
; CHECK-NEXT:    [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0
; CHECK-NEXT:    [[EXTRA1:%.*]] = extractelement <2 x double> [[LOADVEC2]], i32 1
; CHECK-NEXT:    [[LOADVEC3:%.*]] = load <2 x double>, <2 x double>* [[VECPTR3:%.*]], align 4
; CHECK-NEXT:    [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4
; CHECK-NEXT:    [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0
; CHECK-NEXT:    [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[EXTRB0]], i32 0
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRA1]], i32 1
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1
; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x double> [[TMP4]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> undef, double [[EXTRA0]], i32 0
; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[EXTRB1]], i32 1
; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x double> [[TMP11]], [[TMP2]]
; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x double> [[TMP12]], [[TMP9]]
; CHECK-NEXT:    [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
; CHECK-NEXT:    [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
; CHECK-NEXT:    [[TMP14:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
; CHECK-NEXT:    store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8
; CHECK-NEXT:    ret void
;
  %idx0 = getelementptr inbounds double, double* %array, i64 0
  %idx1 = getelementptr inbounds double, double* %array, i64 1
  %loadA0 = load double, double* %idx0, align 4
  %loadA1 = load double, double* %idx1, align 4

  %loadVec = load <2 x double>, <2 x double>* %vecPtr1, align 4
  %loadVec2 = load <2 x double>, <2 x double>* %vecPtr2, align 4
  %extrA0 = extractelement <2 x double> %loadVec, i32 0
  %extrA1 = extractelement <2 x double> %loadVec2, i32 1
  %loadVec3 = load <2 x double>, <2 x double>* %vecPtr3, align 4
  %loadVec4 = load <2 x double>, <2 x double>* %vecPtr4, align 4
  %extrB0 = extractelement <2 x double> %loadVec3, i32 0
  %extrB1 = extractelement <2 x double> %loadVec4, i32 1

  %mul0 = fmul double %extrA0, %loadA0
  %mul1 = fmul double %extrA1, %loadA0
  %mul3 = fmul double %extrB0, %loadA1
  %mul4 = fmul double %extrB1, %loadA1
  %add0 = fadd double %mul0, %mul3
  %add1 = fadd double %mul1, %mul4

  %sidx0 = getelementptr inbounds double, double* %storeArray, i64 0
  %sidx1 = getelementptr inbounds double, double* %storeArray, i64 1
  store double %add0, double *%sidx0, align 8
  store double %add1, double *%sidx1, align 8
  ret void
}