1; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse | FileCheck %s
2; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse | FileCheck %s
3
4define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
5entry:
6; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; Each loop iteration writes the two halves of %s.coerce to %a with st2 and
; then reloads from the same pointer with ld2, with no other memory write in
; between. The negative directive below requires that no ld2 call is left in
; this function's output.
7; CHECK-LABEL: @test_cse
8; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
9  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
10  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
11  br label %for.cond
12
13for.cond:                                         ; preds = %for.body, %entry
14  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
15  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
16  %cmp = icmp slt i32 %i.0, %n
17  br i1 %cmp, label %for.body, label %for.end
18
19for.body:                                         ; preds = %for.cond
20  %0 = bitcast i32* %a to i8*
  ; %3 and %4 are the two input vectors round-tripped through <16 x i8>.
21  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
22  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
23  %3 = bitcast <16 x i8> %1 to <4 x i32>
24  %4 = bitcast <16 x i8> %2 to <4 x i32>
25  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  ; %5 is a second bitcast of the same %a, so the ld2 reads what st2 just wrote.
26  %5 = bitcast i32* %a to i8*
27  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
28  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  ; Element 1 is extracted but never used; only element 0 feeds the add.
29  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
30  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
31  %inc = add nsw i32 %i.0, 1
32  br label %for.cond
33
34for.end:                                          ; preds = %for.cond
35  ret <4 x i32> %res.0
36}
37
38define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
39entry:
40; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; The loop body issues two back-to-back st2 calls to the same pointer: the
; first stores (%3, %3), the second stores (%3, %4). The directives below
; require the first form to be absent and the second form to be present.
; NOTE(review): the patterns spell out unnamed value numbers (%0, %3, %4), so
; they depend on how the optimized IR is numbered when printed - confirm
; whenever the input IR is edited.
41; CHECK-LABEL: @test_cse2
42; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
43; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
44  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
45  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
46  br label %for.cond
47
48for.cond:                                         ; preds = %for.body, %entry
49  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
50  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
51  %cmp = icmp slt i32 %i.0, %n
52  br i1 %cmp, label %for.body, label %for.end
53
54for.body:                                         ; preds = %for.cond
55  %0 = bitcast i32* %a to i8*
56  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
57  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
58  %3 = bitcast <16 x i8> %1 to <4 x i32>
59  %4 = bitcast <16 x i8> %2 to <4 x i32>
  ; First store: nothing reads %a before the next st2 writes the same pointer.
60  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  ; Second store: same pointer %0, different second operand (%4).
61  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
62  %5 = bitcast i32* %a to i8*
63  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
64  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
65  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
66  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
67  %inc = add nsw i32 %i.0, 1
68  br label %for.cond
69
70for.end:                                          ; preds = %for.cond
71  ret <4 x i32> %res.0
72}
73
74define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
75entry:
76; Check that one of the two identical @llvm.aarch64.neon.ld2 calls is optimized away by Early CSE.
; The loop body loads from %a with ld2 twice and performs no store in between.
; The directives below require exactly one ld2 call to remain: one positive
; match followed by a negative match for any further call.
; NOTE(review): this function references attribute group #0, but no
; definition of that group is visible in this file - confirm it is intended.
77; CHECK-LABEL: @test_cse3
78; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
79; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
80  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
81  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
82  br label %for.cond
83
84for.cond:                                         ; preds = %for.body, %entry
85  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
86  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
87  %cmp = icmp slt i32 %i.0, %n
88  br i1 %cmp, label %for.body, label %for.end
89
90for.body:                                         ; preds = %for.cond
91  %0 = bitcast i32* %a to i8*
92  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
93  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
94  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  ; %1 is another bitcast of the same %a, so this ld2 repeats the one above.
95  %1 = bitcast i32* %a to i8*
96  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
97  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
98  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
  ; Element 0 of each load result is consumed, so both loads have a user.
99  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
100  %inc = add nsw i32 %i.0, 1
101  br label %for.cond
102
103for.end:                                          ; preds = %for.cond
104  ret <4 x i32> %res.0
105}
106
107
108define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
109entry:
110; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
111; away by Early CSE.
; Same st2-then-ld2 shape as @test_cse, except that a plain store through the
; second pointer argument %b sits between the two. Neither %a nor %b carries a
; noalias attribute, so the directive below requires the ld2 call to remain.
112; CHECK-LABEL: @test_nocse
113; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
114  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
115  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
116  br label %for.cond
117
118for.cond:                                         ; preds = %for.body, %entry
119  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
120  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
121  %cmp = icmp slt i32 %i.0, %n
122  br i1 %cmp, label %for.body, label %for.end
123
124for.body:                                         ; preds = %for.cond
125  %0 = bitcast i32* %a to i8*
126  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
127  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
128  %3 = bitcast <16 x i8> %1 to <4 x i32>
129  %4 = bitcast <16 x i8> %2 to <4 x i32>
130  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  ; Intervening write through %b, placed between the st2 and the ld2.
131  store i32 0, i32* %b, align 4
132  %5 = bitcast i32* %a to i8*
133  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
134  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
135  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
136  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
137  %inc = add nsw i32 %i.0, 1
138  br label %for.cond
139
140for.end:                                          ; preds = %for.cond
141  ret <4 x i32> %res.0
142}
143
144define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
145entry:
146; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due
147; to mismatch between st2 and ld3.
; The loop body stores two vectors to %a with st2 and then loads three vectors
; from the same pointer with ld3. The directive below requires the ld3 call to
; remain in the output.
148; CHECK-LABEL: @test_nocse2
149; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
150  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
151  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
152  br label %for.cond
153
154for.cond:                                         ; preds = %for.body, %entry
155  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
156  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
157  %cmp = icmp slt i32 %i.0, %n
158  br i1 %cmp, label %for.body, label %for.end
159
160for.body:                                         ; preds = %for.cond
161  %0 = bitcast i32* %a to i8*
162  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
163  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
164  %3 = bitcast <16 x i8> %1 to <4 x i32>
165  %4 = bitcast <16 x i8> %2 to <4 x i32>
166  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  ; Same underlying pointer %a, but a three-vector load after a two-vector store.
167  %5 = bitcast i32* %a to i8*
168  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
169  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
170  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  ; Elements 0 and 2 of the ld3 result are the ones consumed.
171  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
172  %inc = add nsw i32 %i.0, 1
173  br label %for.cond
174
175for.end:                                          ; preds = %for.cond
176  ret <4 x i32> %res.0
177}
178
179define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
180entry:
181; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
182; mismatch between st2 and st3.
; The loop body writes three vectors to %a with st3 and then two vectors to
; the same pointer with st2. The directives below require both calls to remain
; in the output, st3 first and st2 after it. The trailing ld3 is not matched
; by any directive.
183; CHECK-LABEL: @test_nocse3
184; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
185; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
186  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
187  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
188  br label %for.cond
189
190for.cond:                                         ; preds = %for.body, %entry
191  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
192  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
193  %cmp = icmp slt i32 %i.0, %n
194  br i1 %cmp, label %for.body, label %for.end
195
196for.body:                                         ; preds = %for.cond
197  %0 = bitcast i32* %a to i8*
198  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
199  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
200  %3 = bitcast <16 x i8> %1 to <4 x i32>
201  %4 = bitcast <16 x i8> %2 to <4 x i32>
  ; Three-vector store followed by a two-vector store to the same pointer %0.
202  call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
203  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
204  %5 = bitcast i32* %a to i8*
205  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
206  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  ; Element 1 is extracted but never used; only element 0 feeds the add.
207  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
208  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
209  %inc = add nsw i32 %i.0, 1
210  br label %for.cond
211
212for.end:                                          ; preds = %for.cond
213  ret <4 x i32> %res.0
214}
215
; Declarations of the AArch64 NEON structured store (st2/st3) and load
; (ld2/ld3) intrinsics called by the test functions in this file, all on
; <4 x i32> vectors through an i8* pointer.
; NOTE(review): the "Function Attrs" comments below name attributes that are
; not spelled on the declarations themselves - presumably they are supplied
; by the intrinsic definitions; confirm.
216; Function Attrs: nounwind
217declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)
218
219; Function Attrs: nounwind
220declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)
221
222; Function Attrs: nounwind readonly
223declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)
224
225; Function Attrs: nounwind readonly
226declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)
227
; Local helper called by the test functions in this file to consume the loaded
; vectors: returns the element-wise sum %__p0 + %__p1.
; It uses the default calling convention so that it matches its call sites,
; none of which specify fastcc; a calling-convention mismatch between a call
; and its callee is undefined behaviour in LLVM IR.
228define internal <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
229entry:
230  %add = add <4 x i32> %__p0, %__p1
231  ret <4 x i32> %add
232}
233