; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -early-cse -earlycse-debug-hash | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -basic-aa -early-cse-memssa | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -passes=early-cse | FileCheck %s
; RUN: opt < %s -S -mtriple=aarch64-none-linux-gnu -mattr=+neon -aa-pipeline=basic-aa -passes=early-cse-memssa | FileCheck %s
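
; The functions below correspond roughly to a C pattern like the following
; (a sketch for orientation only, assuming the arm_neon.h intrinsics
; vst2q_s32/vld2q_s32/vaddq_s32; it is not part of the original test):
;
;   #include <arm_neon.h>
;   int32x4_t test_cse(int32_t *a, int32x4x2_t s, int n) {
;     int32x4_t res = vdupq_n_s32(0);
;     for (int i = 0; i < n; ++i) {
;       vst2q_s32(a, s);               // becomes @llvm.aarch64.neon.st2
;       int32x4x2_t v = vld2q_s32(a);  // becomes @llvm.aarch64.neon.ld2
;       res = vaddq_s32(v.val[0], v.val[0]);
;     }
;     return res;
;   }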

define <4 x i32> @test_cse(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the first @llvm.aarch64.neon.st2 is optimized away by Early CSE.
; CHECK-LABEL: @test_cse2
; CHECK-NOT: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %s.coerce.fca.0.extract, <4 x i32> %s.coerce.fca.1.extract, i8* %0)
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_cse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) #0 {
entry:
; Check that the second, redundant @llvm.aarch64.neon.ld2 is optimized away by
; Early CSE, leaving only the first.
; CHECK-LABEL: @test_cse3
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %0)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %1 = bitcast i32* %a to i8*
  %vld22 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %1)
  %vld22.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 0
  %vld22.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld22, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld22.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}


define <4 x i32> @test_nocse(i32* %a, i32* %b, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that the store prevents @llvm.aarch64.neon.ld2 from being optimized
; away by Early CSE.
; CHECK-LABEL: @test_nocse
; CHECK: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  store i32 0, i32* %b, align 4
  %5 = bitcast i32* %a to i8*
  %vld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %5)
  %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
  %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld2.fca.0.extract, <4 x i32> %vld2.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse2(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.ld3 is not optimized away by Early CSE due to
; the mismatch between st2 and ld3.
; CHECK-LABEL: @test_nocse2
; CHECK: call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %4, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.2.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

define <4 x i32> @test_nocse3(i32* %a, [2 x <4 x i32>] %s.coerce, i32 %n) {
entry:
; Check that @llvm.aarch64.neon.st3 is not optimized away by Early CSE due to
; the mismatch between st2 and st3.
; CHECK-LABEL: @test_nocse3
; CHECK: call void @llvm.aarch64.neon.st3.v4i32.p0i8
; CHECK: call void @llvm.aarch64.neon.st2.v4i32.p0i8
  %s.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %s.coerce, 0
  %s.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %s.coerce, 1
  br label %for.cond

for.cond:                                         ; preds = %for.body, %entry
  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %res.0 = phi <4 x i32> [ undef, %entry ], [ %call, %for.body ]
  %cmp = icmp slt i32 %i.0, %n
  br i1 %cmp, label %for.body, label %for.end

for.body:                                         ; preds = %for.cond
  %0 = bitcast i32* %a to i8*
  %1 = bitcast <4 x i32> %s.coerce.fca.0.extract to <16 x i8>
  %2 = bitcast <4 x i32> %s.coerce.fca.1.extract to <16 x i8>
  %3 = bitcast <16 x i8> %1 to <4 x i32>
  %4 = bitcast <16 x i8> %2 to <4 x i32>
  call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> %4, <4 x i32> %3, <4 x i32> %3, i8* %0)
  call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> %3, <4 x i32> %3, i8* %0)
  %5 = bitcast i32* %a to i8*
  %vld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8* %5)
  %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
  %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
  %call = call <4 x i32> @vaddq_s32(<4 x i32> %vld3.fca.0.extract, <4 x i32> %vld3.fca.0.extract)
  %inc = add nsw i32 %i.0, 1
  br label %for.cond

for.end:                                          ; preds = %for.cond
  ret <4 x i32> %res.0
}

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32>, <4 x i32>, <4 x i32>, i8* nocapture)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8*)

; Function Attrs: nounwind readonly
declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i8(i8*)

define internal fastcc <4 x i32> @vaddq_s32(<4 x i32> %__p0, <4 x i32> %__p1) {
entry:
  %add = add <4 x i32> %__p0, %__p1
  ret <4 x i32> %add
}