1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled %s -o - | FileCheck %s
3
; sext_i8: 8-lane predicated loop. Each iteration does a masked load of
; <8 x i8> from %b, sign-extends it to <8 x i16>, adds it to a masked load of
; <8 x i16> from %a, and stores the sum back to the same %a address under the
; same mask. The FileCheck assertions below expect a dlstp.16/letp
; tail-predicated loop whose byte load is the extending form vldrb.s16.
4define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* nocapture readonly %b, i32 %N) {
5; CHECK-LABEL: sext_i8:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    push {r7, lr}
8; CHECK-NEXT:    cmp r2, #0
9; CHECK-NEXT:    it eq
10; CHECK-NEXT:    popeq {r7, pc}
11; CHECK-NEXT:  .LBB0_1: @ %vector.ph
12; CHECK-NEXT:    movs r3, #0
13; CHECK-NEXT:    dlstp.16 lr, r2
14; CHECK-NEXT:  .LBB0_2: @ %vector.body
15; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
16; CHECK-NEXT:    adds r3, #8
17; CHECK-NEXT:    vldrb.s16 q0, [r1], #8
18; CHECK-NEXT:    vldrh.u16 q1, [r0]
19; CHECK-NEXT:    vadd.i16 q0, q1, q0
20; CHECK-NEXT:    vstrh.16 q0, [r0], #16
21; CHECK-NEXT:    letp lr, .LBB0_2
22; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
23; CHECK-NEXT:    pop {r7, pc}
24entry:
  ; Skip the loop entirely when %N is zero.
25  %cmp8 = icmp eq i32 %N, 0
26  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
27
28vector.ph:                                        ; preds = %entry
  ; %n.vec is %N + 7 with the low three bits cleared, i.e. %N rounded up to a
  ; multiple of the 8-lane vector width; it is the loop exit bound.
29  %n.rnd.up = add i32 %N, 7
30  %n.vec = and i32 %n.rnd.up, -8
  ; The splat of %N - 1 below is referenced only by the commented-out icmp in
  ; vector.body.
31  %trip.count.minus.1 = add i32 %N, -1
32  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
33  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
34  br label %vector.body
35
36vector.body:                                      ; preds = %vector.body, %vector.ph
37  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; %induction (and the splat feeding it) is likewise referenced only by the
  ; commented-out icmp.
38  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
39  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
40  %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
41  %0 = getelementptr inbounds i8, i8* %b, i32 %index
42
  ; The predicate %1 comes from the lane-mask intrinsic call; the icmp form is
  ; kept only as a comment and is not part of the IR.
43  ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11
44  %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
45
46  %2 = bitcast i8* %0 to <8 x i8>*
47  %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
  ; Sign extension of the narrow load: the operation this test is about.
48  %3 = sext <8 x i8> %wide.masked.load to <8 x i16>
49  %4 = getelementptr inbounds i16, i16* %a, i32 %index
50  %5 = bitcast i16* %4 to <8 x i16>*
51  %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %5, i32 2, <8 x i1> %1, <8 x i16> undef)
52  %6 = add <8 x i16> %wide.masked.load12, %3
  ; %7 is a second bitcast of the same %4, so the store writes back to the
  ; address that %wide.masked.load12 read from.
53  %7 = bitcast i16* %4 to <8 x i16>*
54  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %6, <8 x i16>* %7, i32 2, <8 x i1> %1)
  ; Advance by one vector (8 elements) and exit once %n.vec is reached.
55  %index.next = add i32 %index, 8
56  %8 = icmp eq i32 %index.next, %n.vec
57  br i1 %8, label %for.cond.cleanup, label %vector.body
58
59for.cond.cleanup:                                 ; preds = %vector.body, %entry
60  ret void
61}
62
63; Function Attrs: nofree norecurse nounwind
; zext_i8: 8-lane predicated loop. Each iteration does a masked load of
; <8 x i8> from %b, zero-extends it to <8 x i16>, adds it to a masked load of
; <8 x i16> from %a, and stores the sum back to the same %a address under the
; same mask. The FileCheck assertions below expect a dlstp.16/letp
; tail-predicated loop whose byte load is the extending form vldrb.u16.
; NOTE(review): attribute group #0 is referenced here, but no definition of it
; is visible in this view of the file -- confirm whether that is intended.
64define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* nocapture readonly %b, i32 %N) local_unnamed_addr #0 {
65; CHECK-LABEL: zext_i8:
66; CHECK:       @ %bb.0: @ %entry
67; CHECK-NEXT:    push {r7, lr}
68; CHECK-NEXT:    cmp r2, #0
69; CHECK-NEXT:    it eq
70; CHECK-NEXT:    popeq {r7, pc}
71; CHECK-NEXT:  .LBB1_1: @ %vector.ph
72; CHECK-NEXT:    movs r3, #0
73; CHECK-NEXT:    dlstp.16 lr, r2
74; CHECK-NEXT:  .LBB1_2: @ %vector.body
75; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
76; CHECK-NEXT:    adds r3, #8
77; CHECK-NEXT:    vldrb.u16 q0, [r1], #8
78; CHECK-NEXT:    vldrh.u16 q1, [r0]
79; CHECK-NEXT:    vadd.i16 q0, q1, q0
80; CHECK-NEXT:    vstrh.16 q0, [r0], #16
81; CHECK-NEXT:    letp lr, .LBB1_2
82; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
83; CHECK-NEXT:    pop {r7, pc}
84entry:
  ; Skip the loop entirely when %N is zero.
85  %cmp8 = icmp eq i32 %N, 0
86  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
87
88vector.ph:                                        ; preds = %entry
  ; %n.vec is %N + 7 with the low three bits cleared, i.e. %N rounded up to a
  ; multiple of the 8-lane vector width; it is the loop exit bound.
89  %n.rnd.up = add i32 %N, 7
90  %n.vec = and i32 %n.rnd.up, -8
  ; The splat of %N - 1 below is referenced only by the commented-out icmp in
  ; vector.body.
91  %trip.count.minus.1 = add i32 %N, -1
92  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
93  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
94  br label %vector.body
95
96vector.body:                                      ; preds = %vector.body, %vector.ph
97  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; %induction (and the splat feeding it) is likewise referenced only by the
  ; commented-out icmp.
98  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
99  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
100  %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
101  %0 = getelementptr inbounds i8, i8* %b, i32 %index
102
  ; The predicate %1 comes from the lane-mask intrinsic call; the icmp form is
  ; kept only as a comment and is not part of the IR.
103  ; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11
104  %1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
105
106  %2 = bitcast i8* %0 to <8 x i8>*
107  %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
  ; Zero extension of the narrow load: the operation this test is about.
108  %3 = zext <8 x i8> %wide.masked.load to <8 x i16>
109  %4 = getelementptr inbounds i16, i16* %a, i32 %index
110  %5 = bitcast i16* %4 to <8 x i16>*
111  %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %5, i32 2, <8 x i1> %1, <8 x i16> undef)
112  %6 = add <8 x i16> %wide.masked.load12, %3
  ; %7 is a second bitcast of the same %4, so the store writes back to the
  ; address that %wide.masked.load12 read from.
113  %7 = bitcast i16* %4 to <8 x i16>*
114  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %6, <8 x i16>* %7, i32 2, <8 x i1> %1)
  ; Advance by one vector (8 elements) and exit once %n.vec is reached.
115  %index.next = add i32 %index, 8
116  %8 = icmp eq i32 %index.next, %n.vec
117  br i1 %8, label %for.cond.cleanup, label %vector.body
118
119for.cond.cleanup:                                 ; preds = %vector.body, %entry
120  ret void
121}
122
123; Function Attrs: nofree norecurse nounwind
; sext_i16: 4-lane predicated loop. Each iteration does a masked load of
; <4 x i16> from %b, sign-extends it to <4 x i32>, adds it (nsw) to a masked
; load of <4 x i32> from %a, and stores the sum back to the same %a address
; under the same mask. The FileCheck assertions below expect a dlstp.32/letp
; tail-predicated loop whose halfword load is the extending form vldrh.s32.
; NOTE(review): attribute group #0 is referenced here, but no definition of it
; is visible in this view of the file -- confirm whether that is intended.
124define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 {
125; CHECK-LABEL: sext_i16:
126; CHECK:       @ %bb.0: @ %entry
127; CHECK-NEXT:    push {r7, lr}
128; CHECK-NEXT:    cmp r2, #0
129; CHECK-NEXT:    it eq
130; CHECK-NEXT:    popeq {r7, pc}
131; CHECK-NEXT:  .LBB2_1: @ %vector.ph
132; CHECK-NEXT:    movs r3, #0
133; CHECK-NEXT:    dlstp.32 lr, r2
134; CHECK-NEXT:  .LBB2_2: @ %vector.body
135; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
136; CHECK-NEXT:    adds r3, #4
137; CHECK-NEXT:    vldrh.s32 q0, [r1], #8
138; CHECK-NEXT:    vldrw.u32 q1, [r0]
139; CHECK-NEXT:    vadd.i32 q0, q1, q0
140; CHECK-NEXT:    vstrw.32 q0, [r0], #16
141; CHECK-NEXT:    letp lr, .LBB2_2
142; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
143; CHECK-NEXT:    pop {r7, pc}
144entry:
  ; Skip the loop entirely when %N is zero.
145  %cmp6 = icmp eq i32 %N, 0
146  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
147
148vector.ph:                                        ; preds = %entry
  ; %n.vec is %N + 3 with the low two bits cleared, i.e. %N rounded up to a
  ; multiple of the 4-lane vector width; it is the loop exit bound.
149  %n.rnd.up = add i32 %N, 3
150  %n.vec = and i32 %n.rnd.up, -4
  ; The splat of %N - 1 below is referenced only by the commented-out icmp in
  ; vector.body.
151  %trip.count.minus.1 = add i32 %N, -1
152  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
153  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
154  br label %vector.body
155
156vector.body:                                      ; preds = %vector.body, %vector.ph
157  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; %induction (and the splat feeding it) is likewise referenced only by the
  ; commented-out icmp.
158  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
159  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
160  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
161  %0 = getelementptr inbounds i16, i16* %b, i32 %index
162
  ; The predicate %1 comes from the lane-mask intrinsic call; the icmp form is
  ; kept only as a comment and is not part of the IR.
163  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
164  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
165
166  %2 = bitcast i16* %0 to <4 x i16>*
167  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
  ; Sign extension of the narrow load: the operation this test is about.
168  %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
169  %4 = getelementptr inbounds i32, i32* %a, i32 %index
170  %5 = bitcast i32* %4 to <4 x i32>*
171  %wide.masked.load10 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %5, i32 4, <4 x i1> %1, <4 x i32> undef)
172  %6 = add nsw <4 x i32> %wide.masked.load10, %3
  ; %7 is a second bitcast of the same %4, so the store writes back to the
  ; address that %wide.masked.load10 read from.
173  %7 = bitcast i32* %4 to <4 x i32>*
174  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %6, <4 x i32>* %7, i32 4, <4 x i1> %1)
  ; Advance by one vector (4 elements) and exit once %n.vec is reached.
175  %index.next = add i32 %index, 4
176  %8 = icmp eq i32 %index.next, %n.vec
177  br i1 %8, label %for.cond.cleanup, label %vector.body
178
179for.cond.cleanup:                                 ; preds = %vector.body, %entry
180  ret void
181}
182
183; Function Attrs: nofree norecurse nounwind
; zext_i16: 4-lane predicated loop. Each iteration does a masked load of
; <4 x i16> from %b, zero-extends it to <4 x i32>, adds it to a masked load of
; <4 x i32> from %a, and stores the sum back to the same %a address under the
; same mask. The FileCheck assertions below expect a dlstp.32/letp
; tail-predicated loop whose halfword load is the extending form vldrh.u32.
; NOTE(review): attribute group #0 is referenced here, but no definition of it
; is visible in this view of the file -- confirm whether that is intended.
184define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16* nocapture readonly %b, i32 %N) local_unnamed_addr #0 {
185; CHECK-LABEL: zext_i16:
186; CHECK:       @ %bb.0: @ %entry
187; CHECK-NEXT:    push {r7, lr}
188; CHECK-NEXT:    cmp r2, #0
189; CHECK-NEXT:    it eq
190; CHECK-NEXT:    popeq {r7, pc}
191; CHECK-NEXT:  .LBB3_1: @ %vector.ph
192; CHECK-NEXT:    movs r3, #0
193; CHECK-NEXT:    dlstp.32 lr, r2
194; CHECK-NEXT:  .LBB3_2: @ %vector.body
195; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
196; CHECK-NEXT:    adds r3, #4
197; CHECK-NEXT:    vldrh.u32 q0, [r1], #8
198; CHECK-NEXT:    vldrw.u32 q1, [r0]
199; CHECK-NEXT:    vadd.i32 q0, q1, q0
200; CHECK-NEXT:    vstrw.32 q0, [r0], #16
201; CHECK-NEXT:    letp lr, .LBB3_2
202; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
203; CHECK-NEXT:    pop {r7, pc}
204entry:
  ; Skip the loop entirely when %N is zero.
205  %cmp6 = icmp eq i32 %N, 0
206  br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
207
208vector.ph:                                        ; preds = %entry
  ; %n.vec is %N + 3 with the low two bits cleared, i.e. %N rounded up to a
  ; multiple of the 4-lane vector width; it is the loop exit bound.
209  %n.rnd.up = add i32 %N, 3
210  %n.vec = and i32 %n.rnd.up, -4
  ; The splat of %N - 1 below is referenced only by the commented-out icmp in
  ; vector.body.
211  %trip.count.minus.1 = add i32 %N, -1
212  %broadcast.splatinsert8 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
213  %broadcast.splat9 = shufflevector <4 x i32> %broadcast.splatinsert8, <4 x i32> undef, <4 x i32> zeroinitializer
214  br label %vector.body
215
216vector.body:                                      ; preds = %vector.body, %vector.ph
217  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; %induction (and the splat feeding it) is likewise referenced only by the
  ; commented-out icmp.
218  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
219  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
220  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
221  %0 = getelementptr inbounds i16, i16* %b, i32 %index
222
  ; The predicate %1 comes from the lane-mask intrinsic call; the icmp form is
  ; kept only as a comment and is not part of the IR.
223  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
224  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
225
226  %2 = bitcast i16* %0 to <4 x i16>*
227  %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
  ; Zero extension of the narrow load: the operation this test is about.
228  %3 = zext <4 x i16> %wide.masked.load to <4 x i32>
229  %4 = getelementptr inbounds i32, i32* %a, i32 %index
230  %5 = bitcast i32* %4 to <4 x i32>*
231  %wide.masked.load10 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %5, i32 4, <4 x i1> %1, <4 x i32> undef)
232  %6 = add <4 x i32> %wide.masked.load10, %3
  ; %7 is a second bitcast of the same %4, so the store writes back to the
  ; address that %wide.masked.load10 read from.
233  %7 = bitcast i32* %4 to <4 x i32>*
234  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %6, <4 x i32>* %7, i32 4, <4 x i1> %1)
  ; Advance by one vector (4 elements) and exit once %n.vec is reached.
235  %index.next = add i32 %index, 4
236  %8 = icmp eq i32 %index.next, %n.vec
237  br i1 %8, label %for.cond.cleanup, label %vector.body
238
239for.cond.cleanup:                                 ; preds = %vector.body, %entry
240  ret void
241}
242
; Masked load/store intrinsics called by the 8-lane tests (@sext_i8, @zext_i8).
243declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>)
244declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
245declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
; Masked load/store intrinsics called by the 4-lane tests (@sext_i16, @zext_i16).
246declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
247declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
248declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
249
; Lane-mask intrinsics whose result is passed as the mask operand of the
; masked loads and stores in the test functions.
250declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
251declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
252