; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s

; CHECK-LABEL: expand_v8i16_v8i32
; CHECK-NOT: call i32 @llvm.arm.mve.vctp
; Loads are <8 x i16> but the zext/mul/store operate on <8 x i32>, so the
; element counts per vector register differ between the loads and the store.
; The FileCheck lines above require that no VCTP intrinsic is created for
; this loop (tail predication must bail out).
define void @expand_v8i16_v8i32(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Round %N up to a multiple of 8 and derive the hardware-loop iteration
  ; count: %tmp13 = ceil(N / 8).
  %tmp8 = add i32 %N, 7
  %tmp9 = lshr i32 %tmp8, 3
  %tmp10 = shl nuw i32 %tmp9, 3
  %tmp11 = add i32 %tmp10, -8
  %tmp12 = lshr i32 %tmp11, 3
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tmp = getelementptr inbounds i16, i16* %a, i32 %index

  ; Lane predicate; the commented icmp below is the equivalent form the
  ; intrinsic replaced.
  ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
  %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

  %tmp2 = bitcast i16* %tmp to <8 x i16>*
  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
  %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
  %tmp4 = bitcast i16* %tmp3 to <8 x i16>*
  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
  ; Widen both i16 vectors to i32 before multiplying and storing.
  %expand.1 = zext <8 x i16> %wide.masked.load to <8 x i32>
  %expand.2 = zext <8 x i16> %wide.masked.load2 to <8 x i32>
  %mul = mul nsw <8 x i32> %expand.2, %expand.1
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <8 x i32>*
  tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %mul, <8 x i32>* %tmp7, i32 4, <8 x i1> %tmp1)
  %index.next = add i32 %index, 8
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: expand_v8i16_v4i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8
; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: %store.pred = icmp ule <4 x i32> %induction.store
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %store.pred)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %store.pred)
; The <8 x i16> loads are predicated by the lane mask (rewritten to VCTP16
; per the FileCheck lines above), while the two <4 x i32> stores use their
; own icmp-based predicate %store.pred, which must be left untouched.
define void @expand_v8i16_v4i32(i16* readonly %a, i16* readonly %b, i32* %c, i32* %d, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Round %N up to a multiple of 8 and derive the hardware-loop iteration
  ; count: %tmp13 = ceil(N / 8).
  %tmp8 = add i32 %N, 7
  %tmp9 = lshr i32 %tmp8, 3
  %tmp10 = shl nuw i32 %tmp9, 3
  %tmp11 = add i32 %tmp10, -8
  %tmp12 = lshr i32 %tmp11, 3
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
  ; Separate 4-wide splat of the trip count for the store predicate.
  %broadcast.splatinsert10.store = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11.store = shufflevector <4 x i32> %broadcast.splatinsert10.store, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  ; %index advances by 8 (load width); %store.idx advances by 4 (store width).
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %store.idx = phi i32 [ 0, %vector.ph ], [ %store.idx.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tmp = getelementptr inbounds i16, i16* %a, i32 %index

  ; Lane predicate; the commented icmp below is the equivalent form the
  ; intrinsic replaced.
  ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
  %tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

  %tmp2 = bitcast i16* %tmp to <8 x i16>*
  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
  %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
  %tmp4 = bitcast i16* %tmp3 to <8 x i16>*
  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
  ; Split the second load into low/high halves and widen each to <4 x i32>.
  %extract.2.low = shufflevector <8 x i16> %wide.masked.load2, <8 x i16> undef, < 4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract.2.high = shufflevector <8 x i16> %wide.masked.load2, <8 x i16> undef, < 4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %expand.1 = zext <4 x i16> %extract.2.low to <4 x i32>
  %expand.2 = zext <4 x i16> %extract.2.high to <4 x i32>
  %mul = mul nsw <4 x i32> %expand.2, %expand.1
  %sub = mul nsw <4 x i32> %expand.1, %expand.2
  ; 4-wide icmp-based predicate guarding both stores below.
  %broadcast.splatinsert.store = insertelement <4 x i32> undef, i32 %store.idx, i32 0
  %broadcast.splat.store = shufflevector <4 x i32> %broadcast.splatinsert.store, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction.store = add <4 x i32> %broadcast.splat.store, <i32 0, i32 1, i32 2, i32 3>
  %store.pred = icmp ule <4 x i32> %induction.store, %broadcast.splat11.store
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %store.idx
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %store.pred)
  %gep = getelementptr inbounds i32, i32* %d, i32 %store.idx
  %cast.gep = bitcast i32* %gep to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %sub, <4 x i32>* %cast.gep, i32 4, <4 x i1> %store.pred)
  %store.idx.next = add i32 %store.idx, 4
  %index.next = add i32 %index, 8
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: expand_v4i32_v4i64
; CHECK-NOT: call i32 @llvm.arm.mve.vctp
; Loads are <4 x i32> but the zext/mul/store operate on <4 x i64>, so the
; store covers twice the bytes a single predicated vector would.
; The FileCheck lines above require that no VCTP intrinsic is created for
; this loop (tail predication must bail out).
define void @expand_v4i32_v4i64(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i64* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Round %N up to a multiple of 4 and derive the hardware-loop iteration
  ; count: %tmp13 = ceil(N / 4).
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index

  ; Lane predicate; the commented icmp below is the equivalent form the
  ; intrinsic replaced.
  ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  ; Widen both i32 vectors to i64 before multiplying and storing.
  %expand.1 = zext <4 x i32> %wide.masked.load to <4 x i64>
  %expand.2 = zext <4 x i32> %wide.masked.load2 to <4 x i64>
  %mul = mul nsw <4 x i64> %expand.2, %expand.1
  %tmp6 = getelementptr inbounds i64, i64* %c, i32 %index
  %tmp7 = bitcast i64* %tmp6 to <4 x i64>*
  tail call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %mul, <4 x i64>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Intrinsic declarations used by the functions above: masked memory ops,
; the lane-mask predicate generator, and the hardware-loop intrinsics.
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32 immarg, <8 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32 immarg, <4 x i1>)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)