; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s
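
; The tail predication pass is expected to rewrite @llvm.get.active.lane.mask
; calls in a low-overhead loop's vector body into @llvm.arm.mve.vctp*
; intrinsics plus an element-count update, and to leave the loop unchanged
; when the mask does not describe the loop tripcount. The tests below
; exercise both the positive cases and the patterns that must be rejected.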

; CHECK-LABEL: mul_v16i8
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> {{.*}}, <16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v16i8(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 15
  %tmp9 = lshr i32 %tmp8, 4
  %tmp10 = shl nuw i32 %tmp9, 4
  %tmp11 = add i32 %tmp10, -16
  %tmp12 = lshr i32 %tmp11, 4
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp = getelementptr inbounds i8, i8* %a, i32 %index

; %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)

  %tmp2 = bitcast i8* %tmp to <16 x i8>*
  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
  %tmp4 = bitcast i8* %tmp3 to <16 x i8>*
  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i8, i8* %c, i32 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %active.lane.mask)
  %index.next = add i32 %index, 16
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: mul_v8i16
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> {{.*}}, <8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v8i16(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i16* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 7
  %tmp9 = lshr i32 %tmp8, 3
  %tmp10 = shl nuw i32 %tmp9, 3
  %tmp11 = add i32 %tmp10, -8
  %tmp12 = lshr i32 %tmp11, 3
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tmp = getelementptr inbounds i16, i16* %a, i32 %index

; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)

  %tmp2 = bitcast i16* %tmp to <8 x i16>*
  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
  %tmp4 = bitcast i16* %tmp3 to <8 x i16>*
  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %mul = mul nsw <8 x i16> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i16, i16* %c, i32 %index
  %tmp7 = bitcast i16* %tmp6 to <8 x i16>*
  tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: mul_v4i32
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v4i32(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}
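
; The wide loads are split into two <2 x i32> halves, recombined with a
; shuffle and stored; the unmasked intermediate ops should not block tail
; predication of the loads and the store.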
; CHECK-LABEL: split_vector
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @split_vector(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.1.high = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %extract.2.low = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.2.high = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %mul = mul nsw <2 x i32> %extract.1.low, %extract.2.low
  %sub = sub nsw <2 x i32> %extract.1.high, %extract.2.high
  %combine = shufflevector <2 x i32> %mul, <2 x i32> %sub, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; One of the loads now uses an ult predicate.
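; The loop should still be tail-predicated, with the mismatching %wrong mask
; kept on that load while the other load and the store use the VCTP mask.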
; CHECK-LABEL: mismatch_load_pred
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index

; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %wrong, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; The store now uses an ult predicate.
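; As above, the mismatching %wrong mask should survive, here on the masked
; store, while both loads switch to the VCTP mask.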
; CHECK-LABEL: mismatch_store_pred
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong)
define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index

; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)

  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; TODO: Multiple intrinsics not yet supported.
; This is currently rejected, because if the vector body is unrolled, the step
; is not what we expect:
;
; Step value 16 doesn't match vector width 4
;
; CHECK-LABEL: interleave4
; CHECK: vector.body:
; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
;
define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %v0 = add i32 %N, 15
  %v1 = lshr i32 %v0, 4
  %v2 = shl nuw i32 %v1, 4
  %v3 = add i32 %v2, -16
  %v4 = lshr i32 %v3, 4
  %v5 = add nuw nsw i32 %v4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:
  %trip.count.minus.1 = add i32 %N, -1
  %scevgep = getelementptr i32, i32* %A, i32 8
  %scevgep30 = getelementptr i32, i32* %C, i32 8
  %scevgep37 = getelementptr i32, i32* %B, i32 8
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %v5)
  br label %vector.body

vector.body:
  %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ]
  %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ]
  %v6 = phi i32 [ %start, %vector.ph ], [ %v15, %vector.body ]
  %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
  %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>*
  %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %v7 = add i32 %index, 4
  %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
  %v8 = add i32 %v7, 4
  %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
  %v9 = add i32 %v8, 4
  %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
  %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1
  %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
  %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
  %scevgep41 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 1
  %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
  %scevgep34 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -2
  %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -1
  %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
  %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3133, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
  %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 1
  %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
  %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load
  %v11 = add nsw <4 x i32> %wide.masked.load22, %wide.masked.load18
  %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19
  %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20
  %scevgep27 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -2
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v10, <4 x i32>* %scevgep27, i32 4, <4 x i1> %active.lane.mask)
  %scevgep28 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -1
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v11, <4 x i32>* %scevgep28, i32 4, <4 x i1> %active.lane.mask15)
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v12, <4 x i32>* %lsr.iv26, i32 4, <4 x i1> %active.lane.mask16)
  %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 1
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v13, <4 x i32>* %scevgep29, i32 4, <4 x i1> %active.lane.mask17)
  %scevgep25 = getelementptr i32, i32* %lsr.iv, i32 16
  %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 16
  %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 16
  %v14 = add i32 %v9, 4
  %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1)
  %v16 = icmp ne i32 %v15, 0
  br i1 %v16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}
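
; The element count passed to get.active.lane.mask is the constant 42 rather
; than the loop tripcount %N, so this should be rejected: no vctp is emitted.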
; CHECK-LABEL: const_expected_in_set_loop
; CHECK: call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT: vctp
; CHECK: ret void
;
define dso_local void @const_expected_in_set_loop(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*

  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  br i1 %9, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}
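
; The tripcount operand of get.active.lane.mask is %index, which is not
; loop-invariant, so this should also be rejected: no vctp is emitted.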
; CHECK-LABEL: tripcount_arg_not_invariant
; CHECK: call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT: vctp
; CHECK: ret void
;
define dso_local void @tripcount_arg_not_invariant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]

  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*

  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  ;br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}
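
; Here the vector induction variable starts at 1 rather than 0, so the AddRec
; base is not zero and the loop should be left alone: no vctp is emitted.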
; CHECK-LABEL: addrec_base_not_zero
; CHECK: call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT: vctp
; CHECK: ret void
;
define dso_local void @addrec_base_not_zero(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]

; AddRec base is not 0:
  %index = phi i32 [ 1, %vector.ph ], [ %index.next, %vector.body ]

  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  ;br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
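
; Several functions above reference attribute group #0, but no definition
; appeared in this file, which would make the IR fail to parse. A minimal
; (assumed) definition is added here so the test is self-contained:
attributes #0 = { nounwind }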