; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s

; Positive test: the @llvm.get.active.lane.mask call uses the loop's primary
; IV (%index, stepping by the vector width 4) and a constant element count
; (32003), so the pass replaces it with @llvm.arm.mve.vctp32 and threads a
; new element-count phi ([[TMP1]]) decremented by 4 each iteration.
define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @foo(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]])
; CHECK-NEXT:    [[TMP3]] = sub i32 [[TMP1]], 4
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP2]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP5]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
; CHECK-NEXT:    br i1 [[TMP6]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

  ; The lane mask is the intrinsic form of this compare:
  ; %1 = icmp ult <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Silly test case: the loop count is constant and a multiple of the vectorisation
; factor. So, the vectoriser should not produce masked loads/stores and there's
; nothing to tail-predicate here, just checking.
define dso_local void @foo2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @foo2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 2000)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[LSR_IV10]], align 4
; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[LSR_IV1113]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD]]
; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[LSR_IV1416]], align 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
; CHECK-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 2000)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %0 = phi i32 [ %start, %entry ], [ %2, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %lsr.iv10, align 4
  %wide.load9 = load <4 x i32>, <4 x i32>* %lsr.iv1113, align 4
  %1 = add nsw <4 x i32> %wide.load9, %wide.load
  store <4 x i32> %1, <4 x i32>* %lsr.iv1416, align 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %3 = icmp ne i32 %2, 0
  br i1 %3, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Check that the icmp is a ult: here the predicate is ugt, so the compare is
; not an active-lane-mask pattern, no vctp is emitted, and the icmp is kept
; as-is (see the "icmp ugt" CHECK line below).
define dso_local void @foo3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @foo3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i32> [[INDUCTION]], <i32 32002, i32 32002, i32 32002, i32 32002>
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; UGT here:
  %1 = icmp ugt <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Negative test: the icmp mask is a non-uniform constant vector, which cannot
; be an active-lane-mask pattern; the compare is kept unchanged (checked below).
define dso_local void @foo5(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @foo5(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], <i32 0, i32 3200, i32 32002, i32 32002>
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; Non-uniform constant vector here. This can't be represented with
; @llvm.get.active.lane.mask, but let's keep this test as a sanity check:
  %1 = icmp ult <4 x i32> %induction, <i32 0, i32 3200, i32 32002, i32 32002>

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Negative test: the element count (UINT_MAX) is inconsistent with the
; hardware-loop trip count, so the lane mask must NOT become a vctp.
; CHECK-LABEL: @inconsistent_tripcounts(
; CHECK:       vector.body:
; CHECK-NOT:   @llvm.arm.mve.vctp32
; CHECK:       @llvm.get.active.lane.mask
; CHECK:       ret void
;
define dso_local void @inconsistent_tripcounts(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow:
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295)

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Negative test: iteration count 1073741824 (2^30) with a 32003 element count
; makes the element-count subtraction overflow, so no vctp may be formed.
; CHECK-LABEL: @overflow_in_sub(
; CHECK:       vector.body:
; CHECK-NOT:   @llvm.arm.mve.vctp32
; CHECK:       @llvm.get.active.lane.mask
; CHECK:       ret void
;
define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 1073741824)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}


; Negative test: the lane mask's first operand is %N, not a loop IV.
; CHECK-LABEL: @IV_not_an_induction(
; CHECK:       vector.body:
; CHECK-NOT:   @llvm.arm.mve.vctp32
; CHECK:       @llvm.get.active.lane.mask
; CHECK:       ret void
;
define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; The induction variable %N is not an IV:
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Negative test: the IV step (3) does not match the vector width (4).
; CHECK-LABEL: @IV_wrong_step(
; CHECK:       vector.body:
; CHECK-NOT:   @llvm.arm.mve.vctp32
; CHECK:       @llvm.get.active.lane.mask
; CHECK:       ret void
;
define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)

; %index is incremented with 3 and not 4, which is the vectorisation factor
; that we expect here:
  %index.next = add i32 %index, 3

  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Negative test: the IV step is the runtime value %N, not a constant.
; CHECK-LABEL: @IV_step_not_constant(
; CHECK:       vector.body:
; CHECK-NOT:   @llvm.arm.mve.vctp32
; CHECK:       @llvm.get.active.lane.mask
; CHECK:       ret void
;
define dso_local void @IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)

; %index is incremented with some runtime value, i.e. not a constant:
  %index.next = add i32 %index, %N

  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Negative test: the lane mask's operand %j.025 is the OUTER loop's IV,
; not the inner vector loop's, so the inner loop is not tail-predicated.
; CHECK-LABEL: @outerloop_phi(
; CHECK:       vector.body:
; CHECK-NOT:   @llvm.arm.mve.vctp32
; CHECK:       @llvm.get.active.lane.mask
; CHECK:       ret void
;
define dso_local void @outerloop_phi(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp24 = icmp eq i32 %N, 0
  br i1 %cmp24, label %for.cond.cleanup, label %vector.ph.preheader

vector.ph.preheader:                              ; preds = %entry
  br label %vector.ph

vector.ph:                                        ; preds = %vector.ph.preheader, %for.cond.cleanup3
  %lsr.iv36 = phi i32* [ %B, %vector.ph.preheader ], [ %scevgep37, %for.cond.cleanup3 ]
  %lsr.iv31 = phi i32* [ %C, %vector.ph.preheader ], [ %scevgep32, %for.cond.cleanup3 ]
  %lsr.iv = phi i32* [ %A, %vector.ph.preheader ], [ %scevgep, %for.cond.cleanup3 ]
  %j.025 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %vector.ph.preheader ]
  %start = call i32 @llvm.start.loop.iterations.i32(i32 1025)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %lsr.iv36, %vector.ph ]
  %lsr.iv33 = phi i32* [ %scevgep34, %vector.body ], [ %lsr.iv31, %vector.ph ]
  %lsr.iv28 = phi i32* [ %scevgep29, %vector.body ], [ %lsr.iv, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %vector.ph ], [ %2, %vector.body ]
  %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
  %lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>*
  %lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>*

; It's using %j.025, the induction variable from its outer loop:
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096)

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3335, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %lsr.iv2830, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep29 = getelementptr i32, i32* %lsr.iv28, i32 4
  %scevgep34 = getelementptr i32, i32* %lsr.iv33, i32 4
  %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 4
  %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %3 = icmp ne i32 %2, 0
  br i1 %3, label %vector.body, label %for.cond.cleanup3

for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
  ret void

for.cond.cleanup3:                                ; preds = %vector.body
  %inc11 = add nuw i32 %j.025, 1
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 1
  %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 1
  %scevgep37 = getelementptr i32, i32* %lsr.iv36, i32 1
  %exitcond26 = icmp eq i32 %inc11, %N
  br i1 %exitcond26, label %for.cond.cleanup, label %vector.ph
}


declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
declare i32 @llvm.loop.decrement.reg.i32(i32 , i32 )
declare i32 @llvm.start.loop.iterations.i32(i32)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)