; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mcpu=corei7-avx -S -vectorizer-min-trip-count=21 | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux"

;
; The source code for the test:
;
; void foo(float* restrict A, float* restrict B)
; {
;   for (int i = 0; i < 20; ++i) A[i] += B[i];
; }
;

;
; This loop will be vectorized even though the trip count is below the
; threshold, because vectorization is explicitly forced in metadata.
;
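; At the source level, such forcing metadata typically comes from a loop
; pragma, for example (an illustrative sketch; the pragma itself does not
; appear in this file):
;
;   #pragma clang loop vectorize(enable)
;   for (int i = 0; i < 20; ++i) A[i] += B[i];
;
; or "#pragma omp simd" under -fopenmp-simd. Either form makes clang attach
; !llvm.loop metadata carrying !{!"llvm.loop.vectorize.enable", i1 true} (the
; !2 node after this function), which is what overrides the
; -vectorizer-min-trip-count=21 threshold from the RUN line.
;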
define void @vectorized(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
; CHECK-LABEL: @vectorized(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !0
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !0
; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !1
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 20, 16
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !0
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !4
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4, !llvm.access.group !11
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !11
  %add = fadd fast float %0, %1
  store float %add, float* %arrayidx2, align 4, !llvm.access.group !11
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 20
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1

for.end:
  ret void
}

!1 = !{!1, !2, !{!"llvm.loop.parallel_accesses", !11}}
!2 = !{!"llvm.loop.vectorize.enable", i1 true}
!11 = distinct !{}

;
; This loop will be vectorized even though the trip count is below the
; threshold, because no scalar iterations are needed thanks to folding its
; tail.
;
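; How the folded tail works here: the vector loop in the checks below rounds
; the trip count of 20 up to 24 (three iterations at VF 8) and compares each
; lane's induction value against 19, the last valid index; that <8 x i1> mask
; feeds the masked.load and masked.store intrinsics, so the out-of-range lanes
; of the final iteration are never touched and no scalar epilogue is required.
;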
define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
; CHECK-LABEL: @vectorized1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP4]], i32 4, <8 x i1> [[TMP1]], <8 x float> undef), !llvm.access.group !6
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP1]], <8 x float> undef), !llvm.access.group !6
; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group !6
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !9
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !13
  %add = fadd fast float %0, %1
  store float %add, float* %arrayidx2, align 4, !llvm.access.group !13
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 20
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3

for.end:
  ret void
}

!3 = !{!3, !{!"llvm.loop.parallel_accesses", !13}}
!13 = distinct !{}

;
; This loop will be vectorized even though the trip count is below the
; threshold, because no scalar iterations are needed.
;
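; Concretely: the loop runs exactly 16 iterations and the vectorizer picks
; VF 8, so 16 / 8 = 2 vector iterations cover every element; with no remainder
; there is no scalar epilogue, which is why the loop is still vectorized even
; though 16 is below the 21-iteration threshold.
;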
define void @vectorized2(float* noalias nocapture %A, float* noalias nocapture readonly %B) {
; CHECK-LABEL: @vectorized2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4, !llvm.access.group !6
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4, !llvm.access.group !6
; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; CHECK-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
; CHECK-NEXT:    store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 16, 16
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX]], align 4, !llvm.access.group !6
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP10]], [[TMP11]]
; CHECK-NEXT:    store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !11
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds float, float* %B, i64 %indvars.iv
  %0 = load float, float* %arrayidx, align 4, !llvm.access.group !13
  %arrayidx2 = getelementptr inbounds float, float* %A, i64 %indvars.iv
  %1 = load float, float* %arrayidx2, align 4, !llvm.access.group !13
  %add = fadd fast float %0, %1
  store float %add, float* %arrayidx2, align 4, !llvm.access.group !13
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 16
  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4

for.end:
  ret void
}

!4 = !{!4}