; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s -check-prefix=WIDTH

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; Vectorization with dependence checks.

; No plausible dependence - can be vectorized.
;  for (i = 0; i < 1024; ++i)
;    A[i] = A[i + 1] + 1;

; CHECK-LABEL: @f1_vec(
; CHECK: <2 x i32>

define void @f1_vec(i32* %A) {
entry:
  br label %for.body

for.body:
  %i = phi i32 [ 0, %entry ], [ %i.next, %for.body ]
  %i.next = add i32 %i, 1
  ; Read A[i + 1] first ...
  %src.ptr = getelementptr inbounds i32, i32* %A, i32 %i.next
  %val = load i32, i32* %src.ptr, align 4
  %val.inc = add nsw i32 %val, 1
  ; ... then write A[i]; the element stored here is never read by a later
  ; iteration.
  %dst.ptr = getelementptr inbounds i32, i32* %A, i32 %i
  store i32 %val.inc, i32* %dst.ptr, align 4
  %continue = icmp ne i32 %i.next, 1024
  br i1 %continue, label %for.body, label %for.end

for.end:
  ret void
}

; Plausible dependence of distance 1 - can't be vectorized.
;  for (i = 0; i < 1024; ++i)
;    A[i+1] = A[i] + 1;

; CHECK-LABEL: @f2_novec(
; CHECK-NOT: <2 x i32>

define void @f2_novec(i32* %A) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; Read A[i] ...
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %add = add nsw i32 %0, 1
  %indvars.iv.next = add i32 %indvars.iv, 1
  ; ... and write A[i+1], the element the next iteration loads.
  %arrayidx3 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv.next
  store i32 %add, i32* %arrayidx3, align 4
  %exitcond = icmp ne i32 %indvars.iv.next, 1024
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret void
}

; Plausible dependence of distance 2 - can be vectorized with a width of 2.
;  for (i = 0; i < 1024; ++i)
;    A[i+2] = A[i] + 1;

; CHECK-LABEL: @f3_vec_len(
; CHECK: <2 x i32>

; In the second invocation (forced width of 4) the distance-2 dependence must
; keep any 4-wide vector code from being emitted for this function. The anchor
; uses the same "@name(" form as the anchors of the other functions.
; WIDTH-LABEL: @f3_vec_len(
; WIDTH-NOT: <4 x i32>

define void @f3_vec_len(i32* %A) {
entry:
  br label %for.body

for.body:
  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  ; Load A[i]; the i32 index is sign-extended to i64 for addressing.
  %idxprom = sext i32 %i.01 to i64
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom
  %0 = load i32, i32* %arrayidx, align 4
  %add = add nsw i32 %0, 1
  ; Store to A[i+2], two elements ahead of the load.
  %add1 = add nsw i32 %i.01, 2
  %idxprom2 = sext i32 %add1 to i64
  %arrayidx3 = getelementptr inbounds i32, i32* %A, i64 %idxprom2
  store i32 %add, i32* %arrayidx3, align 4
  %inc = add nsw i32 %i.01, 1
  %cmp = icmp slt i32 %inc, 1024
  br i1 %cmp, label %for.body, label %for.end

for.end:
  ret void
}

; Plausible dependence of distance 1 - cannot be vectorized (without reordering
; accesses).
;  for (i = 0; i < 1024; ++i) {
;    B[i] = A[i];
;    A[i] = B[i + 1];
;  }

; CHECK-LABEL: @f5(
; CHECK-NOT: <2 x i32>

define void @f5(i32* %A, i32* %B) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; B[i] = A[i]
  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
  store i32 %0, i32* %arrayidx2, align 4
  ; A[i] = B[i + 1]; the store reuses the A[i] address computed above.
  %indvars.iv.next = add nsw i64 %indvars.iv, 1
  %arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %indvars.iv.next
  %1 = load i32, i32* %arrayidx4, align 4
  store i32 %1, i32* %arrayidx, align 4
  ; The exit test compares the induction variable truncated to i32.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 1024
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret void
}

; Dependence through a phi node - must not vectorize.
;  for (i = 0; i < 1024; ++i) {
;    a[i+1] = tmp;
;    tmp = a[i];
;  }

; The anchor below ends in "(" like the anchors of the other functions, so it
; cannot match a longer symbol name that merely starts with "@f6".
; CHECK-LABEL: @f6(
; CHECK-NOT: <2 x i32>

define i32 @f6(i32* %a, i32 %tmp) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  ; Carries the value loaded from a[i] in the previous iteration (or the %tmp
  ; argument on the first one).
  %tmp.addr.08 = phi i32 [ %tmp, %entry ], [ %0, %for.body ]
  ; a[i+1] = tmp
  %indvars.iv.next = add nsw i64 %indvars.iv, 1
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
  store i32 %tmp.addr.08, i32* %arrayidx, align 4
  ; tmp = a[i]
  %arrayidx3 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx3, align 4
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp ne i32 %lftr.wideiv, 1024
  br i1 %exitcond, label %for.body, label %for.end

for.end:
  ret i32 undef
}

; Don't vectorize true loop carried dependencies that are not a multiple of the
; vector width.
; Example:
;  for (int i = ...; ++i) {
;    a[i] = a[i-3] + ...;
;  }
; Vectorizing this loop is a bad idea because store-load forwarding would no
; longer happen.
;

; CHECK-LABEL: @nostoreloadforward(
; CHECK-NOT: <2 x i32>

define void @nostoreloadforward(i32* %A) {
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 16, %entry ], [ %iv.next, %for.body ]
  ; Load A[i - 3], an element stored three iterations earlier.
  %idx.back = add nsw i64 %iv, -3
  %back.ptr = getelementptr inbounds i32, i32* %A, i64 %idx.back
  %back.val = load i32, i32* %back.ptr, align 4
  ; Load A[i + 4].
  %idx.fwd = add nsw i64 %iv, 4
  %fwd.ptr = getelementptr inbounds i32, i32* %A, i64 %idx.fwd
  %fwd.val = load i32, i32* %fwd.ptr, align 4
  ; A[i] = A[i + 4] + A[i - 3]
  %sum = add nsw i32 %fwd.val, %back.val
  %dst.ptr = getelementptr inbounds i32, i32* %A, i64 %iv
  store i32 %sum, i32* %dst.ptr, align 4
  %iv.next = add i64 %iv, 1
  %iv.next.trunc = trunc i64 %iv.next to i32
  %continue = icmp ne i32 %iv.next.trunc, 128
  br i1 %continue, label %for.body, label %for.end

for.end:
  ret void
}

; Example:
;  for (int i = ...; ++i) {
;    a[i] = b[i];
;    c[i] = a[i-3] + ...;
;  }
; Vectorizing this loop is a bad idea because store-load forwarding would no
; longer happen.
;

; CHECK-LABEL: @nostoreloadforward2(
; CHECK-NOT: <2 x i32>

define void @nostoreloadforward2(i32* noalias %A, i32* noalias %B, i32* noalias %C) {
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 16, %entry ], [ %iv.next, %for.body ]
  ; A[i] = B[i]
  %b.ptr = getelementptr inbounds i32, i32* %B, i64 %iv
  %b.val = load i32, i32* %b.ptr, align 4
  %a.ptr = getelementptr inbounds i32, i32* %A, i64 %iv
  store i32 %b.val, i32* %a.ptr, align 4
  ; C[i] = A[i - 3]; reads an element of A stored three iterations earlier.
  %idx.back = add nsw i64 %iv, -3
  %a.back.ptr = getelementptr inbounds i32, i32* %A, i64 %idx.back
  %a.back.val = load i32, i32* %a.back.ptr, align 4
  %c.ptr = getelementptr inbounds i32, i32* %C, i64 %iv
  store i32 %a.back.val, i32* %c.ptr, align 4
  %iv.next = add i64 %iv, 1
  %iv.next.trunc = trunc i64 %iv.next to i32
  %continue = icmp ne i32 %iv.next.trunc, 128
  br i1 %continue, label %for.body, label %for.end

for.end:
  ret void
}