; RUN: opt -mtriple armv7-linux-gnueabihf -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
; RUN: opt -mtriple armv8-linux-gnu -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
; RUN: opt -mtriple armv8.1.m-none-eabi -mattr=+mve.fp -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=MVE
; RUN: opt -mtriple armv7-unknown-darwin -loop-vectorize -S %s -debug-only=loop-vectorize -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
; REQUIRES: asserts

; Tests that the loop vectorizer can tell whether SIMD is safe with respect to
; the IEEE 754 standard.
; On Linux, we only want the vectorizer to work when the -ffast-math flag is set,
; because NEON is not IEEE compliant.
; Darwin, on the other hand, doesn't support subnormals, and all optimizations
; are allowed, even without -ffast-math.

; Integer loops are always vectorizable
; CHECK: Checking a loop in "sumi"
; CHECK: We can vectorize this loop!
; Element-wise integer product: stores A[i] * B[i] to C[i] for i in [0, N).
; The entry block branches straight to the exit when N == 0.
define void @sumi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  ; Induction variable: starts at 0 and is incremented by 1 per iteration.
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
  %1 = load i32, i32* %arrayidx1, align 4
  ; Integer multiply: the only arithmetic on the loaded data.
  %mul = mul nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
  store i32 %mul, i32* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  ; Leave the loop once the incremented index equals N.
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Floating-point loops need fast-math to be vectorizable on Linux; the
; expectations below accept vectorization for the MVE and Darwin runs.
; LINUX: Checking a loop in "sumf"
; LINUX: Potentially unsafe FP op prevents vectorization
; MVE: Checking a loop in "sumf"
; MVE: We can vectorize this loop!
; DARWIN: Checking a loop in "sumf"
; DARWIN: We can vectorize this loop!
; Element-wise float product: stores A[i] * B[i] to C[i] for i in [0, N).
; The fmul carries no fast-math flags.
define void @sumf(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  ; Skip the loop entirely when N == 0.
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  ; Induction variable: starts at 0 and is incremented by 1 per iteration.
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
  %1 = load float, float* %arrayidx1, align 4
  ; Strict (flag-free) floating-point multiply: the FP op this test is about.
  %mul = fmul float %0, %1
  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
  store float %mul, float* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  ; Leave the loop once the incremented index equals N.
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Integer loops are always vectorizable
; CHECK: Checking a loop in "redi"
; CHECK: We can vectorize this loop!
; Integer sum-of-products reduction: accumulates a[i] * b[i] for i in [0, N)
; and returns the accumulator. The accumulator's initial value is undef, and
; undef is also what is returned when N == 0.
define i32 @redi(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  ; Reduction accumulator, fed back from %add on every iteration.
  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  ; Integer add that forms the reduction step.
  %add = add nsw i32 %mul, %Red.06
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  ; Single-entry phi carrying the final accumulator value out of the loop.
  %add.lcssa = phi i32 [ %add, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
  ret i32 %Red.0.lcssa
}

; Floating-point loops need fast-math to be vectorizable on Linux; the
; expectations below accept vectorization for the MVE and Darwin runs.
; LINUX: Checking a loop in "redf"
; LINUX: Potentially unsafe FP op prevents vectorization
; MVE: Checking a loop in "redf"
; MVE: We can vectorize this loop!
; DARWIN: Checking a loop in "redf"
; DARWIN: We can vectorize this loop!
; Float sum-of-products reduction: accumulates a[i] * b[i] for i in [0, N)
; and returns the accumulator. Neither the fmul nor the fadd carries
; fast-math flags. The accumulator's initial value is undef, and undef is
; also what is returned when N == 0.
define float @redf(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  ; Reduction accumulator, fed back from %add on every iteration.
  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
  %1 = load float, float* %arrayidx1, align 4
  ; Strict (flag-free) floating-point multiply and accumulate.
  %mul = fmul float %0, %1
  %add = fadd float %Red.06, %mul
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  ; Single-entry phi carrying the final accumulator value out of the loop.
  %add.lcssa = phi float [ %add, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
  ret float %Red.0.lcssa
}

; Make sure calls that turn into builtins are also covered
; NOTE(review): unlike "sumf" and "redf", there is no expectation for the MVE
; run here - confirm whether that omission is intentional.
; LINUX: Checking a loop in "fabs"
; LINUX: Potentially unsafe FP op prevents vectorization
; DARWIN: Checking a loop in "fabs"
; DARWIN: We can vectorize this loop!
; Stores A[i] * fabsf(B[i]) to C[i] for i in [0, N). Neither the call nor
; the fmul carries fast-math flags. Unlike the other loops in this file,
; there is no separate preheader or loop-exit block: the loop is entered
; directly from the entry block and exits directly to for.end.
define void @fabs(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  ; Skip the loop entirely when N == 0.
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
  %1 = load float, float* %arrayidx1, align 4
  ; Flag-free call to the declared @fabsf; attribute group #1 is attached at
  ; the call site.
  %fabsf = tail call float @fabsf(float %1) #1
  %conv3 = fmul float %0, %fabsf
  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
  store float %conv3, float* %arrayidx4, align 4
  %inc = add nuw nsw i32 %i.011, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; Integer loops are always vectorizable
; CHECK: Checking a loop in "sumi_fast"
; CHECK: We can vectorize this loop!
; Element-wise integer product: stores A[i] * B[i] to C[i] for i in [0, N).
; The loop contains only integer arithmetic, so there are no fast-math flags
; anywhere in the body.
define void @sumi_fast(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %N) {
entry:
  ; Skip the loop entirely when N == 0.
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  ; Induction variable: starts at 0 and is incremented by 1 per iteration.
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.06
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %B, i32 %i.06
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, i32* %C, i32 %i.06
  store i32 %mul, i32* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  ; Leave the loop once the incremented index equals N.
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Floating-point loops are vectorizable with fast-math
; CHECK: Checking a loop in "sumf_fast"
; CHECK: We can vectorize this loop!
; Element-wise float product: stores A[i] * B[i] to C[i] for i in [0, N).
; The fmul is marked 'fast'.
define void @sumf_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  ; Skip the loop entirely when N == 0.
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  ; Induction variable: starts at 0 and is incremented by 1 per iteration.
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.06
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.06
  %1 = load float, float* %arrayidx1, align 4
  ; Floating-point multiply carrying the 'fast' fast-math flag.
  %mul = fmul fast float %1, %0
  %arrayidx2 = getelementptr inbounds float, float* %C, i32 %i.06
  store float %mul, float* %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.06, 1
  ; Leave the loop once the incremented index equals N.
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}

; Integer loops are always vectorizable
; CHECK: Checking a loop in "redi_fast"
; CHECK: We can vectorize this loop!
; Integer sum-of-products reduction: accumulates a[i] * b[i] for i in [0, N)
; and returns the accumulator. The loop contains only integer arithmetic, so
; there are no fast-math flags in the body. The accumulator's initial value
; is undef, and undef is also what is returned when N == 0.
define i32 @redi_fast(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  ; Reduction accumulator, fed back from %add on every iteration.
  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.07
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.07
  %1 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %1, %0
  ; Integer add that forms the reduction step.
  %add = add nsw i32 %mul, %Red.06
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  ; Single-entry phi carrying the final accumulator value out of the loop.
  %add.lcssa = phi i32 [ %add, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
  ret i32 %Red.0.lcssa
}

; Floating-point loops are vectorizable with fast-math
; CHECK: Checking a loop in "redf_fast"
; CHECK: We can vectorize this loop!
; Float sum-of-products reduction: accumulates a[i] * b[i] for i in [0, N)
; and returns the accumulator. Both the fmul and the fadd are marked 'fast'.
; The accumulator's initial value is undef, and undef is also what is
; returned when N == 0.
define float @redf_fast(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i32 %N) {
entry:
  %cmp5 = icmp eq i32 %N, 0
  br i1 %cmp5, label %for.end, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  ; Reduction accumulator, fed back from %add on every iteration.
  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, float* %a, i32 %i.07
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %b, i32 %i.07
  %1 = load float, float* %arrayidx1, align 4
  ; Multiply and accumulate, both carrying the 'fast' fast-math flag.
  %mul = fmul fast float %1, %0
  %add = fadd fast float %mul, %Red.06
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end.loopexit, label %for.body

for.end.loopexit:                                 ; preds = %for.body
  ; Single-entry phi carrying the final accumulator value out of the loop.
  %add.lcssa = phi float [ %add, %for.body ]
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]
  ret float %Red.0.lcssa
}

; Make sure calls that turn into builtins are also covered
; CHECK: Checking a loop in "fabs_fast"
; CHECK: We can vectorize this loop!
; Stores fabsf(B[i]) * A[i] to C[i] for i in [0, N). Both the call and the
; fmul are marked 'fast'. The loop is entered directly from the entry block
; and exits directly to for.end (no separate preheader or loop-exit block).
define void @fabs_fast(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float* noalias nocapture %C, i32 %N) {
entry:
  ; Skip the loop entirely when N == 0.
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.end, label %for.body

for.body:                                         ; preds = %entry, %for.body
  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds float, float* %A, i32 %i.011
  %0 = load float, float* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, float* %B, i32 %i.011
  %1 = load float, float* %arrayidx1, align 4
  ; 'fast' call to the declared @fabsf; attribute group #2 is attached at the
  ; call site.
  %fabsf = tail call fast float @fabsf(float %1) #2
  %conv3 = fmul fast float %fabsf, %0
  %arrayidx4 = getelementptr inbounds float, float* %C, i32 %i.011
  store float %conv3, float* %arrayidx4, align 4
  %inc = add nuw nsw i32 %i.011, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; External declaration of the float absolute-value routine called by the
; "fabs" and "fabs_fast" loops.
declare float @fabsf(float)

; Attribute group #1 is used on the @fabsf call in @fabs: the no-infs,
; no-nans and unsafe-fp-math string attributes are all "false".
attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
; Attribute group #2 is used on the @fabsf call in @fabs_fast: identical to
; #1 except that the no-infs, no-nans and unsafe-fp-math string attributes
; are all "true".
; NOTE(review): both groups are only referenced from call sites in this
; file, not from any function definition - confirm that is intended.
attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }