; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled %s -o - | FileCheck %s

; Every function below has the same shape:
;   * %x and %y are read through masked loads, %z is written through a masked
;     store, %a is a float scalar that is splatted to <4 x float>, and %n is the
;     element count.
;   * %entry skips the loop unless %n > 0.
;   * %vector.body processes 4 lanes per iteration under the predicate
;     @llvm.get.active.lane.mask(%index, %n) and exits once %index.next equals
;     %n.vec = (%n + 3) & -4.
;   * The expected assembly is a tail-predicated loop (dlstp.32 ... letp).
; The functions differ only in how the multiply/add/negate is expressed, and
; therefore in which MVE fused multiply instruction is expected.
; The commented-out "icmp ule" in each loop is the explicit lane compare that
; the get.active.lane.mask call stands in for; %induction and
; %broadcast.splat11 are only referenced from that comment.

; fmas1: z[i] = fma(x[i], y[i], a) with the splatted scalar as the addend.
; Expected to select vfmas.f32 with the scalar kept in a GPR (r12).
define arm_aapcs_vfpcc void @fmas1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmas1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    adds r4, #4
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vfmas.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds float, float* %x, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)

  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
  %6 = getelementptr inbounds float, float* %z, i32 %index
  %7 = bitcast float* %6 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; fmas2: same computation as fmas1, written as separate fast-math operations:
; z[i] = (y[i] * x[i]) + a. Expected to fuse into vfmas.f32 with a GPR scalar.
define arm_aapcs_vfpcc void @fmas2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmas2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    adds r4, #4
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vfmas.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds float, float* %x, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)

  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %6 = fadd fast <4 x float> %5, %broadcast.splat14
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; fma1: z[i] = fma(x[i], a, y[i]) with the splatted scalar as a multiplicand.
; Expected to select vfma.f32 with the scalar kept in a GPR (r12).
define arm_aapcs_vfpcc void @fma1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fma1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    adds r4, #4
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vfma.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds float, float* %x, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)

  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
  %6 = getelementptr inbounds float, float* %z, i32 %index
  %7 = bitcast float* %6 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; fma2: same computation as fma1, written as separate fast-math operations:
; z[i] = (x[i] * a) + y[i]. Expected to fuse into vfma.f32 with a GPR scalar.
define arm_aapcs_vfpcc void @fma2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fma2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    adds r4, #4
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vfma.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds float, float* %x, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)

  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
  %4 = getelementptr inbounds float, float* %y, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %5, i32 4, <4 x i1> %1, <4 x float> undef)
  %6 = fadd fast <4 x float> %3, %wide.masked.load14
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; fmss1: z[i] = fma(x[i], y[i], -a); the scalar is negated in %vector.ph before
; it is splatted. Expected to negate once outside the loop by flipping the sign
; bit in the GPR (eor with #-2147483648) and then use vfmas.f32.
define arm_aapcs_vfpcc void @fmss1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:    eor r12, r12, #-2147483648
; CHECK-NEXT:  .LBB4_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    adds r4, #4
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vfmas.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %fneg = fneg fast float %a
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds float, float* %x, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)

  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
  %6 = getelementptr inbounds float, float* %z, i32 %index
  %7 = bitcast float* %6 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; fmss2: z[i] = (y[i] * x[i]) - a, written as fmul + fsub. The recorded output
; does not use the scalar-operand form here: the scalar is splatted (vdup.32),
; negated as a vector (vneg.f32) outside the loop, copied each iteration and
; accumulated with the vector-vector vfma.f32.
define arm_aapcs_vfpcc void @fmss2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    blt .LBB5_3
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    vmov r6, s0
; CHECK-NEXT:    vdup.32 q0, r6
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    vneg.f32 q0, q0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB5_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add.w r12, r12, #4
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vfma.f32 q3, q2, q1
; CHECK-NEXT:    vstrw.32 q3, [r2], #16
; CHECK-NEXT:    letp lr, .LBB5_2
; CHECK-NEXT:  .LBB5_3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds float, float* %x, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)

  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %6 = fsub fast <4 x float> %5, %broadcast.splat14
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; fmss3: z[i] = fma(x[i], -y[i], a) with the negation on a loaded vector.
; The recorded output splats the scalar (vdup.32), copies it each iteration and
; uses the vector-vector vfms.f32.
define arm_aapcs_vfpcc void @fmss3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB6_1: @ %vector.ph
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vdup.32 q0, r4
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB6_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add.w r12, r12, #4
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vfms.f32 q3, q2, q1
; CHECK-NEXT:    vstrw.32 q3, [r2], #16
; CHECK-NEXT:    letp lr, .LBB6_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds float, float* %x, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)

  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fneg fast <4 x float> %wide.masked.load12
  %6 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %5, <4 x float> %broadcast.splat14)
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; fmss4: z[i] = a - (y[i] * x[i]), written as fmul + fsub. The recorded output
; has the same shape as fmss3: vdup.32 of the scalar, a per-iteration copy and
; the vector-vector vfms.f32.
define arm_aapcs_vfpcc void @fmss4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fmss4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB7_1: @ %vector.ph
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vdup.32 q0, r4
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB7_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add.w r12, r12, #4
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vfms.f32 q3, q2, q1
; CHECK-NEXT:    vstrw.32 q3, [r2], #16
; CHECK-NEXT:    letp lr, .LBB7_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds float, float* %x, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)

  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
  %6 = fsub fast <4 x float> %broadcast.splat14, %5
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; fms1: z[i] = fma(x[i], -a, y[i]); the scalar is negated in %vector.ph before
; it is splatted. Expected to negate once outside the loop by flipping the sign
; bit in the GPR (eor with #-2147483648) and then use vfma.f32 with that GPR.
define arm_aapcs_vfpcc void @fms1(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB8_1: @ %vector.ph
; CHECK-NEXT:    vmov r12, s0
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:    eor r12, r12, #-2147483648
; CHECK-NEXT:  .LBB8_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    adds r4, #4
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vfma.f32 q1, q0, r12
; CHECK-NEXT:    vstrw.32 q1, [r2], #16
; CHECK-NEXT:    letp lr, .LBB8_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %fneg = fneg fast float %a
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds float, float* %x, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)

  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
  %6 = getelementptr inbounds float, float* %z, i32 %index
  %7 = bitcast float* %6 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %7, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; fms2: z[i] = y[i] - (x[i] * a), written as fmul + fsub. The recorded output
; splats the scalar (vdup.32) outside the loop and uses the vector-vector
; vfms.f32, accumulating directly into the register loaded from %y.
define arm_aapcs_vfpcc void @fms2(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r4, pc}
; CHECK-NEXT:  .LBB9_1: @ %vector.ph
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vdup.32 q0, r4
; CHECK-NEXT:    mov.w r12, #0
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB9_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    add.w r12, r12, #4
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vfms.f32 q2, q1, q0
; CHECK-NEXT:    vstrw.32 q2, [r2], #16
; CHECK-NEXT:    letp lr, .LBB9_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, pc}
entry:
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %n, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %0 = getelementptr inbounds float, float* %x, i32 %index

  ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)

  %2 = bitcast float* %0 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
  %3 = getelementptr inbounds float, float* %y, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef)
  %5 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat14
  %6 = fsub fast <4 x float> %wide.masked.load12, %5
  %7 = getelementptr inbounds float, float* %z, i32 %index
  %8 = bitcast float* %7 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @fms3(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) {
; CHECK-LABEL: fms3:
; CHECK:       @ %bb.0: @ %entry
;
CHECK-NEXT: .save {r4, lr} 655; CHECK-NEXT: push {r4, lr} 656; CHECK-NEXT: cmp r3, #1 657; CHECK-NEXT: it lt 658; CHECK-NEXT: poplt {r4, pc} 659; CHECK-NEXT: .LBB10_1: @ %vector.ph 660; CHECK-NEXT: vmov r12, s0 661; CHECK-NEXT: movs r4, #0 662; CHECK-NEXT: dlstp.32 lr, r3 663; CHECK-NEXT: .LBB10_2: @ %vector.body 664; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 665; CHECK-NEXT: vldrw.u32 q0, [r0], #16 666; CHECK-NEXT: vldrw.u32 q1, [r1], #16 667; CHECK-NEXT: adds r4, #4 668; CHECK-NEXT: vneg.f32 q1, q1 669; CHECK-NEXT: vfma.f32 q1, q0, r12 670; CHECK-NEXT: vstrw.32 q1, [r2], #16 671; CHECK-NEXT: letp lr, .LBB10_2 672; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 673; CHECK-NEXT: pop {r4, pc} 674entry: 675 %cmp8 = icmp sgt i32 %n, 0 676 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup 677 678vector.ph: ; preds = %entry 679 %n.rnd.up = add i32 %n, 3 680 %n.vec = and i32 %n.rnd.up, -4 681 %trip.count.minus.1 = add i32 %n, -1 682 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 683 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer 684 %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0 685 %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer 686 br label %vector.body 687 688vector.body: ; preds = %vector.body, %vector.ph 689 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 690 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 691 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer 692 %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> 693 %0 = getelementptr inbounds float, float* %x, i32 %index 694 695 ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 696 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) 697 698 %2 = 
bitcast float* %0 to <4 x float>* 699 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) 700 %3 = getelementptr inbounds float, float* %y, i32 %index 701 %4 = bitcast float* %3 to <4 x float>* 702 %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %1, <4 x float> undef) 703 %5 = fneg fast <4 x float> %wide.masked.load12 704 %6 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %5) 705 %7 = getelementptr inbounds float, float* %z, i32 %index 706 %8 = bitcast float* %7 to <4 x float>* 707 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1) 708 %index.next = add i32 %index, 4 709 %9 = icmp eq i32 %index.next, %n.vec 710 br i1 %9, label %for.cond.cleanup, label %vector.body 711 712for.cond.cleanup: ; preds = %vector.body, %entry 713 ret void 714} 715 716define arm_aapcs_vfpcc void @fms4(float* nocapture readonly %x, float* nocapture readonly %y, float* noalias nocapture %z, float %a, i32 %n) { 717; CHECK-LABEL: fms4: 718; CHECK: @ %bb.0: @ %entry 719; CHECK-NEXT: .save {r4, lr} 720; CHECK-NEXT: push {r4, lr} 721; CHECK-NEXT: cmp r3, #1 722; CHECK-NEXT: it lt 723; CHECK-NEXT: poplt {r4, pc} 724; CHECK-NEXT: .LBB11_1: @ %vector.ph 725; CHECK-NEXT: vmov r12, s0 726; CHECK-NEXT: movs r4, #0 727; CHECK-NEXT: dlstp.32 lr, r3 728; CHECK-NEXT: .LBB11_2: @ %vector.body 729; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 730; CHECK-NEXT: vldrw.u32 q0, [r0], #16 731; CHECK-NEXT: vldrw.u32 q1, [r1], #16 732; CHECK-NEXT: adds r4, #4 733; CHECK-NEXT: vneg.f32 q1, q1 734; CHECK-NEXT: vfma.f32 q1, q0, r12 735; CHECK-NEXT: vstrw.32 q1, [r2], #16 736; CHECK-NEXT: letp lr, .LBB11_2 737; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 738; CHECK-NEXT: pop {r4, pc} 739entry: 740 %cmp8 = icmp sgt i32 %n, 0 741 br i1 %cmp8, label %vector.ph, label %for.cond.cleanup 
742 743vector.ph: ; preds = %entry 744 %n.rnd.up = add i32 %n, 3 745 %n.vec = and i32 %n.rnd.up, -4 746 %trip.count.minus.1 = add i32 %n, -1 747 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 748 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer 749 %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0 750 %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer 751 br label %vector.body 752 753vector.body: ; preds = %vector.body, %vector.ph 754 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 755 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 756 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer 757 %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> 758 %0 = getelementptr inbounds float, float* %x, i32 %index 759 760 ; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11 761 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) 762 763 %2 = bitcast float* %0 to <4 x float>* 764 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef) 765 %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13 766 %4 = getelementptr inbounds float, float* %y, i32 %index 767 %5 = bitcast float* %4 to <4 x float>* 768 %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %5, i32 4, <4 x i1> %1, <4 x float> undef) 769 %6 = fsub fast <4 x float> %3, %wide.masked.load14 770 %7 = getelementptr inbounds float, float* %z, i32 %index 771 %8 = bitcast float* %7 to <4 x float>* 772 call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %6, <4 x float>* %8, i32 4, <4 x i1> %1) 773 %index.next = add i32 %index, 4 774 %9 = icmp eq i32 %index.next, %n.vec 775 
br i1 %9, label %for.cond.cleanup, label %vector.body 776 777for.cond.cleanup: ; preds = %vector.body, %entry 778 ret void 779} 780 781declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) 782declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) 783declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) 784declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) 785