; NOTE(review): Autogenerated llc/FileCheck regression test for Arm MVE tail-predicated
; DCT kernels (thumbv8.1m.main + mve.fp). The CHECK lines below are machine-generated by
; utils/update_llc_test_checks.py and are byte-coupled to llc's exact codegen — do not
; hand-edit them; regenerate with the script instead. This chunk also appears to have its
; original line numbers fused into the text by an extraction step — TODO confirm against
; the pristine file before committing. Content below is intentionally left byte-identical.
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s 3 4%struct.DCT_InstanceTypeDef = type { float*, i32, i32 } 5 6define void @DCT_mve1(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { 7; CHECK-LABEL: DCT_mve1: 8; CHECK: @ %bb.0: @ %entry 9; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} 10; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} 11; CHECK-NEXT: ldr r3, [r0, #4] 12; CHECK-NEXT: sub.w r12, r3, #1 13; CHECK-NEXT: cmp.w r12, #2 14; CHECK-NEXT: blo .LBB0_5 15; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 16; CHECK-NEXT: ldr r5, [r0, #8] 17; CHECK-NEXT: ldr r3, [r0] 18; CHECK-NEXT: add.w r4, r3, r5, lsl #2 19; CHECK-NEXT: movs r0, #1 20; CHECK-NEXT: lsl.w r9, r5, #2 21; CHECK-NEXT: .LBB0_2: @ %for.body 22; CHECK-NEXT: @ =>This Loop Header: Depth=1 23; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 24; CHECK-NEXT: vmov.i32 q0, #0x0 25; CHECK-NEXT: dlstp.32 lr, r5 26; CHECK-NEXT: mov r7, r1 27; CHECK-NEXT: mov r3, r4 28; CHECK-NEXT: mov r6, r5 29; CHECK-NEXT: .LBB0_3: @ %vector.body 30; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 31; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 32; CHECK-NEXT: vldrw.u32 q1, [r7], #16 33; CHECK-NEXT: vldrw.u32 q2, [r3], #16 34; CHECK-NEXT: vfma.f32 q0, q2, q1 35; CHECK-NEXT: letp lr, .LBB0_3 36; CHECK-NEXT: @ %bb.4: @ %middle.block 37; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 38; CHECK-NEXT: vadd.f32 s4, s2, s3 39; CHECK-NEXT: add.w r3, r2, r0, lsl #2 40; CHECK-NEXT: vadd.f32 s0, s0, s1 41; CHECK-NEXT: adds r0, #1 42; CHECK-NEXT: add r4, r9 43; CHECK-NEXT: cmp r0, r12 44; CHECK-NEXT: vadd.f32 s0, s0, s4 45; CHECK-NEXT: vstr s0, [r3] 46; CHECK-NEXT: bne .LBB0_2 47; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup 48; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} 49entry: 50 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, 
i32 0, i32 2 51 %0 = load i32, i32* %NumInputs, align 4 52 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 53 %1 = load i32, i32* %NumFilters, align 4 54 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 55 %2 = load float*, float** %pDCTCoefs, align 4 56 %cmp = icmp ugt i32 %0, 1 57 tail call void @llvm.assume(i1 %cmp) 58 %sub = add i32 %1, -1 59 %cmp350 = icmp ugt i32 %sub, 1 60 br i1 %cmp350, label %for.body.preheader, label %for.cond.cleanup 61 62for.body.preheader: ; preds = %entry 63 %n.rnd.up = add i32 %0, 3 64 %n.vec = and i32 %n.rnd.up, -4 65 br label %for.body 66 67for.cond.cleanup: ; preds = %middle.block, %entry 68 ret void 69 70for.body: ; preds = %for.body.preheader, %middle.block 71 %k2.051 = phi i32 [ %add16, %middle.block ], [ 1, %for.body.preheader ] 72 %mul4 = mul i32 %k2.051, %0 73 br label %vector.body 74 75vector.body: ; preds = %vector.body, %for.body 76 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] 77 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %10, %vector.body ] 78 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) 79 %3 = getelementptr inbounds float, float* %pIn, i32 %index 80 %4 = bitcast float* %3 to <4 x float>* 81 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 82 %5 = add i32 %index, %mul4 83 %6 = getelementptr inbounds float, float* %2, i32 %5 84 %7 = bitcast float* %6 to <4 x float>* 85 %wide.masked.load53 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 86 %8 = fmul fast <4 x float> %wide.masked.load53, %wide.masked.load 87 %9 = fadd fast <4 x float> %8, %vec.phi 88 %10 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi 89 %index.next = add i32 
%index, 4 90 %11 = icmp eq i32 %index.next, %n.vec 91 br i1 %11, label %middle.block, label %vector.body 92 93middle.block: ; preds = %vector.body 94 %12 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %10) 95 %arrayidx14 = getelementptr inbounds float, float* %pOut, i32 %k2.051 96 store float %12, float* %arrayidx14, align 4 97 %add16 = add nuw i32 %k2.051, 1 98 %exitcond52.not = icmp eq i32 %add16, %sub 99 br i1 %exitcond52.not, label %for.cond.cleanup, label %for.body 100} 101 102define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { 103; CHECK-LABEL: DCT_mve2: 104; CHECK: @ %bb.0: @ %entry 105; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 106; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 107; CHECK-NEXT: .pad #4 108; CHECK-NEXT: sub sp, #4 109; CHECK-NEXT: str r1, [sp] @ 4-byte Spill 110; CHECK-NEXT: ldr r1, [r0, #4] 111; CHECK-NEXT: subs r1, #2 112; CHECK-NEXT: cmp r1, #2 113; CHECK-NEXT: blo .LBB1_5 114; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 115; CHECK-NEXT: ldr.w r12, [r0, #8] 116; CHECK-NEXT: movs r4, #1 117; CHECK-NEXT: ldr r3, [r0] 118; CHECK-NEXT: add.w r0, r12, #3 119; CHECK-NEXT: bic r0, r0, #3 120; CHECK-NEXT: add.w r5, r3, r12, lsl #2 121; CHECK-NEXT: subs r0, #4 122; CHECK-NEXT: add.w r7, r3, r12, lsl #3 123; CHECK-NEXT: lsl.w r9, r12, #3 124; CHECK-NEXT: add.w r8, r4, r0, lsr #2 125; CHECK-NEXT: .LBB1_2: @ %for.body 126; CHECK-NEXT: @ =>This Loop Header: Depth=1 127; CHECK-NEXT: @ Child Loop BB1_3 Depth 2 128; CHECK-NEXT: dls lr, r8 129; CHECK-NEXT: ldr r6, [sp] @ 4-byte Reload 130; CHECK-NEXT: vmov.i32 q0, #0x0 131; CHECK-NEXT: add.w r11, r4, #1 132; CHECK-NEXT: mov r3, r5 133; CHECK-NEXT: mov r0, r7 134; CHECK-NEXT: vmov q1, q0 135; CHECK-NEXT: mov r10, r12 136; CHECK-NEXT: .LBB1_3: @ %vector.body 137; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1 138; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 139; 
CHECK-NEXT: vctp.32 r10 140; CHECK-NEXT: sub.w r10, r10, #4 141; CHECK-NEXT: vpstttt 142; CHECK-NEXT: vldrwt.u32 q2, [r6], #16 143; CHECK-NEXT: vldrwt.u32 q3, [r3], #16 144; CHECK-NEXT: vfmat.f32 q1, q3, q2 145; CHECK-NEXT: vldrwt.u32 q3, [r0], #16 146; CHECK-NEXT: vpst 147; CHECK-NEXT: vfmat.f32 q0, q3, q2 148; CHECK-NEXT: le lr, .LBB1_3 149; CHECK-NEXT: @ %bb.4: @ %middle.block 150; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1 151; CHECK-NEXT: vadd.f32 s8, s2, s3 152; CHECK-NEXT: add.w r0, r2, r11, lsl #2 153; CHECK-NEXT: vadd.f32 s0, s0, s1 154; CHECK-NEXT: add r5, r9 155; CHECK-NEXT: vadd.f32 s2, s6, s7 156; CHECK-NEXT: add r7, r9 157; CHECK-NEXT: vadd.f32 s4, s4, s5 158; CHECK-NEXT: vadd.f32 s0, s0, s8 159; CHECK-NEXT: vadd.f32 s2, s4, s2 160; CHECK-NEXT: vstr s0, [r0] 161; CHECK-NEXT: add.w r0, r2, r4, lsl #2 162; CHECK-NEXT: adds r4, #2 163; CHECK-NEXT: cmp r4, r1 164; CHECK-NEXT: vstr s2, [r0] 165; CHECK-NEXT: blo .LBB1_2 166; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup 167; CHECK-NEXT: add sp, #4 168; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 169entry: 170 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 171 %0 = load i32, i32* %NumInputs, align 4 172 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 173 %1 = load i32, i32* %NumFilters, align 4 174 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 175 %2 = load float*, float** %pDCTCoefs, align 4 176 %cmp = icmp ugt i32 %0, 1 177 tail call void @llvm.assume(i1 %cmp) 178 %sub = add i32 %1, -2 179 %cmp371 = icmp ugt i32 %sub, 1 180 br i1 %cmp371, label %for.body.preheader, label %for.cond.cleanup 181 182for.body.preheader: ; preds = %entry 183 %n.rnd.up = add i32 %0, 3 184 %n.vec = and i32 %n.rnd.up, -4 185 br label %for.body 186 187for.cond.cleanup: ; preds = %middle.block, %entry 188 ret void 189 190for.body: ; 
preds = %for.body.preheader, %middle.block 191 %k2.072 = phi i32 [ %add25, %middle.block ], [ 1, %for.body.preheader ] 192 %mul4 = mul i32 %k2.072, %0 193 %add = add nuw i32 %k2.072, 1 194 %mul5 = mul i32 %add, %0 195 br label %vector.body 196 197vector.body: ; preds = %vector.body, %for.body 198 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] 199 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %15, %vector.body ] 200 %vec.phi73 = phi <4 x float> [ zeroinitializer, %for.body ], [ %16, %vector.body ] 201 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) 202 %3 = getelementptr inbounds float, float* %pIn, i32 %index 203 %4 = bitcast float* %3 to <4 x float>* 204 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 205 %5 = add i32 %index, %mul4 206 %6 = getelementptr inbounds float, float* %2, i32 %5 207 %7 = bitcast float* %6 to <4 x float>* 208 %wide.masked.load74 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 209 %8 = fmul fast <4 x float> %wide.masked.load74, %wide.masked.load 210 %9 = fadd fast <4 x float> %8, %vec.phi73 211 %10 = add i32 %index, %mul5 212 %11 = getelementptr inbounds float, float* %2, i32 %10 213 %12 = bitcast float* %11 to <4 x float>* 214 %wide.masked.load75 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 215 %13 = fmul fast <4 x float> %wide.masked.load75, %wide.masked.load 216 %14 = fadd fast <4 x float> %13, %vec.phi 217 %15 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi 218 %16 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi73 219 %index.next = add i32 %index, 4 220 %17 = icmp eq i32 %index.next, %n.vec 221 br i1 %17, label %middle.block, label %vector.body 222 223middle.block: ; 
preds = %vector.body 224 %18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %16) 225 %19 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %15) 226 %arrayidx21 = getelementptr inbounds float, float* %pOut, i32 %k2.072 227 store float %18, float* %arrayidx21, align 4 228 %arrayidx23 = getelementptr inbounds float, float* %pOut, i32 %add 229 store float %19, float* %arrayidx23, align 4 230 %add25 = add i32 %k2.072, 2 231 %cmp3 = icmp ult i32 %add25, %sub 232 br i1 %cmp3, label %for.body, label %for.cond.cleanup 233} 234 235define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { 236; CHECK-LABEL: DCT_mve3: 237; CHECK: @ %bb.0: @ %entry 238; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 239; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 240; CHECK-NEXT: .pad #4 241; CHECK-NEXT: sub sp, #4 242; CHECK-NEXT: .vsave {d8, d9} 243; CHECK-NEXT: vpush {d8, d9} 244; CHECK-NEXT: .pad #16 245; CHECK-NEXT: sub sp, #16 246; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill 247; CHECK-NEXT: ldr r1, [r0, #4] 248; CHECK-NEXT: subs r1, #3 249; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill 250; CHECK-NEXT: cmp r1, #2 251; CHECK-NEXT: blo .LBB2_5 252; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 253; CHECK-NEXT: ldr r7, [r0, #8] 254; CHECK-NEXT: movs r5, #1 255; CHECK-NEXT: ldr r3, [r0] 256; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill 257; CHECK-NEXT: add.w r0, r7, r7, lsl #1 258; CHECK-NEXT: add.w r12, r3, r7, lsl #2 259; CHECK-NEXT: add.w r1, r3, r7, lsl #3 260; CHECK-NEXT: add.w r8, r3, r0, lsl #2 261; CHECK-NEXT: adds r3, r7, #3 262; CHECK-NEXT: bic r3, r3, #3 263; CHECK-NEXT: lsls r7, r0, #2 264; CHECK-NEXT: subs r3, #4 265; CHECK-NEXT: add.w r3, r5, r3, lsr #2 266; CHECK-NEXT: str r3, [sp] @ 4-byte Spill 267; CHECK-NEXT: .LBB2_2: @ %for.body 268; CHECK-NEXT: @ =>This Loop Header: Depth=1 269; CHECK-NEXT: @ Child Loop BB2_3 Depth 
2 270; CHECK-NEXT: ldrd r0, r10, [sp] @ 8-byte Folded Reload 271; CHECK-NEXT: vmov.i32 q0, #0x0 272; CHECK-NEXT: add.w r9, r5, #2 273; CHECK-NEXT: add.w r11, r5, #1 274; CHECK-NEXT: dls lr, r0 275; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload 276; CHECK-NEXT: mov r3, r12 277; CHECK-NEXT: mov r0, r1 278; CHECK-NEXT: mov r4, r8 279; CHECK-NEXT: vmov q2, q0 280; CHECK-NEXT: vmov q1, q0 281; CHECK-NEXT: .LBB2_3: @ %vector.body 282; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1 283; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 284; CHECK-NEXT: vctp.32 r10 285; CHECK-NEXT: sub.w r10, r10, #4 286; CHECK-NEXT: vpstttt 287; CHECK-NEXT: vldrwt.u32 q3, [r6], #16 288; CHECK-NEXT: vldrwt.u32 q4, [r3], #16 289; CHECK-NEXT: vfmat.f32 q1, q4, q3 290; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 291; CHECK-NEXT: vpsttt 292; CHECK-NEXT: vfmat.f32 q2, q4, q3 293; CHECK-NEXT: vldrwt.u32 q4, [r4], #16 294; CHECK-NEXT: vfmat.f32 q0, q4, q3 295; CHECK-NEXT: le lr, .LBB2_3 296; CHECK-NEXT: @ %bb.4: @ %middle.block 297; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1 298; CHECK-NEXT: vadd.f32 s12, s10, s11 299; CHECK-NEXT: add.w r0, r2, r11, lsl #2 300; CHECK-NEXT: vadd.f32 s8, s8, s9 301; CHECK-NEXT: add r12, r7 302; CHECK-NEXT: vadd.f32 s10, s6, s7 303; CHECK-NEXT: add r1, r7 304; CHECK-NEXT: vadd.f32 s4, s4, s5 305; CHECK-NEXT: add r8, r7 306; CHECK-NEXT: vadd.f32 s6, s2, s3 307; CHECK-NEXT: vadd.f32 s0, s0, s1 308; CHECK-NEXT: vadd.f32 s2, s8, s12 309; CHECK-NEXT: vadd.f32 s4, s4, s10 310; CHECK-NEXT: vadd.f32 s0, s0, s6 311; CHECK-NEXT: vstr s2, [r0] 312; CHECK-NEXT: add.w r0, r2, r5, lsl #2 313; CHECK-NEXT: adds r5, #3 314; CHECK-NEXT: vstr s4, [r0] 315; CHECK-NEXT: add.w r0, r2, r9, lsl #2 316; CHECK-NEXT: vstr s0, [r0] 317; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload 318; CHECK-NEXT: cmp r5, r0 319; CHECK-NEXT: blo .LBB2_2 320; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup 321; CHECK-NEXT: add sp, #16 322; CHECK-NEXT: vpop {d8, d9} 323; CHECK-NEXT: add sp, #4 324; CHECK-NEXT: pop.w {r4, r5, r6, r7, 
r8, r9, r10, r11, pc} 325entry: 326 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 327 %0 = load i32, i32* %NumInputs, align 4 328 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 329 %1 = load i32, i32* %NumFilters, align 4 330 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 331 %2 = load float*, float** %pDCTCoefs, align 4 332 %cmp = icmp ugt i32 %0, 1 333 tail call void @llvm.assume(i1 %cmp) 334 %sub = add i32 %1, -3 335 %cmp392 = icmp ugt i32 %sub, 1 336 br i1 %cmp392, label %for.body.preheader, label %for.cond.cleanup 337 338for.body.preheader: ; preds = %entry 339 %n.rnd.up = add i32 %0, 3 340 %n.vec = and i32 %n.rnd.up, -4 341 br label %for.body 342 343for.cond.cleanup: ; preds = %middle.block, %entry 344 ret void 345 346for.body: ; preds = %for.body.preheader, %middle.block 347 %k2.093 = phi i32 [ %add34, %middle.block ], [ 1, %for.body.preheader ] 348 %mul4 = mul i32 %k2.093, %0 349 %add = add nuw i32 %k2.093, 1 350 %mul5 = mul i32 %add, %0 351 %add6 = add i32 %k2.093, 2 352 %mul7 = mul i32 %add6, %0 353 br label %vector.body 354 355vector.body: ; preds = %vector.body, %for.body 356 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] 357 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %20, %vector.body ] 358 %vec.phi94 = phi <4 x float> [ zeroinitializer, %for.body ], [ %21, %vector.body ] 359 %vec.phi95 = phi <4 x float> [ zeroinitializer, %for.body ], [ %22, %vector.body ] 360 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) 361 %3 = getelementptr inbounds float, float* %pIn, i32 %index 362 %4 = bitcast float* %3 to <4 x float>* 363 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 364 %5 = add i32 %index, %mul4 
365 %6 = getelementptr inbounds float, float* %2, i32 %5 366 %7 = bitcast float* %6 to <4 x float>* 367 %wide.masked.load96 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 368 %8 = fmul fast <4 x float> %wide.masked.load96, %wide.masked.load 369 %9 = fadd fast <4 x float> %8, %vec.phi95 370 %10 = add i32 %index, %mul5 371 %11 = getelementptr inbounds float, float* %2, i32 %10 372 %12 = bitcast float* %11 to <4 x float>* 373 %wide.masked.load97 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 374 %13 = fmul fast <4 x float> %wide.masked.load97, %wide.masked.load 375 %14 = fadd fast <4 x float> %13, %vec.phi94 376 %15 = add i32 %index, %mul7 377 %16 = getelementptr inbounds float, float* %2, i32 %15 378 %17 = bitcast float* %16 to <4 x float>* 379 %wide.masked.load98 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 380 %18 = fmul fast <4 x float> %wide.masked.load98, %wide.masked.load 381 %19 = fadd fast <4 x float> %18, %vec.phi 382 %20 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi 383 %21 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi94 384 %22 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi95 385 %index.next = add i32 %index, 4 386 %23 = icmp eq i32 %index.next, %n.vec 387 br i1 %23, label %middle.block, label %vector.body 388 389middle.block: ; preds = %vector.body 390 %24 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %22) 391 %25 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %21) 392 %26 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %20) 393 %arrayidx28 = getelementptr inbounds float, float* %pOut, i32 %k2.093 394 store float %24, float* %arrayidx28, align 4 
395 %arrayidx30 = getelementptr inbounds float, float* %pOut, i32 %add 396 store float %25, float* %arrayidx30, align 4 397 %arrayidx32 = getelementptr inbounds float, float* %pOut, i32 %add6 398 store float %26, float* %arrayidx32, align 4 399 %add34 = add i32 %k2.093, 3 400 %cmp3 = icmp ult i32 %add34, %sub 401 br i1 %cmp3, label %for.body, label %for.cond.cleanup 402} 403 404define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { 405; CHECK-LABEL: DCT_mve4: 406; CHECK: @ %bb.0: @ %entry 407; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 408; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 409; CHECK-NEXT: .pad #4 410; CHECK-NEXT: sub sp, #4 411; CHECK-NEXT: .vsave {d8, d9, d10, d11} 412; CHECK-NEXT: vpush {d8, d9, d10, d11} 413; CHECK-NEXT: .pad #32 414; CHECK-NEXT: sub sp, #32 415; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill 416; CHECK-NEXT: ldr r1, [r0, #4] 417; CHECK-NEXT: subs r1, #4 418; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill 419; CHECK-NEXT: cmp r1, #2 420; CHECK-NEXT: blo.w .LBB3_5 421; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 422; CHECK-NEXT: ldr r3, [r0, #8] 423; CHECK-NEXT: movs r6, #1 424; CHECK-NEXT: ldr r1, [r0] 425; CHECK-NEXT: add.w r0, r3, r3, lsl #1 426; CHECK-NEXT: add.w r8, r1, r3, lsl #2 427; CHECK-NEXT: add.w r12, r1, r3, lsl #3 428; CHECK-NEXT: add.w r10, r1, r3, lsl #4 429; CHECK-NEXT: add.w r9, r1, r0, lsl #2 430; CHECK-NEXT: adds r0, r3, #3 431; CHECK-NEXT: bic r0, r0, #3 432; CHECK-NEXT: lsls r7, r3, #4 433; CHECK-NEXT: subs r0, #4 434; CHECK-NEXT: add.w r0, r6, r0, lsr #2 435; CHECK-NEXT: strd r0, r3, [sp, #4] @ 8-byte Folded Spill 436; CHECK-NEXT: .LBB3_2: @ %for.body 437; CHECK-NEXT: @ =>This Loop Header: Depth=1 438; CHECK-NEXT: @ Child Loop BB3_3 Depth 2 439; CHECK-NEXT: adds r0, r6, #3 440; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill 441; CHECK-NEXT: adds r0, r6, #2 442; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill 443; 
CHECK-NEXT: adds r0, r6, #1 444; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill 445; CHECK-NEXT: ldrd r0, r11, [sp, #4] @ 8-byte Folded Reload 446; CHECK-NEXT: vmov.i32 q0, #0x0 447; CHECK-NEXT: mov r3, r8 448; CHECK-NEXT: mov r5, r9 449; CHECK-NEXT: dls lr, r0 450; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload 451; CHECK-NEXT: mov r0, r12 452; CHECK-NEXT: mov r4, r10 453; CHECK-NEXT: vmov q1, q0 454; CHECK-NEXT: vmov q2, q0 455; CHECK-NEXT: vmov q3, q0 456; CHECK-NEXT: .LBB3_3: @ %vector.body 457; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1 458; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 459; CHECK-NEXT: vctp.32 r11 460; CHECK-NEXT: sub.w r11, r11, #4 461; CHECK-NEXT: vpstttt 462; CHECK-NEXT: vldrwt.u32 q4, [r1], #16 463; CHECK-NEXT: vldrwt.u32 q5, [r0], #16 464; CHECK-NEXT: vfmat.f32 q3, q5, q4 465; CHECK-NEXT: vldrwt.u32 q5, [r3], #16 466; CHECK-NEXT: vpstttt 467; CHECK-NEXT: vfmat.f32 q2, q5, q4 468; CHECK-NEXT: vldrwt.u32 q5, [r5], #16 469; CHECK-NEXT: vfmat.f32 q1, q5, q4 470; CHECK-NEXT: vldrwt.u32 q5, [r4], #16 471; CHECK-NEXT: vpst 472; CHECK-NEXT: vfmat.f32 q0, q5, q4 473; CHECK-NEXT: le lr, .LBB3_3 474; CHECK-NEXT: @ %bb.4: @ %middle.block 475; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1 476; CHECK-NEXT: vadd.f32 s16, s14, s15 477; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload 478; CHECK-NEXT: vadd.f32 s12, s12, s13 479; CHECK-NEXT: add r8, r7 480; CHECK-NEXT: vadd.f32 s14, s10, s11 481; CHECK-NEXT: add r12, r7 482; CHECK-NEXT: vadd.f32 s8, s8, s9 483; CHECK-NEXT: add.w r0, r2, r0, lsl #2 484; CHECK-NEXT: vadd.f32 s10, s6, s7 485; CHECK-NEXT: add r9, r7 486; CHECK-NEXT: vadd.f32 s4, s4, s5 487; CHECK-NEXT: add r10, r7 488; CHECK-NEXT: vadd.f32 s6, s2, s3 489; CHECK-NEXT: vadd.f32 s0, s0, s1 490; CHECK-NEXT: vadd.f32 s2, s12, s16 491; CHECK-NEXT: vadd.f32 s8, s8, s14 492; CHECK-NEXT: vadd.f32 s4, s4, s10 493; CHECK-NEXT: vadd.f32 s0, s0, s6 494; CHECK-NEXT: vstr s2, [r0] 495; CHECK-NEXT: add.w r0, r2, r6, lsl #2 496; CHECK-NEXT: adds r6, #4 497; 
CHECK-NEXT: vstr s8, [r0] 498; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload 499; CHECK-NEXT: add.w r0, r2, r0, lsl #2 500; CHECK-NEXT: vstr s4, [r0] 501; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload 502; CHECK-NEXT: add.w r0, r2, r0, lsl #2 503; CHECK-NEXT: vstr s0, [r0] 504; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload 505; CHECK-NEXT: cmp r6, r0 506; CHECK-NEXT: blo .LBB3_2 507; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup 508; CHECK-NEXT: add sp, #32 509; CHECK-NEXT: vpop {d8, d9, d10, d11} 510; CHECK-NEXT: add sp, #4 511; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 512entry: 513 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 514 %0 = load i32, i32* %NumInputs, align 4 515 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 516 %1 = load i32, i32* %NumFilters, align 4 517 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 518 %2 = load float*, float** %pDCTCoefs, align 4 519 %cmp = icmp ugt i32 %0, 1 520 tail call void @llvm.assume(i1 %cmp) 521 %sub = add i32 %1, -4 522 %cmp3113 = icmp ugt i32 %sub, 1 523 br i1 %cmp3113, label %for.body.preheader, label %for.cond.cleanup 524 525for.body.preheader: ; preds = %entry 526 %n.rnd.up = add i32 %0, 3 527 %n.vec = and i32 %n.rnd.up, -4 528 br label %for.body 529 530for.cond.cleanup: ; preds = %middle.block, %entry 531 ret void 532 533for.body: ; preds = %for.body.preheader, %middle.block 534 %k2.0114 = phi i32 [ %add43, %middle.block ], [ 1, %for.body.preheader ] 535 %mul4 = mul i32 %k2.0114, %0 536 %add = add nuw nsw i32 %k2.0114, 1 537 %mul5 = mul i32 %add, %0 538 %add6 = add nuw nsw i32 %k2.0114, 2 539 %mul7 = mul i32 %add6, %0 540 %add8 = add i32 %k2.0114, 3 541 %mul9 = mul i32 %add8, %0 542 br label %vector.body 543 544vector.body: ; preds = %vector.body, %for.body 545 %index = phi i32 [ 0, %for.body ], [ %index.next, 
%vector.body ] 546 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %25, %vector.body ] 547 %vec.phi115 = phi <4 x float> [ zeroinitializer, %for.body ], [ %26, %vector.body ] 548 %vec.phi116 = phi <4 x float> [ zeroinitializer, %for.body ], [ %27, %vector.body ] 549 %vec.phi117 = phi <4 x float> [ zeroinitializer, %for.body ], [ %28, %vector.body ] 550 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) 551 %3 = getelementptr inbounds float, float* %pIn, i32 %index 552 %4 = bitcast float* %3 to <4 x float>* 553 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 554 %5 = add i32 %index, %mul4 555 %6 = getelementptr inbounds float, float* %2, i32 %5 556 %7 = bitcast float* %6 to <4 x float>* 557 %wide.masked.load118 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 558 %8 = fmul fast <4 x float> %wide.masked.load118, %wide.masked.load 559 %9 = fadd fast <4 x float> %8, %vec.phi116 560 %10 = add i32 %index, %mul5 561 %11 = getelementptr inbounds float, float* %2, i32 %10 562 %12 = bitcast float* %11 to <4 x float>* 563 %wide.masked.load119 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 564 %13 = fmul fast <4 x float> %wide.masked.load119, %wide.masked.load 565 %14 = fadd fast <4 x float> %13, %vec.phi117 566 %15 = add i32 %index, %mul7 567 %16 = getelementptr inbounds float, float* %2, i32 %15 568 %17 = bitcast float* %16 to <4 x float>* 569 %wide.masked.load120 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 570 %18 = fmul fast <4 x float> %wide.masked.load120, %wide.masked.load 571 %19 = fadd fast <4 x float> %18, %vec.phi115 572 %20 = add i32 %index, %mul9 573 %21 = getelementptr inbounds float, float* %2, 
i32 %20 574 %22 = bitcast float* %21 to <4 x float>* 575 %wide.masked.load121 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 576 %23 = fmul fast <4 x float> %wide.masked.load121, %wide.masked.load 577 %24 = fadd fast <4 x float> %23, %vec.phi 578 %25 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi 579 %26 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi115 580 %27 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi116 581 %28 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi117 582 %index.next = add i32 %index, 4 583 %29 = icmp eq i32 %index.next, %n.vec 584 br i1 %29, label %middle.block, label %vector.body 585 586middle.block: ; preds = %vector.body 587 %30 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %28) 588 %31 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %27) 589 %32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %26) 590 %33 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %25) 591 %arrayidx35 = getelementptr inbounds float, float* %pOut, i32 %k2.0114 592 store float %31, float* %arrayidx35, align 4 593 %arrayidx37 = getelementptr inbounds float, float* %pOut, i32 %add 594 store float %30, float* %arrayidx37, align 4 595 %arrayidx39 = getelementptr inbounds float, float* %pOut, i32 %add6 596 store float %32, float* %arrayidx39, align 4 597 %arrayidx41 = getelementptr inbounds float, float* %pOut, i32 %add8 598 store float %33, float* %arrayidx41, align 4 599 %add43 = add i32 %k2.0114, 4 600 %cmp3 = icmp ult i32 %add43, %sub 601 br i1 %cmp3, label %for.body, label %for.cond.cleanup 602} 603 604define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { 605; CHECK-LABEL: DCT_mve5: 
606; CHECK: @ %bb.0: @ %entry 607; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 608; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 609; CHECK-NEXT: .pad #4 610; CHECK-NEXT: sub sp, #4 611; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 612; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 613; CHECK-NEXT: .pad #32 614; CHECK-NEXT: sub sp, #32 615; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill 616; CHECK-NEXT: ldr r1, [r0, #4] 617; CHECK-NEXT: subs r1, #5 618; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill 619; CHECK-NEXT: cmp r1, #2 620; CHECK-NEXT: blo.w .LBB4_5 621; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 622; CHECK-NEXT: ldr r3, [r0, #8] 623; CHECK-NEXT: ldr r1, [r0] 624; CHECK-NEXT: adds r0, r3, #3 625; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill 626; CHECK-NEXT: bic r0, r0, #3 627; CHECK-NEXT: add.w r8, r1, r3, lsl #2 628; CHECK-NEXT: subs r1, r0, #4 629; CHECK-NEXT: movs r0, #1 630; CHECK-NEXT: lsls r5, r3, #2 631; CHECK-NEXT: add.w r1, r0, r1, lsr #2 632; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill 633; CHECK-NEXT: add.w r1, r3, r3, lsl #2 634; CHECK-NEXT: lsls r1, r1, #2 635; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill 636; CHECK-NEXT: .LBB4_2: @ %for.body 637; CHECK-NEXT: @ =>This Loop Header: Depth=1 638; CHECK-NEXT: @ Child Loop BB4_3 Depth 2 639; CHECK-NEXT: adds r1, r0, #4 640; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill 641; CHECK-NEXT: adds r1, r0, #3 642; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill 643; CHECK-NEXT: ldrd r1, r11, [sp, #8] @ 8-byte Folded Reload 644; CHECK-NEXT: vmov.i32 q1, #0x0 645; CHECK-NEXT: add.w r10, r0, #2 646; CHECK-NEXT: adds r7, r0, #1 647; CHECK-NEXT: dls lr, r1 648; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload 649; CHECK-NEXT: mov r3, r8 650; CHECK-NEXT: vmov q0, q1 651; CHECK-NEXT: vmov q3, q1 652; CHECK-NEXT: vmov q2, q1 653; CHECK-NEXT: vmov q4, q1 654; CHECK-NEXT: .LBB4_3: @ %vector.body 655; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1 656; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 657; 
CHECK-NEXT: add.w r9, r3, r5 658; CHECK-NEXT: vctp.32 r11 659; CHECK-NEXT: vpsttt 660; CHECK-NEXT: vldrwt.u32 q5, [r1], #16 661; CHECK-NEXT: vldrwt.u32 q6, [r3], #16 662; CHECK-NEXT: vfmat.f32 q3, q6, q5 663; CHECK-NEXT: add.w r12, r9, r5 664; CHECK-NEXT: vpstt 665; CHECK-NEXT: vldrwt.u32 q6, [r9] 666; CHECK-NEXT: vfmat.f32 q4, q6, q5 667; CHECK-NEXT: sub.w r11, r11, #4 668; CHECK-NEXT: add.w r4, r12, r5 669; CHECK-NEXT: vpstt 670; CHECK-NEXT: vldrwt.u32 q6, [r12] 671; CHECK-NEXT: vfmat.f32 q2, q6, q5 672; CHECK-NEXT: adds r6, r4, r5 673; CHECK-NEXT: vpstttt 674; CHECK-NEXT: vldrwt.u32 q6, [r4] 675; CHECK-NEXT: vfmat.f32 q0, q6, q5 676; CHECK-NEXT: vldrwt.u32 q6, [r6] 677; CHECK-NEXT: vfmat.f32 q1, q6, q5 678; CHECK-NEXT: le lr, .LBB4_3 679; CHECK-NEXT: @ %bb.4: @ %middle.block 680; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1 681; CHECK-NEXT: vadd.f32 s20, s18, s19 682; CHECK-NEXT: add.w r1, r2, r7, lsl #2 683; CHECK-NEXT: vadd.f32 s16, s16, s17 684; CHECK-NEXT: vadd.f32 s18, s14, s15 685; CHECK-NEXT: vadd.f32 s12, s12, s13 686; CHECK-NEXT: vadd.f32 s14, s6, s7 687; CHECK-NEXT: vadd.f32 s4, s4, s5 688; CHECK-NEXT: vadd.f32 s6, s10, s11 689; CHECK-NEXT: vadd.f32 s8, s8, s9 690; CHECK-NEXT: vadd.f32 s10, s2, s3 691; CHECK-NEXT: vadd.f32 s0, s0, s1 692; CHECK-NEXT: vadd.f32 s2, s16, s20 693; CHECK-NEXT: vadd.f32 s12, s12, s18 694; CHECK-NEXT: vadd.f32 s4, s4, s14 695; CHECK-NEXT: vadd.f32 s6, s8, s6 696; CHECK-NEXT: vadd.f32 s0, s0, s10 697; CHECK-NEXT: vstr s2, [r1] 698; CHECK-NEXT: add.w r1, r2, r0, lsl #2 699; CHECK-NEXT: adds r0, #5 700; CHECK-NEXT: vstr s12, [r1] 701; CHECK-NEXT: add.w r1, r2, r10, lsl #2 702; CHECK-NEXT: vstr s6, [r1] 703; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload 704; CHECK-NEXT: add.w r1, r2, r1, lsl #2 705; CHECK-NEXT: vstr s0, [r1] 706; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload 707; CHECK-NEXT: add.w r1, r2, r1, lsl #2 708; CHECK-NEXT: vstr s4, [r1] 709; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload 710; CHECK-NEXT: add r8, r1 711; 
CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload 712; CHECK-NEXT: cmp r0, r1 713; CHECK-NEXT: blo.w .LBB4_2 714; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup 715; CHECK-NEXT: add sp, #32 716; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 717; CHECK-NEXT: add sp, #4 718; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 719entry: 720 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 721 %0 = load i32, i32* %NumInputs, align 4 722 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 723 %1 = load i32, i32* %NumFilters, align 4 724 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 725 %2 = load float*, float** %pDCTCoefs, align 4 726 %cmp = icmp ugt i32 %0, 1 727 tail call void @llvm.assume(i1 %cmp) 728 %sub = add i32 %1, -5 729 %cmp3134 = icmp ugt i32 %sub, 1 730 br i1 %cmp3134, label %for.body.preheader, label %for.cond.cleanup 731 732for.body.preheader: ; preds = %entry 733 %n.rnd.up = add i32 %0, 3 734 %n.vec = and i32 %n.rnd.up, -4 735 br label %for.body 736 737for.cond.cleanup: ; preds = %middle.block, %entry 738 ret void 739 740for.body: ; preds = %for.body.preheader, %middle.block 741 %k2.0135 = phi i32 [ %add52, %middle.block ], [ 1, %for.body.preheader ] 742 %mul4 = mul i32 %k2.0135, %0 743 %add = add nuw i32 %k2.0135, 1 744 %mul5 = mul i32 %add, %0 745 %add6 = add i32 %k2.0135, 2 746 %mul7 = mul i32 %add6, %0 747 %add8 = add i32 %k2.0135, 3 748 %mul9 = mul i32 %add8, %0 749 %add10 = add i32 %k2.0135, 4 750 %mul11 = mul i32 %add10, %0 751 br label %vector.body 752 753vector.body: ; preds = %vector.body, %for.body 754 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] 755 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %30, %vector.body ] 756 %vec.phi136 = phi <4 x float> [ zeroinitializer, %for.body ], [ %31, %vector.body ] 757 %vec.phi137 = phi <4 x 
float> [ zeroinitializer, %for.body ], [ %32, %vector.body ] 758 %vec.phi138 = phi <4 x float> [ zeroinitializer, %for.body ], [ %33, %vector.body ] 759 %vec.phi139 = phi <4 x float> [ zeroinitializer, %for.body ], [ %34, %vector.body ] 760 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) 761 %3 = getelementptr inbounds float, float* %pIn, i32 %index 762 %4 = bitcast float* %3 to <4 x float>* 763 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 764 %5 = add i32 %index, %mul4 765 %6 = getelementptr inbounds float, float* %2, i32 %5 766 %7 = bitcast float* %6 to <4 x float>* 767 %wide.masked.load140 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 768 %8 = fmul fast <4 x float> %wide.masked.load140, %wide.masked.load 769 %9 = fadd fast <4 x float> %8, %vec.phi137 770 %10 = add i32 %index, %mul5 771 %11 = getelementptr inbounds float, float* %2, i32 %10 772 %12 = bitcast float* %11 to <4 x float>* 773 %wide.masked.load141 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 774 %13 = fmul fast <4 x float> %wide.masked.load141, %wide.masked.load 775 %14 = fadd fast <4 x float> %13, %vec.phi139 776 %15 = add i32 %index, %mul7 777 %16 = getelementptr inbounds float, float* %2, i32 %15 778 %17 = bitcast float* %16 to <4 x float>* 779 %wide.masked.load142 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 780 %18 = fmul fast <4 x float> %wide.masked.load142, %wide.masked.load 781 %19 = fadd fast <4 x float> %18, %vec.phi138 782 %20 = add i32 %index, %mul9 783 %21 = getelementptr inbounds float, float* %2, i32 %20 784 %22 = bitcast float* %21 to <4 x float>* 785 %wide.masked.load143 = call <4 x float> 
@llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 786 %23 = fmul fast <4 x float> %wide.masked.load143, %wide.masked.load 787 %24 = fadd fast <4 x float> %23, %vec.phi136 788 %25 = add i32 %index, %mul11 789 %26 = getelementptr inbounds float, float* %2, i32 %25 790 %27 = bitcast float* %26 to <4 x float>* 791 %wide.masked.load144 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 792 %28 = fmul fast <4 x float> %wide.masked.load144, %wide.masked.load 793 %29 = fadd fast <4 x float> %28, %vec.phi 794 %30 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi 795 %31 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi136 796 %32 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi137 797 %33 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi138 798 %34 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi139 799 %index.next = add i32 %index, 4 800 %35 = icmp eq i32 %index.next, %n.vec 801 br i1 %35, label %middle.block, label %vector.body 802 803middle.block: ; preds = %vector.body 804 %36 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %34) 805 %37 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %33) 806 %38 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %32) 807 %39 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %31) 808 %40 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %30) 809 %arrayidx42 = getelementptr inbounds float, float* %pOut, i32 %k2.0135 810 store float %38, float* %arrayidx42, align 4 811 %arrayidx44 = getelementptr inbounds float, float* %pOut, i32 %add 812 store float %36, float* %arrayidx44, align 4 813 %arrayidx46 = getelementptr inbounds float, float* 
%pOut, i32 %add6 814 store float %37, float* %arrayidx46, align 4 815 %arrayidx48 = getelementptr inbounds float, float* %pOut, i32 %add8 816 store float %39, float* %arrayidx48, align 4 817 %arrayidx50 = getelementptr inbounds float, float* %pOut, i32 %add10 818 store float %40, float* %arrayidx50, align 4 819 %add52 = add i32 %k2.0135, 5 820 %cmp3 = icmp ult i32 %add52, %sub 821 br i1 %cmp3, label %for.body, label %for.cond.cleanup 822} 823 824define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { 825; CHECK-LABEL: DCT_mve6: 826; CHECK: @ %bb.0: @ %entry 827; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 828; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 829; CHECK-NEXT: .pad #4 830; CHECK-NEXT: sub sp, #4 831; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 832; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 833; CHECK-NEXT: .pad #32 834; CHECK-NEXT: sub sp, #32 835; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill 836; CHECK-NEXT: ldr r1, [r0, #4] 837; CHECK-NEXT: subs r1, #6 838; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill 839; CHECK-NEXT: cmp r1, #2 840; CHECK-NEXT: blo.w .LBB5_5 841; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 842; CHECK-NEXT: ldr r3, [r0, #8] 843; CHECK-NEXT: ldr r1, [r0] 844; CHECK-NEXT: adds r0, r3, #3 845; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill 846; CHECK-NEXT: bic r0, r0, #3 847; CHECK-NEXT: add.w r9, r1, r3, lsl #2 848; CHECK-NEXT: subs r1, r0, #4 849; CHECK-NEXT: movs r0, #1 850; CHECK-NEXT: lsls r5, r3, #2 851; CHECK-NEXT: add.w r1, r0, r1, lsr #2 852; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill 853; CHECK-NEXT: add.w r1, r3, r3, lsl #1 854; CHECK-NEXT: lsls r1, r1, #3 855; CHECK-NEXT: str r1, [sp] @ 4-byte Spill 856; CHECK-NEXT: .LBB5_2: @ %for.body 857; CHECK-NEXT: @ =>This Loop Header: Depth=1 858; CHECK-NEXT: @ Child Loop BB5_3 Depth 2 859; CHECK-NEXT: adds r1, r0, #5 860; CHECK-NEXT: str r1, [sp, #28] @ 4-byte 
Spill 861; CHECK-NEXT: adds r1, r0, #4 862; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill 863; CHECK-NEXT: adds r1, r0, #3 864; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill 865; CHECK-NEXT: ldrd r1, r8, [sp, #4] @ 8-byte Folded Reload 866; CHECK-NEXT: vmov.i32 q1, #0x0 867; CHECK-NEXT: add.w r11, r0, #2 868; CHECK-NEXT: adds r4, r0, #1 869; CHECK-NEXT: dls lr, r1 870; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload 871; CHECK-NEXT: mov r3, r9 872; CHECK-NEXT: vmov q3, q1 873; CHECK-NEXT: vmov q4, q1 874; CHECK-NEXT: vmov q0, q1 875; CHECK-NEXT: vmov q5, q1 876; CHECK-NEXT: vmov q2, q1 877; CHECK-NEXT: .LBB5_3: @ %vector.body 878; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1 879; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 880; CHECK-NEXT: add.w r12, r3, r5 881; CHECK-NEXT: vctp.32 r8 882; CHECK-NEXT: vpsttt 883; CHECK-NEXT: vldrwt.u32 q6, [r1], #16 884; CHECK-NEXT: vldrwt.u32 q7, [r3], #16 885; CHECK-NEXT: vfmat.f32 q4, q7, q6 886; CHECK-NEXT: add.w r10, r12, r5 887; CHECK-NEXT: vpstt 888; CHECK-NEXT: vldrwt.u32 q7, [r12] 889; CHECK-NEXT: vfmat.f32 q5, q7, q6 890; CHECK-NEXT: add.w r6, r10, r5 891; CHECK-NEXT: vpstt 892; CHECK-NEXT: vldrwt.u32 q7, [r10] 893; CHECK-NEXT: vfmat.f32 q2, q7, q6 894; CHECK-NEXT: sub.w r8, r8, #4 895; CHECK-NEXT: adds r7, r6, r5 896; CHECK-NEXT: vpstt 897; CHECK-NEXT: vldrwt.u32 q7, [r6] 898; CHECK-NEXT: vfmat.f32 q0, q7, q6 899; CHECK-NEXT: adds r6, r7, r5 900; CHECK-NEXT: vpstttt 901; CHECK-NEXT: vldrwt.u32 q7, [r7] 902; CHECK-NEXT: vfmat.f32 q3, q7, q6 903; CHECK-NEXT: vldrwt.u32 q7, [r6] 904; CHECK-NEXT: vfmat.f32 q1, q7, q6 905; CHECK-NEXT: le lr, .LBB5_3 906; CHECK-NEXT: @ %bb.4: @ %middle.block 907; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1 908; CHECK-NEXT: vadd.f32 s24, s22, s23 909; CHECK-NEXT: add.w r1, r2, r4, lsl #2 910; CHECK-NEXT: vadd.f32 s20, s20, s21 911; CHECK-NEXT: vadd.f32 s22, s18, s19 912; CHECK-NEXT: vadd.f32 s16, s16, s17 913; CHECK-NEXT: vadd.f32 s18, s6, s7 914; CHECK-NEXT: vadd.f32 s4, s4, s5 915; CHECK-NEXT: 
vadd.f32 s6, s14, s15 916; CHECK-NEXT: vadd.f32 s12, s12, s13 917; CHECK-NEXT: vadd.f32 s14, s10, s11 918; CHECK-NEXT: vadd.f32 s8, s8, s9 919; CHECK-NEXT: vadd.f32 s0, s0, s1 920; CHECK-NEXT: vadd.f32 s10, s2, s3 921; CHECK-NEXT: vadd.f32 s2, s20, s24 922; CHECK-NEXT: vadd.f32 s1, s16, s22 923; CHECK-NEXT: vadd.f32 s6, s12, s6 924; CHECK-NEXT: vadd.f32 s4, s4, s18 925; CHECK-NEXT: vadd.f32 s8, s8, s14 926; CHECK-NEXT: vadd.f32 s0, s0, s10 927; CHECK-NEXT: vstr s2, [r1] 928; CHECK-NEXT: add.w r1, r2, r0, lsl #2 929; CHECK-NEXT: adds r0, #6 930; CHECK-NEXT: vstr s1, [r1] 931; CHECK-NEXT: add.w r1, r2, r11, lsl #2 932; CHECK-NEXT: vstr s8, [r1] 933; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload 934; CHECK-NEXT: add.w r1, r2, r1, lsl #2 935; CHECK-NEXT: vstr s0, [r1] 936; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload 937; CHECK-NEXT: add.w r1, r2, r1, lsl #2 938; CHECK-NEXT: vstr s6, [r1] 939; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload 940; CHECK-NEXT: add.w r1, r2, r1, lsl #2 941; CHECK-NEXT: vstr s4, [r1] 942; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload 943; CHECK-NEXT: add r9, r1 944; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload 945; CHECK-NEXT: cmp r0, r1 946; CHECK-NEXT: blo.w .LBB5_2 947; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup 948; CHECK-NEXT: add sp, #32 949; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 950; CHECK-NEXT: add sp, #4 951; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 952entry: 953 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 954 %0 = load i32, i32* %NumInputs, align 4 955 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 956 %1 = load i32, i32* %NumFilters, align 4 957 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 958 %2 = load float*, float** %pDCTCoefs, align 4 959 %cmp = icmp ugt i32 %0, 1 960 tail call void @llvm.assume(i1 %cmp) 
; NOTE(review): The CHECK lines in this function are autogenerated by
; utils/update_llc_test_checks.py (see the file header). Do not hand-edit them;
; rerun the script after any IR or backend change.
; DCT_mve6: per outer-loop iteration, six tail-predicated dot products of %pIn
; against rows k2..k2+5 of the coefficient matrix %2 (accumulators %vec.phi,
; %vec.phi157..%vec.phi161, masked via llvm.get.active.lane.mask), each reduced
; with llvm.vector.reduce.fadd and stored to %pOut[k2]..%pOut[k2+5]; k2 += 6.
961 %sub = add i32 %1, -6 962 %cmp3155 = icmp ugt i32 %sub, 1 963 br i1 %cmp3155, label %for.body.preheader, label %for.cond.cleanup 964 965for.body.preheader: ; preds = %entry 966 %n.rnd.up = add i32 %0, 3 967 %n.vec = and i32 %n.rnd.up, -4 968 br label %for.body 969 970for.cond.cleanup: ; preds = %middle.block, %entry 971 ret void 972 973for.body: ; preds = %for.body.preheader, %middle.block 974 %k2.0156 = phi i32 [ %add61, %middle.block ], [ 1, %for.body.preheader ] 975 %mul4 = mul i32 %k2.0156, %0 976 %add = add nuw i32 %k2.0156, 1 977 %mul5 = mul i32 %add, %0 978 %add6 = add i32 %k2.0156, 2 979 %mul7 = mul i32 %add6, %0 980 %add8 = add i32 %k2.0156, 3 981 %mul9 = mul i32 %add8, %0 982 %add10 = add i32 %k2.0156, 4 983 %mul11 = mul i32 %add10, %0 984 %add12 = add i32 %k2.0156, 5 985 %mul13 = mul i32 %add12, %0 986 br label %vector.body 987 988vector.body: ; preds = %vector.body, %for.body 989 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] 990 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %35, %vector.body ] 991 %vec.phi157 = phi <4 x float> [ zeroinitializer, %for.body ], [ %36, %vector.body ] 992 %vec.phi158 = phi <4 x float> [ zeroinitializer, %for.body ], [ %37, %vector.body ] 993 %vec.phi159 = phi <4 x float> [ zeroinitializer, %for.body ], [ %38, %vector.body ] 994 %vec.phi160 = phi <4 x float> [ zeroinitializer, %for.body ], [ %39, %vector.body ] 995 %vec.phi161 = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ] 996 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) 997 %3 = getelementptr inbounds float, float* %pIn, i32 %index 998 %4 = bitcast float* %3 to <4 x float>* 999 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1000 %5 = add i32 %index, %mul4 1001 %6 = getelementptr inbounds float, float* %2, i32 %5 1002 %7 = bitcast float* %6 to <4 x float>* 1003 
%wide.masked.load162 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1004 %8 = fmul fast <4 x float> %wide.masked.load162, %wide.masked.load 1005 %9 = fadd fast <4 x float> %8, %vec.phi158 1006 %10 = add i32 %index, %mul5 1007 %11 = getelementptr inbounds float, float* %2, i32 %10 1008 %12 = bitcast float* %11 to <4 x float>* 1009 %wide.masked.load163 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1010 %13 = fmul fast <4 x float> %wide.masked.load163, %wide.masked.load 1011 %14 = fadd fast <4 x float> %13, %vec.phi160 1012 %15 = add i32 %index, %mul7 1013 %16 = getelementptr inbounds float, float* %2, i32 %15 1014 %17 = bitcast float* %16 to <4 x float>* 1015 %wide.masked.load164 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1016 %18 = fmul fast <4 x float> %wide.masked.load164, %wide.masked.load 1017 %19 = fadd fast <4 x float> %18, %vec.phi161 1018 %20 = add i32 %index, %mul9 1019 %21 = getelementptr inbounds float, float* %2, i32 %20 1020 %22 = bitcast float* %21 to <4 x float>* 1021 %wide.masked.load165 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1022 %23 = fmul fast <4 x float> %wide.masked.load165, %wide.masked.load 1023 %24 = fadd fast <4 x float> %23, %vec.phi159 1024 %25 = add i32 %index, %mul11 1025 %26 = getelementptr inbounds float, float* %2, i32 %25 1026 %27 = bitcast float* %26 to <4 x float>* 1027 %wide.masked.load166 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1028 %28 = fmul fast <4 x float> %wide.masked.load166, %wide.masked.load 1029 %29 = fadd fast <4 x float> %28, %vec.phi157 1030 %30 = add i32 %index, %mul13 1031 %31 = getelementptr inbounds float, float* %2, i32 
%30 1032 %32 = bitcast float* %31 to <4 x float>* 1033 %wide.masked.load167 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1034 %33 = fmul fast <4 x float> %wide.masked.load167, %wide.masked.load 1035 %34 = fadd fast <4 x float> %33, %vec.phi 1036 %35 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi 1037 %36 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi157 1038 %37 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi158 1039 %38 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi159 1040 %39 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi160 1041 %40 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi161 1042 %index.next = add i32 %index, 4 1043 %41 = icmp eq i32 %index.next, %n.vec 1044 br i1 %41, label %middle.block, label %vector.body 1045 1046middle.block: ; preds = %vector.body 1047 %42 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40) 1048 %43 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %39) 1049 %44 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %38) 1050 %45 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %37) 1051 %46 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %36) 1052 %47 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %35) 1053 %arrayidx49 = getelementptr inbounds float, float* %pOut, i32 %k2.0156 1054 store float %45, float* %arrayidx49, align 4 1055 %arrayidx51 = getelementptr inbounds float, float* %pOut, i32 %add 1056 store float %43, float* %arrayidx51, align 4 1057 %arrayidx53 = getelementptr inbounds float, float* %pOut, i32 %add6 1058 store float %42, float* %arrayidx53, align 4 1059 %arrayidx55 = getelementptr 
inbounds float, float* %pOut, i32 %add8 1060 store float %44, float* %arrayidx55, align 4 1061 %arrayidx57 = getelementptr inbounds float, float* %pOut, i32 %add10 1062 store float %46, float* %arrayidx57, align 4 1063 %arrayidx59 = getelementptr inbounds float, float* %pOut, i32 %add12 1064 store float %47, float* %arrayidx59, align 4 1065 %add61 = add i32 %k2.0156, 6 1066 %cmp3 = icmp ult i32 %add61, %sub 1067 br i1 %cmp3, label %for.body, label %for.cond.cleanup 1068} 1069 1070define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { 1071; CHECK-LABEL: DCT_mve7: 1072; CHECK: @ %bb.0: @ %entry 1073; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 1074; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 1075; CHECK-NEXT: .pad #4 1076; CHECK-NEXT: sub sp, #4 1077; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1078; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1079; CHECK-NEXT: .pad #88 1080; CHECK-NEXT: sub sp, #88 1081; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill 1082; CHECK-NEXT: ldr r1, [r0, #4] 1083; CHECK-NEXT: subs r1, #7 1084; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill 1085; CHECK-NEXT: cmp r1, #2 1086; CHECK-NEXT: blo.w .LBB6_5 1087; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1088; CHECK-NEXT: ldr r3, [r0, #8] 1089; CHECK-NEXT: ldr r1, [r0] 1090; CHECK-NEXT: adds r0, r3, #3 1091; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill 1092; CHECK-NEXT: bic r0, r0, #3 1093; CHECK-NEXT: add.w r12, r1, r3, lsl #2 1094; CHECK-NEXT: subs r1, r0, #4 1095; CHECK-NEXT: movs r0, #1 1096; CHECK-NEXT: lsls r5, r3, #2 1097; CHECK-NEXT: add.w r1, r0, r1, lsr #2 1098; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill 1099; CHECK-NEXT: rsb r1, r3, r3, lsl #3 1100; CHECK-NEXT: lsls r1, r1, #2 1101; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill 1102; CHECK-NEXT: .LBB6_2: @ %for.body 1103; CHECK-NEXT: @ =>This Loop Header: Depth=1 1104; CHECK-NEXT: @ Child Loop BB6_3 Depth 2 
; NOTE(review): CHECK lines here are autogenerated by utils/update_llc_test_checks.py
; (see file header) — regenerate with the script rather than hand-editing.
; DCT_mve7 (continues below): same structure as the other DCT_mve* tests but with 7
; tail-predicated dot-product accumulators per outer iteration; the expected codegen
; shuffles accumulators through q registers (vmov chains) and spills/reloads q values
; to the stack (vstrw.32/vldrw.u32 [sp, #...] "16-byte Spill/Reload" lines), presumably
; because 7 live accumulators plus operands exceed the available q registers — consistent
; with the spill annotations visible in the CHECK lines, though the register-allocator
; rationale itself is not stated in this file.
1105; CHECK-NEXT:    adds r1, r0, #6 1106; CHECK-NEXT:    str r1, [sp, #44] @ 4-byte Spill 1107; CHECK-NEXT:    adds r1, r0, #5 1108; CHECK-NEXT:    str r1, [sp, #40] @ 4-byte Spill 1109; CHECK-NEXT:    adds r1, r0, #4 1110; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill 1111; CHECK-NEXT:    adds r1, r0, #3 1112; CHECK-NEXT:    str r1, [sp, #32] @ 4-byte Spill 1113; CHECK-NEXT:    ldrd r3, r1, [sp, #16] @ 8-byte Folded Reload 1114; CHECK-NEXT:    vmov.i32 q2, #0x0 1115; CHECK-NEXT:    adds r4, r0, #2 1116; CHECK-NEXT:    add.w r8, r0, #1 1117; CHECK-NEXT:    dls lr, r3 1118; CHECK-NEXT:    ldr.w r9, [sp, #28] @ 4-byte Reload 1119; CHECK-NEXT:    mov r3, r12 1120; CHECK-NEXT:    vmov q4, q2 1121; CHECK-NEXT:    vmov q5, q2 1122; CHECK-NEXT:    vmov q3, q2 1123; CHECK-NEXT:    vmov q6, q2 1124; CHECK-NEXT:    vmov q1, q2 1125; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill 1126; CHECK-NEXT:  .LBB6_3: @ %vector.body 1127; CHECK-NEXT:    @ Parent Loop BB6_2 Depth=1 1128; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2 1129; CHECK-NEXT:    add.w r10, r3, r5 1130; CHECK-NEXT:    vctp.32 r1 1131; CHECK-NEXT:    vpsttt 1132; CHECK-NEXT:    vldrwt.u32 q7, [r9], #16 1133; CHECK-NEXT:    vldrwt.u32 q0, [r3], #16 1134; CHECK-NEXT:    vfmat.f32 q5, q0, q7 1135; CHECK-NEXT:    add.w r11, r10, r5 1136; CHECK-NEXT:    vpstt 1137; CHECK-NEXT:    vldrwt.u32 q0, [r10] 1138; CHECK-NEXT:    vfmat.f32 q6, q0, q7 1139; CHECK-NEXT:    vstrw.32 q6, [sp, #48] @ 16-byte Spill 1140; CHECK-NEXT:    vpstt 1141; CHECK-NEXT:    vldrwt.u32 q0, [r11] 1142; CHECK-NEXT:    vfmat.f32 q1, q0, q7 1143; CHECK-NEXT:    add.w r6, r11, r5 1144; CHECK-NEXT:    vmov q6, q5 1145; CHECK-NEXT:    vmov q5, q4 1146; CHECK-NEXT:    vmov q4, q2 1147; CHECK-NEXT:    vmov q2, q3 1148; CHECK-NEXT:    vpst 1149; CHECK-NEXT:    vldrwt.u32 q0, [r6] 1150; CHECK-NEXT:    vmov q3, q1 1151; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload 1152; CHECK-NEXT:    adds r7, r6, r5 1153; CHECK-NEXT:    vpst 1154; CHECK-NEXT:    vfmat.f32 q1, q0, q7 1155; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill 1156; CHECK-NEXT:    vmov q1, q3 1157; CHECK-NEXT:    vmov 
q3, q2 1158; CHECK-NEXT:    vmov q2, q4 1159; CHECK-NEXT:    vmov q4, q5 1160; CHECK-NEXT:    vmov q5, q6 1161; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload 1162; CHECK-NEXT:    subs r1, #4 1163; CHECK-NEXT:    adds r6, r7, r5 1164; CHECK-NEXT:    vpstt 1165; CHECK-NEXT:    vldrwt.u32 q0, [r7] 1166; CHECK-NEXT:    vfmat.f32 q3, q0, q7 1167; CHECK-NEXT:    adds r7, r6, r5 1168; CHECK-NEXT:    vpstttt 1169; CHECK-NEXT:    vldrwt.u32 q0, [r6] 1170; CHECK-NEXT:    vfmat.f32 q4, q0, q7 1171; CHECK-NEXT:    vldrwt.u32 q0, [r7] 1172; CHECK-NEXT:    vfmat.f32 q2, q0, q7 1173; CHECK-NEXT:    le lr, .LBB6_3 1174; CHECK-NEXT:  @ %bb.4: @ %middle.block 1175; CHECK-NEXT:    @ in Loop: Header=BB6_2 Depth=1 1176; CHECK-NEXT:    vadd.f32 s0, s26, s27 1177; CHECK-NEXT:    add.w r1, r2, r8, lsl #2 1178; CHECK-NEXT:    vadd.f32 s2, s24, s25 1179; CHECK-NEXT:    vadd.f32 s3, s20, s21 1180; CHECK-NEXT:    vadd.f32 s1, s22, s23 1181; CHECK-NEXT:    vadd.f32 s8, s8, s9 1182; CHECK-NEXT:    vadd.f32 s20, s10, s11 1183; CHECK-NEXT:    vadd.f32 s11, s14, s15 1184; CHECK-NEXT:    vadd.f32 s12, s12, s13 1185; CHECK-NEXT:    vadd.f32 s14, s6, s7 1186; CHECK-NEXT:    vadd.f32 s4, s4, s5 1187; CHECK-NEXT:    vadd.f32 s0, s2, s0 1188; CHECK-NEXT:    vadd.f32 s10, s18, s19 1189; CHECK-NEXT:    vadd.f32 s9, s16, s17 1190; CHECK-NEXT:    vldrw.u32 q4, [sp, #64] @ 16-byte Reload 1191; CHECK-NEXT:    vadd.f32 s2, s3, s1 1192; CHECK-NEXT:    vadd.f32 s6, s18, s19 1193; CHECK-NEXT:    vadd.f32 s5, s16, s17 1194; CHECK-NEXT:    vadd.f32 s4, s4, s14 1195; CHECK-NEXT:    vstr s0, [r1] 1196; CHECK-NEXT:    add.w r1, r2, r0, lsl #2 1197; CHECK-NEXT:    vadd.f32 s12, s12, s11 1198; CHECK-NEXT:    adds r0, #7 1199; CHECK-NEXT:    vadd.f32 s10, s9, s10 1200; CHECK-NEXT:    vstr s2, [r1] 1201; CHECK-NEXT:    add.w r1, r2, r4, lsl #2 1202; CHECK-NEXT:    vadd.f32 s8, s8, s20 1203; CHECK-NEXT:    vadd.f32 s6, s5, s6 1204; CHECK-NEXT:    vstr s4, [r1] 1205; CHECK-NEXT:    ldr r1, [sp, #32] @ 4-byte Reload 1206; CHECK-NEXT:    add.w r1, r2, r1, lsl #2 1207; CHECK-NEXT:    vstr s6, [r1] 1208; CHECK-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload 1209; 
CHECK-NEXT:    add.w r1, r2, r1, lsl #2 1210; CHECK-NEXT:    vstr s12, [r1] 1211; CHECK-NEXT:    ldr r1, [sp, #40] @ 4-byte Reload 1212; CHECK-NEXT:    add.w r1, r2, r1, lsl #2 1213; CHECK-NEXT:    vstr s10, [r1] 1214; CHECK-NEXT:    ldr r1, [sp, #44] @ 4-byte Reload 1215; CHECK-NEXT:    add.w r1, r2, r1, lsl #2 1216; CHECK-NEXT:    vstr s8, [r1] 1217; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload 1218; CHECK-NEXT:    add r12, r1 1219; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload 1220; CHECK-NEXT:    cmp r0, r1 1221; CHECK-NEXT:    blo.w .LBB6_2 1222; CHECK-NEXT:  .LBB6_5: @ %for.cond.cleanup 1223; CHECK-NEXT:    add sp, #88 1224; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1225; CHECK-NEXT:    add sp, #4 1226; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 1227entry: 1228 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 1229 %0 = load i32, i32* %NumInputs, align 4 1230 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 1231 %1 = load i32, i32* %NumFilters, align 4 1232 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0 1233 %2 = load float*, float** %pDCTCoefs, align 4 1234 %cmp = icmp ugt i32 %0, 1 1235 tail call void @llvm.assume(i1 %cmp) 1236 %sub = add i32 %1, -7 1237 %cmp3176 = icmp ugt i32 %sub, 1 1238 br i1 %cmp3176, label %for.body.preheader, label %for.cond.cleanup 1239 1240for.body.preheader: ; preds = %entry 1241 %n.rnd.up = add i32 %0, 3 1242 %n.vec = and i32 %n.rnd.up, -4 1243 br label %for.body 1244 1245for.cond.cleanup: ; preds = %middle.block, %entry 1246 ret void 1247 1248for.body: ; preds = %for.body.preheader, %middle.block 1249 %k2.0177 = phi i32 [ %add70, %middle.block ], [ 1, %for.body.preheader ] 1250 %mul4 = mul i32 %k2.0177, %0 1251 %add = add nuw i32 %k2.0177, 1 1252 %mul5 = mul i32 %add, %0 1253 %add6 = add i32 %k2.0177, 2 1254 %mul7 = mul i32 %add6, %0 1255 %add8 = add i32 
%k2.0177, 3 1256 %mul9 = mul i32 %add8, %0 1257 %add10 = add i32 %k2.0177, 4 1258 %mul11 = mul i32 %add10, %0 1259 %add12 = add i32 %k2.0177, 5 1260 %mul13 = mul i32 %add12, %0 1261 %add14 = add i32 %k2.0177, 6 1262 %mul15 = mul i32 %add14, %0 1263 br label %vector.body 1264 1265vector.body: ; preds = %vector.body, %for.body 1266 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] 1267 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ] 1268 %vec.phi178 = phi <4 x float> [ zeroinitializer, %for.body ], [ %41, %vector.body ] 1269 %vec.phi179 = phi <4 x float> [ zeroinitializer, %for.body ], [ %42, %vector.body ] 1270 %vec.phi180 = phi <4 x float> [ zeroinitializer, %for.body ], [ %43, %vector.body ] 1271 %vec.phi181 = phi <4 x float> [ zeroinitializer, %for.body ], [ %44, %vector.body ] 1272 %vec.phi182 = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ] 1273 %vec.phi183 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ] 1274 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) 1275 %3 = getelementptr inbounds float, float* %pIn, i32 %index 1276 %4 = bitcast float* %3 to <4 x float>* 1277 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1278 %5 = add i32 %index, %mul4 1279 %6 = getelementptr inbounds float, float* %2, i32 %5 1280 %7 = bitcast float* %6 to <4 x float>* 1281 %wide.masked.load184 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1282 %8 = fmul fast <4 x float> %wide.masked.load184, %wide.masked.load 1283 %9 = fadd fast <4 x float> %8, %vec.phi179 1284 %10 = add i32 %index, %mul5 1285 %11 = getelementptr inbounds float, float* %2, i32 %10 1286 %12 = bitcast float* %11 to <4 x float>* 1287 %wide.masked.load185 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* 
%12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1288 %13 = fmul fast <4 x float> %wide.masked.load185, %wide.masked.load 1289 %14 = fadd fast <4 x float> %13, %vec.phi181 1290 %15 = add i32 %index, %mul7 1291 %16 = getelementptr inbounds float, float* %2, i32 %15 1292 %17 = bitcast float* %16 to <4 x float>* 1293 %wide.masked.load186 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1294 %18 = fmul fast <4 x float> %wide.masked.load186, %wide.masked.load 1295 %19 = fadd fast <4 x float> %18, %vec.phi183 1296 %20 = add i32 %index, %mul9 1297 %21 = getelementptr inbounds float, float* %2, i32 %20 1298 %22 = bitcast float* %21 to <4 x float>* 1299 %wide.masked.load187 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1300 %23 = fmul fast <4 x float> %wide.masked.load187, %wide.masked.load 1301 %24 = fadd fast <4 x float> %23, %vec.phi182 1302 %25 = add i32 %index, %mul11 1303 %26 = getelementptr inbounds float, float* %2, i32 %25 1304 %27 = bitcast float* %26 to <4 x float>* 1305 %wide.masked.load188 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1306 %28 = fmul fast <4 x float> %wide.masked.load188, %wide.masked.load 1307 %29 = fadd fast <4 x float> %28, %vec.phi180 1308 %30 = add i32 %index, %mul13 1309 %31 = getelementptr inbounds float, float* %2, i32 %30 1310 %32 = bitcast float* %31 to <4 x float>* 1311 %wide.masked.load189 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1312 %33 = fmul fast <4 x float> %wide.masked.load189, %wide.masked.load 1313 %34 = fadd fast <4 x float> %33, %vec.phi178 1314 %35 = add i32 %index, %mul15 1315 %36 = getelementptr inbounds float, float* %2, i32 %35 1316 %37 = bitcast float* %36 to <4 x float>* 1317 %wide.masked.load190 = 
call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1318 %38 = fmul fast <4 x float> %wide.masked.load190, %wide.masked.load 1319 %39 = fadd fast <4 x float> %38, %vec.phi 1320 %40 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi 1321 %41 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi178 1322 %42 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi179 1323 %43 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi180 1324 %44 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi181 1325 %45 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi182 1326 %46 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi183 1327 %index.next = add i32 %index, 4 1328 %47 = icmp eq i32 %index.next, %n.vec 1329 br i1 %47, label %middle.block, label %vector.body 1330 1331middle.block: ; preds = %vector.body 1332 %48 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46) 1333 %49 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45) 1334 %50 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %44) 1335 %51 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %43) 1336 %52 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %42) 1337 %53 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %41) 1338 %54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40) 1339 %arrayidx56 = getelementptr inbounds float, float* %pOut, i32 %k2.0177 1340 store float %52, float* %arrayidx56, align 4 1341 %arrayidx58 = getelementptr inbounds float, float* %pOut, i32 %add 1342 store float %50, float* %arrayidx58, align 4 1343 %arrayidx60 = getelementptr inbounds float, float* 
%pOut, i32 %add6 1344 store float %48, float* %arrayidx60, align 4 1345 %arrayidx62 = getelementptr inbounds float, float* %pOut, i32 %add8 1346 store float %49, float* %arrayidx62, align 4 1347 %arrayidx64 = getelementptr inbounds float, float* %pOut, i32 %add10 1348 store float %51, float* %arrayidx64, align 4 1349 %arrayidx66 = getelementptr inbounds float, float* %pOut, i32 %add12 1350 store float %53, float* %arrayidx66, align 4 1351 %arrayidx68 = getelementptr inbounds float, float* %pOut, i32 %add14 1352 store float %54, float* %arrayidx68, align 4 1353 %add70 = add i32 %k2.0177, 7 1354 %cmp3 = icmp ult i32 %add70, %sub 1355 br i1 %cmp3, label %for.body, label %for.cond.cleanup 1356} 1357 1358define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) { 1359; CHECK-LABEL: DCT_mve8: 1360; CHECK: @ %bb.0: @ %entry 1361; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 1362; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 1363; CHECK-NEXT: .pad #4 1364; CHECK-NEXT: sub sp, #4 1365; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1366; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1367; CHECK-NEXT: .pad #104 1368; CHECK-NEXT: sub sp, #104 1369; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill 1370; CHECK-NEXT: ldr r1, [r0, #4] 1371; CHECK-NEXT: subs r1, #8 1372; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill 1373; CHECK-NEXT: cmp r1, #2 1374; CHECK-NEXT: blo.w .LBB7_5 1375; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1376; CHECK-NEXT: ldr r3, [r0, #8] 1377; CHECK-NEXT: ldr r1, [r0] 1378; CHECK-NEXT: adds r0, r3, #3 1379; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill 1380; CHECK-NEXT: bic r0, r0, #3 1381; CHECK-NEXT: add.w r9, r1, r3, lsl #2 1382; CHECK-NEXT: subs r1, r0, #4 1383; CHECK-NEXT: movs r0, #1 1384; CHECK-NEXT: lsls r5, r3, #2 1385; CHECK-NEXT: add.w r1, r0, r1, lsr #2 1386; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill 1387; CHECK-NEXT: lsls r1, r3, #5 
1388; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill 1389; CHECK-NEXT: .LBB7_2: @ %for.body 1390; CHECK-NEXT: @ =>This Loop Header: Depth=1 1391; CHECK-NEXT: @ Child Loop BB7_3 Depth 2 1392; CHECK-NEXT: adds r1, r0, #7 1393; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill 1394; CHECK-NEXT: adds r1, r0, #6 1395; CHECK-NEXT: ldrd r3, r10, [sp, #16] @ 8-byte Folded Reload 1396; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill 1397; CHECK-NEXT: adds r1, r0, #5 1398; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill 1399; CHECK-NEXT: adds r1, r0, #4 1400; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill 1401; CHECK-NEXT: dls lr, r3 1402; CHECK-NEXT: ldr.w r12, [sp, #28] @ 4-byte Reload 1403; CHECK-NEXT: vmov.i32 q3, #0x0 1404; CHECK-NEXT: adds r4, r0, #3 1405; CHECK-NEXT: add.w r8, r0, #2 1406; CHECK-NEXT: adds r1, r0, #1 1407; CHECK-NEXT: mov r3, r9 1408; CHECK-NEXT: vmov q5, q3 1409; CHECK-NEXT: vmov q6, q3 1410; CHECK-NEXT: vmov q4, q3 1411; CHECK-NEXT: vmov q7, q3 1412; CHECK-NEXT: vmov q2, q3 1413; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill 1414; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill 1415; CHECK-NEXT: .LBB7_3: @ %vector.body 1416; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1 1417; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 1418; CHECK-NEXT: add.w r11, r3, r5 1419; CHECK-NEXT: vctp.32 r10 1420; CHECK-NEXT: vpsttt 1421; CHECK-NEXT: vldrwt.u32 q0, [r12], #16 1422; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 1423; CHECK-NEXT: vfmat.f32 q6, q1, q0 1424; CHECK-NEXT: add.w r6, r11, r5 1425; CHECK-NEXT: vpstt 1426; CHECK-NEXT: vldrwt.u32 q1, [r11] 1427; CHECK-NEXT: vfmat.f32 q7, q1, q0 1428; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill 1429; CHECK-NEXT: vmov q7, q6 1430; CHECK-NEXT: vmov q6, q5 1431; CHECK-NEXT: vmov q5, q3 1432; CHECK-NEXT: vmov q3, q4 1433; CHECK-NEXT: vpst 1434; CHECK-NEXT: vldrwt.u32 q1, [r6] 1435; CHECK-NEXT: vmov q4, q2 1436; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload 1437; CHECK-NEXT: adds r7, r6, r5 1438; CHECK-NEXT: vpst 1439; 
CHECK-NEXT: vfmat.f32 q2, q1, q0 1440; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill 1441; CHECK-NEXT: vpst 1442; CHECK-NEXT: vldrwt.u32 q1, [r7] 1443; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload 1444; CHECK-NEXT: vpst 1445; CHECK-NEXT: vfmat.f32 q2, q1, q0 1446; CHECK-NEXT: adds r6, r7, r5 1447; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill 1448; CHECK-NEXT: vmov q2, q4 1449; CHECK-NEXT: vmov q4, q3 1450; CHECK-NEXT: vmov q3, q5 1451; CHECK-NEXT: vmov q5, q6 1452; CHECK-NEXT: vmov q6, q7 1453; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload 1454; CHECK-NEXT: adds r7, r6, r5 1455; CHECK-NEXT: vpstt 1456; CHECK-NEXT: vldrwt.u32 q1, [r6] 1457; CHECK-NEXT: vfmat.f32 q2, q1, q0 1458; CHECK-NEXT: sub.w r10, r10, #4 1459; CHECK-NEXT: adds r6, r7, r5 1460; CHECK-NEXT: vpstttt 1461; CHECK-NEXT: vldrwt.u32 q1, [r7] 1462; CHECK-NEXT: vfmat.f32 q4, q1, q0 1463; CHECK-NEXT: vldrwt.u32 q1, [r6] 1464; CHECK-NEXT: vfmat.f32 q5, q1, q0 1465; CHECK-NEXT: add r6, r5 1466; CHECK-NEXT: vpstt 1467; CHECK-NEXT: vldrwt.u32 q1, [r6] 1468; CHECK-NEXT: vfmat.f32 q3, q1, q0 1469; CHECK-NEXT: le lr, .LBB7_3 1470; CHECK-NEXT: @ %bb.4: @ %middle.block 1471; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1 1472; CHECK-NEXT: vadd.f32 s0, s30, s31 1473; CHECK-NEXT: add.w r1, r2, r1, lsl #2 1474; CHECK-NEXT: vadd.f32 s2, s28, s29 1475; CHECK-NEXT: vadd.f32 s12, s12, s13 1476; CHECK-NEXT: vadd.f32 s5, s14, s15 1477; CHECK-NEXT: vadd.f32 s4, s26, s27 1478; CHECK-NEXT: vadd.f32 s6, s24, s25 1479; CHECK-NEXT: vadd.f32 s14, s18, s19 1480; CHECK-NEXT: vadd.f32 s7, s16, s17 1481; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload 1482; CHECK-NEXT: vadd.f32 s8, s8, s9 1483; CHECK-NEXT: vadd.f32 s13, s10, s11 1484; CHECK-NEXT: vadd.f32 s10, s18, s19 1485; CHECK-NEXT: vadd.f32 s9, s16, s17 1486; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload 1487; CHECK-NEXT: vadd.f32 s0, s2, s0 1488; CHECK-NEXT: vadd.f32 s11, s18, s19 1489; CHECK-NEXT: vadd.f32 s15, s16, s17 1490; CHECK-NEXT: 
vadd.f32 s2, s6, s4 1491; CHECK-NEXT: vadd.f32 s6, s12, s5 1492; CHECK-NEXT: vadd.f32 s12, s7, s14 1493; CHECK-NEXT: vadd.f32 s10, s9, s10 1494; CHECK-NEXT: vstr s0, [r1] 1495; CHECK-NEXT: add.w r1, r2, r0, lsl #2 1496; CHECK-NEXT: vadd.f32 s8, s8, s13 1497; CHECK-NEXT: adds r0, #8 1498; CHECK-NEXT: vadd.f32 s14, s15, s11 1499; CHECK-NEXT: vstr s2, [r1] 1500; CHECK-NEXT: add.w r1, r2, r8, lsl #2 1501; CHECK-NEXT: vadd.f32 s1, s22, s23 1502; CHECK-NEXT: vadd.f32 s3, s20, s21 1503; CHECK-NEXT: vstr s10, [r1] 1504; CHECK-NEXT: add.w r1, r2, r4, lsl #2 1505; CHECK-NEXT: vstr s14, [r1] 1506; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload 1507; CHECK-NEXT: vadd.f32 s4, s3, s1 1508; CHECK-NEXT: add.w r1, r2, r1, lsl #2 1509; CHECK-NEXT: vstr s8, [r1] 1510; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload 1511; CHECK-NEXT: add.w r1, r2, r1, lsl #2 1512; CHECK-NEXT: vstr s12, [r1] 1513; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload 1514; CHECK-NEXT: add.w r1, r2, r1, lsl #2 1515; CHECK-NEXT: vstr s4, [r1] 1516; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload 1517; CHECK-NEXT: add.w r1, r2, r1, lsl #2 1518; CHECK-NEXT: vstr s6, [r1] 1519; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload 1520; CHECK-NEXT: add r9, r1 1521; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload 1522; CHECK-NEXT: cmp r0, r1 1523; CHECK-NEXT: blo.w .LBB7_2 1524; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup 1525; CHECK-NEXT: add sp, #104 1526; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1527; CHECK-NEXT: add sp, #4 1528; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 1529entry: 1530 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2 1531 %0 = load i32, i32* %NumInputs, align 4 1532 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1 1533 %1 = load i32, i32* %NumFilters, align 4 1534 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, 
i32 0, i32 0 1535 %2 = load float*, float** %pDCTCoefs, align 4 1536 %cmp = icmp ugt i32 %0, 1 1537 tail call void @llvm.assume(i1 %cmp) 1538 %sub = add i32 %1, -8 1539 %cmp3197 = icmp ugt i32 %sub, 1 1540 br i1 %cmp3197, label %for.body.preheader, label %for.cond.cleanup 1541 1542for.body.preheader: ; preds = %entry 1543 %n.rnd.up = add i32 %0, 3 1544 %n.vec = and i32 %n.rnd.up, -4 1545 br label %for.body 1546 1547for.cond.cleanup: ; preds = %middle.block, %entry 1548 ret void 1549 1550for.body: ; preds = %for.body.preheader, %middle.block 1551 %k2.0198 = phi i32 [ %add79, %middle.block ], [ 1, %for.body.preheader ] 1552 %mul4 = mul i32 %k2.0198, %0 1553 %add = add nuw nsw i32 %k2.0198, 1 1554 %mul5 = mul i32 %add, %0 1555 %add6 = add nuw nsw i32 %k2.0198, 2 1556 %mul7 = mul i32 %add6, %0 1557 %add8 = add nuw nsw i32 %k2.0198, 3 1558 %mul9 = mul i32 %add8, %0 1559 %add10 = add nuw nsw i32 %k2.0198, 4 1560 %mul11 = mul i32 %add10, %0 1561 %add12 = add nuw nsw i32 %k2.0198, 5 1562 %mul13 = mul i32 %add12, %0 1563 %add14 = add nuw nsw i32 %k2.0198, 6 1564 %mul15 = mul i32 %add14, %0 1565 %add16 = add i32 %k2.0198, 7 1566 %mul17 = mul i32 %add16, %0 1567 br label %vector.body 1568 1569vector.body: ; preds = %vector.body, %for.body 1570 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] 1571 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ] 1572 %vec.phi199 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ] 1573 %vec.phi200 = phi <4 x float> [ zeroinitializer, %for.body ], [ %47, %vector.body ] 1574 %vec.phi201 = phi <4 x float> [ zeroinitializer, %for.body ], [ %48, %vector.body ] 1575 %vec.phi202 = phi <4 x float> [ zeroinitializer, %for.body ], [ %49, %vector.body ] 1576 %vec.phi203 = phi <4 x float> [ zeroinitializer, %for.body ], [ %50, %vector.body ] 1577 %vec.phi204 = phi <4 x float> [ zeroinitializer, %for.body ], [ %51, %vector.body ] 1578 %vec.phi205 = phi <4 x float> [ zeroinitializer, 
%for.body ], [ %52, %vector.body ] 1579 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0) 1580 %3 = getelementptr inbounds float, float* %pIn, i32 %index 1581 %4 = bitcast float* %3 to <4 x float>* 1582 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1583 %5 = add i32 %index, %mul4 1584 %6 = getelementptr inbounds float, float* %2, i32 %5 1585 %7 = bitcast float* %6 to <4 x float>* 1586 %wide.masked.load206 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1587 %8 = fmul fast <4 x float> %wide.masked.load206, %wide.masked.load 1588 %9 = fadd fast <4 x float> %8, %vec.phi200 1589 %10 = add i32 %index, %mul5 1590 %11 = getelementptr inbounds float, float* %2, i32 %10 1591 %12 = bitcast float* %11 to <4 x float>* 1592 %wide.masked.load207 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1593 %13 = fmul fast <4 x float> %wide.masked.load207, %wide.masked.load 1594 %14 = fadd fast <4 x float> %13, %vec.phi202 1595 %15 = add i32 %index, %mul7 1596 %16 = getelementptr inbounds float, float* %2, i32 %15 1597 %17 = bitcast float* %16 to <4 x float>* 1598 %wide.masked.load208 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1599 %18 = fmul fast <4 x float> %wide.masked.load208, %wide.masked.load 1600 %19 = fadd fast <4 x float> %18, %vec.phi204 1601 %20 = add i32 %index, %mul9 1602 %21 = getelementptr inbounds float, float* %2, i32 %20 1603 %22 = bitcast float* %21 to <4 x float>* 1604 %wide.masked.load209 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1605 %23 = fmul fast <4 x float> %wide.masked.load209, %wide.masked.load 1606 %24 = fadd fast <4 x 
float> %23, %vec.phi205 1607 %25 = add i32 %index, %mul11 1608 %26 = getelementptr inbounds float, float* %2, i32 %25 1609 %27 = bitcast float* %26 to <4 x float>* 1610 %wide.masked.load210 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1611 %28 = fmul fast <4 x float> %wide.masked.load210, %wide.masked.load 1612 %29 = fadd fast <4 x float> %28, %vec.phi203 1613 %30 = add i32 %index, %mul13 1614 %31 = getelementptr inbounds float, float* %2, i32 %30 1615 %32 = bitcast float* %31 to <4 x float>* 1616 %wide.masked.load211 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1617 %33 = fmul fast <4 x float> %wide.masked.load211, %wide.masked.load 1618 %34 = fadd fast <4 x float> %33, %vec.phi201 1619 %35 = add i32 %index, %mul15 1620 %36 = getelementptr inbounds float, float* %2, i32 %35 1621 %37 = bitcast float* %36 to <4 x float>* 1622 %wide.masked.load212 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1623 %38 = fmul fast <4 x float> %wide.masked.load212, %wide.masked.load 1624 %39 = fadd fast <4 x float> %38, %vec.phi199 1625 %40 = add i32 %index, %mul17 1626 %41 = getelementptr inbounds float, float* %2, i32 %40 1627 %42 = bitcast float* %41 to <4 x float>* 1628 %wide.masked.load213 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %42, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1629 %43 = fmul fast <4 x float> %wide.masked.load213, %wide.masked.load 1630 %44 = fadd fast <4 x float> %43, %vec.phi 1631 %45 = select <4 x i1> %active.lane.mask, <4 x float> %44, <4 x float> %vec.phi 1632 %46 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi199 1633 %47 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi200 1634 %48 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x 
float> %vec.phi201 1635 %49 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi202 1636 %50 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi203 1637 %51 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi204 1638 %52 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi205 1639 %index.next = add i32 %index, 4 1640 %53 = icmp eq i32 %index.next, %n.vec 1641 br i1 %53, label %middle.block, label %vector.body 1642 1643middle.block: ; preds = %vector.body 1644 %54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %52) 1645 %55 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %51) 1646 %56 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %50) 1647 %57 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %49) 1648 %58 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %48) 1649 %59 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %47) 1650 %60 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46) 1651 %61 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45) 1652 %arrayidx63 = getelementptr inbounds float, float* %pOut, i32 %k2.0198 1653 store float %59, float* %arrayidx63, align 4 1654 %arrayidx65 = getelementptr inbounds float, float* %pOut, i32 %add 1655 store float %57, float* %arrayidx65, align 4 1656 %arrayidx67 = getelementptr inbounds float, float* %pOut, i32 %add6 1657 store float %55, float* %arrayidx67, align 4 1658 %arrayidx69 = getelementptr inbounds float, float* %pOut, i32 %add8 1659 store float %54, float* %arrayidx69, align 4 1660 %arrayidx71 = getelementptr inbounds float, float* %pOut, i32 %add10 1661 store float %56, float* %arrayidx71, align 4 1662 %arrayidx73 = getelementptr inbounds float, float* %pOut, i32 
%add12 1663 store float %58, float* %arrayidx73, align 4 1664 %arrayidx75 = getelementptr inbounds float, float* %pOut, i32 %add14 1665 store float %60, float* %arrayidx75, align 4 1666 %arrayidx77 = getelementptr inbounds float, float* %pOut, i32 %add16 1667 store float %61, float* %arrayidx77, align 4 1668 %add79 = add i32 %k2.0198, 8 1669 %cmp3 = icmp ult i32 %add79, %sub 1670 br i1 %cmp3, label %for.body, label %for.cond.cleanup 1671} 1672 1673declare void @llvm.assume(i1 noundef) 1674declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) 1675declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) 1676declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) 1677