; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

%struct.DCT_InstanceTypeDef = type { float*, i32, i32 }

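; Each DCT_mveN function below computes dot products of pIn against N
; consecutive rows of the coefficient matrix per outer-loop iteration, so the
; tail-folded inner loop carries N vector accumulators. With a single
; accumulator the inner loop can be lowered to a fully tail-predicated
; dlstp/letp hardware loop.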
define void @DCT_mve1(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT:    ldr r3, [r0, #4]
; CHECK-NEXT:    sub.w r12, r3, #1
; CHECK-NEXT:    cmp.w r12, #2
; CHECK-NEXT:    blo .LBB0_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r5, [r0, #8]
; CHECK-NEXT:    ldr r3, [r0]
; CHECK-NEXT:    add.w r4, r3, r5, lsl #2
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    lsl.w r9, r5, #2
; CHECK-NEXT:  .LBB0_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB0_3 Depth 2
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    dlstp.32 lr, r5
; CHECK-NEXT:    mov r7, r1
; CHECK-NEXT:    mov r3, r4
; CHECK-NEXT:    mov r6, r5
; CHECK-NEXT:  .LBB0_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vldrw.u32 q1, [r7], #16
; CHECK-NEXT:    vldrw.u32 q2, [r3], #16
; CHECK-NEXT:    vfma.f32 q0, q2, q1
; CHECK-NEXT:    letp lr, .LBB0_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT:    vadd.f32 s4, s2, s3
; CHECK-NEXT:    add.w r3, r2, r0, lsl #2
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    adds r0, #1
; CHECK-NEXT:    add r4, r9
; CHECK-NEXT:    cmp r0, r12
; CHECK-NEXT:    vadd.f32 s0, s0, s4
; CHECK-NEXT:    vstr s0, [r3]
; CHECK-NEXT:    bne .LBB0_2
; CHECK-NEXT:  .LBB0_5: @ %for.cond.cleanup
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -1
  %cmp350 = icmp ugt i32 %sub, 1
  br i1 %cmp350, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
  %k2.051 = phi i32 [ %add16, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.051, %0
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %10, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load53 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load53, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi
  %10 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %12 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %10)
  %arrayidx14 = getelementptr inbounds float, float* %pOut, i32 %k2.051
  store float %12, float* %arrayidx14, align 4
  %add16 = add nuw i32 %k2.051, 1
  %exitcond52.not = icmp eq i32 %add16, %sub
  br i1 %exitcond52.not, label %for.cond.cleanup, label %for.body
}

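; Two accumulators: the loop is no longer a plain dlstp/letp tail-predicated
; loop; instead it uses dls/le with an explicit vctp.32 lane mask and vpst
; blocks guarding the predicated loads and vfmat.f32 accumulations.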
define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    subs r1, #2
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo .LBB1_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr.w r12, [r0, #8]
; CHECK-NEXT:    movs r4, #1
; CHECK-NEXT:    ldr r3, [r0]
; CHECK-NEXT:    add.w r0, r12, #3
; CHECK-NEXT:    bic r0, r0, #3
; CHECK-NEXT:    add.w r5, r3, r12, lsl #2
; CHECK-NEXT:    subs r0, #4
; CHECK-NEXT:    add.w r7, r3, r12, lsl #3
; CHECK-NEXT:    lsl.w r9, r12, #3
; CHECK-NEXT:    add.w r8, r4, r0, lsr #2
; CHECK-NEXT:  .LBB1_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB1_3 Depth 2
; CHECK-NEXT:    dls lr, r8
; CHECK-NEXT:    ldr r6, [sp] @ 4-byte Reload
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w r11, r4, #1
; CHECK-NEXT:    mov r3, r5
; CHECK-NEXT:    mov r0, r7
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    mov r10, r12
; CHECK-NEXT:  .LBB1_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB1_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vctp.32 r10
; CHECK-NEXT:    sub.w r10, r10, #4
; CHECK-NEXT:    vpstttt
; CHECK-NEXT:    vldrwt.u32 q2, [r6], #16
; CHECK-NEXT:    vldrwt.u32 q3, [r3], #16
; CHECK-NEXT:    vfmat.f32 q1, q3, q2
; CHECK-NEXT:    vldrwt.u32 q3, [r0], #16
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q0, q3, q2
; CHECK-NEXT:    le lr, .LBB1_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT:    vadd.f32 s8, s2, s3
; CHECK-NEXT:    add.w r0, r2, r11, lsl #2
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    add r5, r9
; CHECK-NEXT:    vadd.f32 s2, s6, s7
; CHECK-NEXT:    add r7, r9
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    vadd.f32 s0, s0, s8
; CHECK-NEXT:    vadd.f32 s2, s4, s2
; CHECK-NEXT:    vstr s0, [r0]
; CHECK-NEXT:    add.w r0, r2, r4, lsl #2
; CHECK-NEXT:    adds r4, #2
; CHECK-NEXT:    cmp r4, r1
; CHECK-NEXT:    vstr s2, [r0]
; CHECK-NEXT:    blo .LBB1_2
; CHECK-NEXT:  .LBB1_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -2
  %cmp371 = icmp ugt i32 %sub, 1
  br i1 %cmp371, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
  %k2.072 = phi i32 [ %add25, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.072, %0
  %add = add nuw i32 %k2.072, 1
  %mul5 = mul i32 %add, %0
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %15, %vector.body ]
  %vec.phi73 = phi <4 x float> [ zeroinitializer, %for.body ], [ %16, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load74 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load74, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi73
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load75 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load75, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi
  %15 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi
  %16 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi73
  %index.next = add i32 %index, 4
  %17 = icmp eq i32 %index.next, %n.vec
  br i1 %17, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %16)
  %19 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %15)
  %arrayidx21 = getelementptr inbounds float, float* %pOut, i32 %k2.072
  store float %18, float* %arrayidx21, align 4
  %arrayidx23 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %19, float* %arrayidx23, align 4
  %add25 = add i32 %k2.072, 2
  %cmp3 = icmp ult i32 %add25, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

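; Three accumulators (q0-q2), still predicated with vctp/vpst; d8/d9 are now
; saved and restored because the loop body uses q4 as a scratch register for
; the coefficient loads.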
define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    subs r1, #3
; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo .LBB2_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r7, [r0, #8]
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    ldr r3, [r0]
; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    add.w r0, r7, r7, lsl #1
; CHECK-NEXT:    add.w r12, r3, r7, lsl #2
; CHECK-NEXT:    add.w r1, r3, r7, lsl #3
; CHECK-NEXT:    add.w r8, r3, r0, lsl #2
; CHECK-NEXT:    adds r3, r7, #3
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    lsls r7, r0, #2
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    add.w r3, r5, r3, lsr #2
; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
; CHECK-NEXT:  .LBB2_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB2_3 Depth 2
; CHECK-NEXT:    ldrd r0, r10, [sp] @ 8-byte Folded Reload
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w r9, r5, #2
; CHECK-NEXT:    add.w r11, r5, #1
; CHECK-NEXT:    dls lr, r0
; CHECK-NEXT:    ldr r6, [sp, #12] @ 4-byte Reload
; CHECK-NEXT:    mov r3, r12
; CHECK-NEXT:    mov r0, r1
; CHECK-NEXT:    mov r4, r8
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:  .LBB2_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB2_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vctp.32 r10
; CHECK-NEXT:    sub.w r10, r10, #4
; CHECK-NEXT:    vpstttt
; CHECK-NEXT:    vldrwt.u32 q3, [r6], #16
; CHECK-NEXT:    vldrwt.u32 q4, [r3], #16
; CHECK-NEXT:    vfmat.f32 q1, q4, q3
; CHECK-NEXT:    vldrwt.u32 q4, [r0], #16
; CHECK-NEXT:    vpsttt
; CHECK-NEXT:    vfmat.f32 q2, q4, q3
; CHECK-NEXT:    vldrwt.u32 q4, [r4], #16
; CHECK-NEXT:    vfmat.f32 q0, q4, q3
; CHECK-NEXT:    le lr, .LBB2_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB2_2 Depth=1
; CHECK-NEXT:    vadd.f32 s12, s10, s11
; CHECK-NEXT:    add.w r0, r2, r11, lsl #2
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    add r12, r7
; CHECK-NEXT:    vadd.f32 s10, s6, s7
; CHECK-NEXT:    add r1, r7
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    add r8, r7
; CHECK-NEXT:    vadd.f32 s6, s2, s3
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s2, s8, s12
; CHECK-NEXT:    vadd.f32 s4, s4, s10
; CHECK-NEXT:    vadd.f32 s0, s0, s6
; CHECK-NEXT:    vstr s2, [r0]
; CHECK-NEXT:    add.w r0, r2, r5, lsl #2
; CHECK-NEXT:    adds r5, #3
; CHECK-NEXT:    vstr s4, [r0]
; CHECK-NEXT:    add.w r0, r2, r9, lsl #2
; CHECK-NEXT:    vstr s0, [r0]
; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT:    cmp r5, r0
; CHECK-NEXT:    blo .LBB2_2
; CHECK-NEXT:  .LBB2_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -3
  %cmp392 = icmp ugt i32 %sub, 1
  br i1 %cmp392, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
  %k2.093 = phi i32 [ %add34, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.093, %0
  %add = add nuw i32 %k2.093, 1
  %mul5 = mul i32 %add, %0
  %add6 = add i32 %k2.093, 2
  %mul7 = mul i32 %add6, %0
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %20, %vector.body ]
  %vec.phi94 = phi <4 x float> [ zeroinitializer, %for.body ], [ %21, %vector.body ]
  %vec.phi95 = phi <4 x float> [ zeroinitializer, %for.body ], [ %22, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load96 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load96, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi95
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load97 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load97, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi94
  %15 = add i32 %index, %mul7
  %16 = getelementptr inbounds float, float* %2, i32 %15
  %17 = bitcast float* %16 to <4 x float>*
  %wide.masked.load98 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %18 = fmul fast <4 x float> %wide.masked.load98, %wide.masked.load
  %19 = fadd fast <4 x float> %18, %vec.phi
  %20 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi
  %21 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi94
  %22 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi95
  %index.next = add i32 %index, 4
  %23 = icmp eq i32 %index.next, %n.vec
  br i1 %23, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %24 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %22)
  %25 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %21)
  %26 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %20)
  %arrayidx28 = getelementptr inbounds float, float* %pOut, i32 %k2.093
  store float %24, float* %arrayidx28, align 4
  %arrayidx30 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %25, float* %arrayidx30, align 4
  %arrayidx32 = getelementptr inbounds float, float* %pOut, i32 %add6
  store float %26, float* %arrayidx32, align 4
  %add34 = add i32 %k2.093, 3
  %cmp3 = icmp ult i32 %add34, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

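; Four accumulators (q0-q3), with q4/q5 as load scratch; register pressure is
; high enough that the per-row output indices (k2+1..k2+3) are spilled to the
; stack across the inner loop.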
define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    subs r1, #4
; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo.w .LBB3_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r3, [r0, #8]
; CHECK-NEXT:    movs r6, #1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    add.w r0, r3, r3, lsl #1
; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
; CHECK-NEXT:    add.w r12, r1, r3, lsl #3
; CHECK-NEXT:    add.w r10, r1, r3, lsl #4
; CHECK-NEXT:    add.w r9, r1, r0, lsl #2
; CHECK-NEXT:    adds r0, r3, #3
; CHECK-NEXT:    bic r0, r0, #3
; CHECK-NEXT:    lsls r7, r3, #4
; CHECK-NEXT:    subs r0, #4
; CHECK-NEXT:    add.w r0, r6, r0, lsr #2
; CHECK-NEXT:    strd r0, r3, [sp, #4] @ 8-byte Folded Spill
; CHECK-NEXT:  .LBB3_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB3_3 Depth 2
; CHECK-NEXT:    adds r0, r6, #3
; CHECK-NEXT:    str r0, [sp, #28] @ 4-byte Spill
; CHECK-NEXT:    adds r0, r6, #2
; CHECK-NEXT:    str r0, [sp, #24] @ 4-byte Spill
; CHECK-NEXT:    adds r0, r6, #1
; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT:    ldrd r0, r11, [sp, #4] @ 8-byte Folded Reload
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    mov r3, r8
; CHECK-NEXT:    mov r5, r9
; CHECK-NEXT:    dls lr, r0
; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    mov r4, r10
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:  .LBB3_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB3_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vctp.32 r11
; CHECK-NEXT:    sub.w r11, r11, #4
; CHECK-NEXT:    vpstttt
; CHECK-NEXT:    vldrwt.u32 q4, [r1], #16
; CHECK-NEXT:    vldrwt.u32 q5, [r0], #16
; CHECK-NEXT:    vfmat.f32 q3, q5, q4
; CHECK-NEXT:    vldrwt.u32 q5, [r3], #16
; CHECK-NEXT:    vpstttt
; CHECK-NEXT:    vfmat.f32 q2, q5, q4
; CHECK-NEXT:    vldrwt.u32 q5, [r5], #16
; CHECK-NEXT:    vfmat.f32 q1, q5, q4
; CHECK-NEXT:    vldrwt.u32 q5, [r4], #16
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q0, q5, q4
; CHECK-NEXT:    le lr, .LBB3_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT:    vadd.f32 s16, s14, s15
; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT:    vadd.f32 s12, s12, s13
; CHECK-NEXT:    add r8, r7
; CHECK-NEXT:    vadd.f32 s14, s10, s11
; CHECK-NEXT:    add r12, r7
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    add.w r0, r2, r0, lsl #2
; CHECK-NEXT:    vadd.f32 s10, s6, s7
; CHECK-NEXT:    add r9, r7
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    add r10, r7
; CHECK-NEXT:    vadd.f32 s6, s2, s3
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s2, s12, s16
; CHECK-NEXT:    vadd.f32 s8, s8, s14
; CHECK-NEXT:    vadd.f32 s4, s4, s10
; CHECK-NEXT:    vadd.f32 s0, s0, s6
; CHECK-NEXT:    vstr s2, [r0]
; CHECK-NEXT:    add.w r0, r2, r6, lsl #2
; CHECK-NEXT:    adds r6, #4
; CHECK-NEXT:    vstr s8, [r0]
; CHECK-NEXT:    ldr r0, [sp, #24] @ 4-byte Reload
; CHECK-NEXT:    add.w r0, r2, r0, lsl #2
; CHECK-NEXT:    vstr s4, [r0]
; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
; CHECK-NEXT:    add.w r0, r2, r0, lsl #2
; CHECK-NEXT:    vstr s0, [r0]
; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT:    cmp r6, r0
; CHECK-NEXT:    blo .LBB3_2
; CHECK-NEXT:  .LBB3_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -4
  %cmp3113 = icmp ugt i32 %sub, 1
  br i1 %cmp3113, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
  %k2.0114 = phi i32 [ %add43, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0114, %0
  %add = add nuw nsw i32 %k2.0114, 1
  %mul5 = mul i32 %add, %0
  %add6 = add nuw nsw i32 %k2.0114, 2
  %mul7 = mul i32 %add6, %0
  %add8 = add i32 %k2.0114, 3
  %mul9 = mul i32 %add8, %0
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %25, %vector.body ]
  %vec.phi115 = phi <4 x float> [ zeroinitializer, %for.body ], [ %26, %vector.body ]
  %vec.phi116 = phi <4 x float> [ zeroinitializer, %for.body ], [ %27, %vector.body ]
  %vec.phi117 = phi <4 x float> [ zeroinitializer, %for.body ], [ %28, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load118 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load118, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi116
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load119 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load119, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi117
  %15 = add i32 %index, %mul7
  %16 = getelementptr inbounds float, float* %2, i32 %15
  %17 = bitcast float* %16 to <4 x float>*
  %wide.masked.load120 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %18 = fmul fast <4 x float> %wide.masked.load120, %wide.masked.load
  %19 = fadd fast <4 x float> %18, %vec.phi115
  %20 = add i32 %index, %mul9
  %21 = getelementptr inbounds float, float* %2, i32 %20
  %22 = bitcast float* %21 to <4 x float>*
  %wide.masked.load121 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %23 = fmul fast <4 x float> %wide.masked.load121, %wide.masked.load
  %24 = fadd fast <4 x float> %23, %vec.phi
  %25 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi
  %26 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi115
  %27 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi116
  %28 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi117
  %index.next = add i32 %index, 4
  %29 = icmp eq i32 %index.next, %n.vec
  br i1 %29, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %30 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %28)
  %31 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %27)
  %32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %26)
  %33 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %25)
  %arrayidx35 = getelementptr inbounds float, float* %pOut, i32 %k2.0114
  store float %31, float* %arrayidx35, align 4
  %arrayidx37 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %30, float* %arrayidx37, align 4
  %arrayidx39 = getelementptr inbounds float, float* %pOut, i32 %add6
  store float %32, float* %arrayidx39, align 4
  %arrayidx41 = getelementptr inbounds float, float* %pOut, i32 %add8
  store float %33, float* %arrayidx41, align 4
  %add43 = add i32 %k2.0114, 4
  %cmp3 = icmp ult i32 %add43, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

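; Five accumulators (q0-q4): there are no longer enough GPRs for one
; post-incremented pointer per coefficient row, so the row pointers are formed
; inside the inner loop by repeatedly adding the row stride (r5).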
define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve5:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    subs r1, #5
; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo.w .LBB4_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r3, [r0, #8]
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    adds r0, r3, #3
; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:    bic r0, r0, #3
; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
; CHECK-NEXT:    subs r1, r0, #4
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    lsls r5, r3, #2
; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT:    add.w r1, r3, r3, lsl #2
; CHECK-NEXT:    lsls r1, r1, #2
; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:  .LBB4_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB4_3 Depth 2
; CHECK-NEXT:    adds r1, r0, #4
; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #3
; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT:    ldrd r1, r11, [sp, #8] @ 8-byte Folded Reload
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:    add.w r10, r0, #2
; CHECK-NEXT:    adds r7, r0, #1
; CHECK-NEXT:    dls lr, r1
; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT:    mov r3, r8
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vmov q3, q1
; CHECK-NEXT:    vmov q2, q1
; CHECK-NEXT:    vmov q4, q1
; CHECK-NEXT:  .LBB4_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB4_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    add.w r9, r3, r5
; CHECK-NEXT:    vctp.32 r11
; CHECK-NEXT:    vpsttt
; CHECK-NEXT:    vldrwt.u32 q5, [r1], #16
; CHECK-NEXT:    vldrwt.u32 q6, [r3], #16
; CHECK-NEXT:    vfmat.f32 q3, q6, q5
; CHECK-NEXT:    add.w r12, r9, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q6, [r9]
; CHECK-NEXT:    vfmat.f32 q4, q6, q5
; CHECK-NEXT:    sub.w r11, r11, #4
; CHECK-NEXT:    add.w r4, r12, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q6, [r12]
; CHECK-NEXT:    vfmat.f32 q2, q6, q5
; CHECK-NEXT:    adds r6, r4, r5
; CHECK-NEXT:    vpstttt
; CHECK-NEXT:    vldrwt.u32 q6, [r4]
; CHECK-NEXT:    vfmat.f32 q0, q6, q5
; CHECK-NEXT:    vldrwt.u32 q6, [r6]
; CHECK-NEXT:    vfmat.f32 q1, q6, q5
; CHECK-NEXT:    le lr, .LBB4_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB4_2 Depth=1
; CHECK-NEXT:    vadd.f32 s20, s18, s19
; CHECK-NEXT:    add.w r1, r2, r7, lsl #2
; CHECK-NEXT:    vadd.f32 s16, s16, s17
; CHECK-NEXT:    vadd.f32 s18, s14, s15
; CHECK-NEXT:    vadd.f32 s12, s12, s13
; CHECK-NEXT:    vadd.f32 s14, s6, s7
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    vadd.f32 s6, s10, s11
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    vadd.f32 s10, s2, s3
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s2, s16, s20
; CHECK-NEXT:    vadd.f32 s12, s12, s18
; CHECK-NEXT:    vadd.f32 s4, s4, s14
; CHECK-NEXT:    vadd.f32 s6, s8, s6
; CHECK-NEXT:    vadd.f32 s0, s0, s10
; CHECK-NEXT:    vstr s2, [r1]
; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
; CHECK-NEXT:    adds r0, #5
; CHECK-NEXT:    vstr s12, [r1]
; CHECK-NEXT:    add.w r1, r2, r10, lsl #2
; CHECK-NEXT:    vstr s6, [r1]
; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s0, [r1]
; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s4, [r1]
; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    add r8, r1
; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    blo.w .LBB4_2
; CHECK-NEXT:  .LBB4_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -5
  %cmp3134 = icmp ugt i32 %sub, 1
  br i1 %cmp3134, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
  %k2.0135 = phi i32 [ %add52, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0135, %0
  %add = add nuw i32 %k2.0135, 1
  %mul5 = mul i32 %add, %0
  %add6 = add i32 %k2.0135, 2
  %mul7 = mul i32 %add6, %0
  %add8 = add i32 %k2.0135, 3
  %mul9 = mul i32 %add8, %0
  %add10 = add i32 %k2.0135, 4
  %mul11 = mul i32 %add10, %0
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %30, %vector.body ]
  %vec.phi136 = phi <4 x float> [ zeroinitializer, %for.body ], [ %31, %vector.body ]
  %vec.phi137 = phi <4 x float> [ zeroinitializer, %for.body ], [ %32, %vector.body ]
  %vec.phi138 = phi <4 x float> [ zeroinitializer, %for.body ], [ %33, %vector.body ]
  %vec.phi139 = phi <4 x float> [ zeroinitializer, %for.body ], [ %34, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load140 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load140, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi137
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load141 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load141, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi139
  %15 = add i32 %index, %mul7
  %16 = getelementptr inbounds float, float* %2, i32 %15
  %17 = bitcast float* %16 to <4 x float>*
  %wide.masked.load142 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %18 = fmul fast <4 x float> %wide.masked.load142, %wide.masked.load
  %19 = fadd fast <4 x float> %18, %vec.phi138
  %20 = add i32 %index, %mul9
  %21 = getelementptr inbounds float, float* %2, i32 %20
  %22 = bitcast float* %21 to <4 x float>*
  %wide.masked.load143 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %23 = fmul fast <4 x float> %wide.masked.load143, %wide.masked.load
  %24 = fadd fast <4 x float> %23, %vec.phi136
  %25 = add i32 %index, %mul11
  %26 = getelementptr inbounds float, float* %2, i32 %25
  %27 = bitcast float* %26 to <4 x float>*
  %wide.masked.load144 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %28 = fmul fast <4 x float> %wide.masked.load144, %wide.masked.load
  %29 = fadd fast <4 x float> %28, %vec.phi
  %30 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi
  %31 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi136
  %32 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi137
  %33 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi138
  %34 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi139
  %index.next = add i32 %index, 4
  %35 = icmp eq i32 %index.next, %n.vec
  br i1 %35, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %36 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %34)
  %37 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %33)
  %38 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %32)
  %39 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %31)
  %40 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %30)
  %arrayidx42 = getelementptr inbounds float, float* %pOut, i32 %k2.0135
  store float %38, float* %arrayidx42, align 4
  %arrayidx44 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %36, float* %arrayidx44, align 4
  %arrayidx46 = getelementptr inbounds float, float* %pOut, i32 %add6
  store float %37, float* %arrayidx46, align 4
  %arrayidx48 = getelementptr inbounds float, float* %pOut, i32 %add8
  store float %39, float* %arrayidx48, align 4
  %arrayidx50 = getelementptr inbounds float, float* %pOut, i32 %add10
  store float %40, float* %arrayidx50, align 4
  %add52 = add i32 %k2.0135, 5
  %cmp3 = icmp ult i32 %add52, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

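; Six accumulators fill q0-q5, leaving only q6/q7 for the loads, and the full
; d8-d15 callee-saved range is pushed.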
define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve6:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    subs r1, #6
; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo.w .LBB5_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r3, [r0, #8]
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    adds r0, r3, #3
; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT:    bic r0, r0, #3
; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
; CHECK-NEXT:    subs r1, r0, #4
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    lsls r5, r3, #2
; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    add.w r1, r3, r3, lsl #1
; CHECK-NEXT:    lsls r1, r1, #3
; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
; CHECK-NEXT:  .LBB5_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB5_3 Depth 2
; CHECK-NEXT:    adds r1, r0, #5
; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #4
; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #3
; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT:    ldrd r1, r8, [sp, #4] @ 8-byte Folded Reload
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:    add.w r11, r0, #2
; CHECK-NEXT:    adds r4, r0, #1
; CHECK-NEXT:    dls lr, r1
; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT:    mov r3, r9
; CHECK-NEXT:    vmov q3, q1
; CHECK-NEXT:    vmov q4, q1
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vmov q5, q1
; CHECK-NEXT:    vmov q2, q1
; CHECK-NEXT:  .LBB5_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB5_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    add.w r12, r3, r5
; CHECK-NEXT:    vctp.32 r8
; CHECK-NEXT:    vpsttt
; CHECK-NEXT:    vldrwt.u32 q6, [r1], #16
; CHECK-NEXT:    vldrwt.u32 q7, [r3], #16
; CHECK-NEXT:    vfmat.f32 q4, q7, q6
; CHECK-NEXT:    add.w r10, r12, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q7, [r12]
; CHECK-NEXT:    vfmat.f32 q5, q7, q6
; CHECK-NEXT:    add.w r6, r10, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q7, [r10]
; CHECK-NEXT:    vfmat.f32 q2, q7, q6
; CHECK-NEXT:    sub.w r8, r8, #4
; CHECK-NEXT:    adds r7, r6, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q7, [r6]
; CHECK-NEXT:    vfmat.f32 q0, q7, q6
; CHECK-NEXT:    adds r6, r7, r5
; CHECK-NEXT:    vpstttt
; CHECK-NEXT:    vldrwt.u32 q7, [r7]
; CHECK-NEXT:    vfmat.f32 q3, q7, q6
; CHECK-NEXT:    vldrwt.u32 q7, [r6]
; CHECK-NEXT:    vfmat.f32 q1, q7, q6
; CHECK-NEXT:    le lr, .LBB5_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB5_2 Depth=1
; CHECK-NEXT:    vadd.f32 s24, s22, s23
; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
; CHECK-NEXT:    vadd.f32 s20, s20, s21
; CHECK-NEXT:    vadd.f32 s22, s18, s19
; CHECK-NEXT:    vadd.f32 s16, s16, s17
; CHECK-NEXT:    vadd.f32 s18, s6, s7
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    vadd.f32 s6, s14, s15
; CHECK-NEXT:    vadd.f32 s12, s12, s13
; CHECK-NEXT:    vadd.f32 s14, s10, s11
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s10, s2, s3
; CHECK-NEXT:    vadd.f32 s2, s20, s24
; CHECK-NEXT:    vadd.f32 s1, s16, s22
; CHECK-NEXT:    vadd.f32 s6, s12, s6
; CHECK-NEXT:    vadd.f32 s4, s4, s18
; CHECK-NEXT:    vadd.f32 s8, s8, s14
; CHECK-NEXT:    vadd.f32 s0, s0, s10
; CHECK-NEXT:    vstr s2, [r1]
; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
; CHECK-NEXT:    adds r0, #6
; CHECK-NEXT:    vstr s1, [r1]
; CHECK-NEXT:    add.w r1, r2, r11, lsl #2
; CHECK-NEXT:    vstr s8, [r1]
; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s0, [r1]
; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s6, [r1]
; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s4, [r1]
; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT:    add r9, r1
; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    blo.w .LBB5_2
; CHECK-NEXT:  .LBB5_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -6
  %cmp3155 = icmp ugt i32 %sub, 1
  br i1 %cmp3155, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
  %k2.0156 = phi i32 [ %add61, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0156, %0
  %add = add nuw i32 %k2.0156, 1
  %mul5 = mul i32 %add, %0
  %add6 = add i32 %k2.0156, 2
  %mul7 = mul i32 %add6, %0
  %add8 = add i32 %k2.0156, 3
  %mul9 = mul i32 %add8, %0
  %add10 = add i32 %k2.0156, 4
  %mul11 = mul i32 %add10, %0
  %add12 = add i32 %k2.0156, 5
  %mul13 = mul i32 %add12, %0
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %35, %vector.body ]
  %vec.phi157 = phi <4 x float> [ zeroinitializer, %for.body ], [ %36, %vector.body ]
  %vec.phi158 = phi <4 x float> [ zeroinitializer, %for.body ], [ %37, %vector.body ]
  %vec.phi159 = phi <4 x float> [ zeroinitializer, %for.body ], [ %38, %vector.body ]
  %vec.phi160 = phi <4 x float> [ zeroinitializer, %for.body ], [ %39, %vector.body ]
  %vec.phi161 = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load162 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load162, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi158
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load163 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load163, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi160
  %15 = add i32 %index, %mul7
  %16 = getelementptr inbounds float, float* %2, i32 %15
  %17 = bitcast float* %16 to <4 x float>*
  %wide.masked.load164 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %18 = fmul fast <4 x float> %wide.masked.load164, %wide.masked.load
  %19 = fadd fast <4 x float> %18, %vec.phi161
  %20 = add i32 %index, %mul9
  %21 = getelementptr inbounds float, float* %2, i32 %20
  %22 = bitcast float* %21 to <4 x float>*
  %wide.masked.load165 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %23 = fmul fast <4 x float> %wide.masked.load165, %wide.masked.load
  %24 = fadd fast <4 x float> %23, %vec.phi159
  %25 = add i32 %index, %mul11
  %26 = getelementptr inbounds float, float* %2, i32 %25
  %27 = bitcast float* %26 to <4 x float>*
  %wide.masked.load166 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %28 = fmul fast <4 x float> %wide.masked.load166, %wide.masked.load
  %29 = fadd fast <4 x float> %28, %vec.phi157
  %30 = add i32 %index, %mul13
  %31 = getelementptr inbounds float, float* %2, i32 %30
  %32 = bitcast float* %31 to <4 x float>*
  %wide.masked.load167 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %33 = fmul fast <4 x float> %wide.masked.load167, %wide.masked.load
  %34 = fadd fast <4 x float> %33, %vec.phi
  %35 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi
  %36 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi157
  %37 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi158
  %38 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi159
  %39 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi160
  %40 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi161
  %index.next = add i32 %index, 4
  %41 = icmp eq i32 %index.next, %n.vec
  br i1 %41, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %42 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40)
  %43 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %39)
  %44 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %38)
  %45 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %37)
  %46 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %36)
  %47 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %35)
  %arrayidx49 = getelementptr inbounds float, float* %pOut, i32 %k2.0156
  store float %45, float* %arrayidx49, align 4
  %arrayidx51 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %43, float* %arrayidx51, align 4
  %arrayidx53 = getelementptr inbounds float, float* %pOut, i32 %add6
  store float %42, float* %arrayidx53, align 4
  %arrayidx55 = getelementptr inbounds float, float* %pOut, i32 %add8
  store float %44, float* %arrayidx55, align 4
  %arrayidx57 = getelementptr inbounds float, float* %pOut, i32 %add10
  store float %46, float* %arrayidx57, align 4
  %arrayidx59 = getelementptr inbounds float, float* %pOut, i32 %add12
  store float %47, float* %arrayidx59, align 4
  %add61 = add i32 %k2.0156, 6
  %cmp3 = icmp ult i32 %add61, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

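; Seven accumulators no longer fit in the MVE register file alongside the two
; load registers, so accumulators are spilled and reloaded around the vfmat
; operations (the 16-byte vstrw/vldrw stack spills and vmov shuffles below).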
define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve7:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #88
; CHECK-NEXT:    sub sp, #88
; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    subs r1, #7
; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo.w .LBB6_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r3, [r0, #8]
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    adds r0, r3, #3
; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
; CHECK-NEXT:    bic r0, r0, #3
; CHECK-NEXT:    add.w r12, r1, r3, lsl #2
; CHECK-NEXT:    subs r1, r0, #4
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    lsls r5, r3, #2
; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT:    rsb r1, r3, r3, lsl #3
; CHECK-NEXT:    lsls r1, r1, #2
; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:  .LBB6_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB6_3 Depth 2
; CHECK-NEXT:    adds r1, r0, #6
; CHECK-NEXT:    str r1, [sp, #44] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #5
; CHECK-NEXT:    str r1, [sp, #40] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #4
; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #3
; CHECK-NEXT:    str r1, [sp, #32] @ 4-byte Spill
; CHECK-NEXT:    ldrd r3, r1, [sp, #16] @ 8-byte Folded Reload
; CHECK-NEXT:    vmov.i32 q2, #0x0
; CHECK-NEXT:    adds r4, r0, #2
; CHECK-NEXT:    add.w r8, r0, #1
; CHECK-NEXT:    dls lr, r3
; CHECK-NEXT:    ldr.w r9, [sp, #28] @ 4-byte Reload
; CHECK-NEXT:    mov r3, r12
; CHECK-NEXT:    vmov q4, q2
; CHECK-NEXT:    vmov q5, q2
; CHECK-NEXT:    vmov q3, q2
; CHECK-NEXT:    vmov q6, q2
; CHECK-NEXT:    vmov q1, q2
; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill
; CHECK-NEXT:  .LBB6_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB6_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    add.w r10, r3, r5
; CHECK-NEXT:    vctp.32 r1
; CHECK-NEXT:    vpsttt
; CHECK-NEXT:    vldrwt.u32 q7, [r9], #16
; CHECK-NEXT:    vldrwt.u32 q0, [r3], #16
; CHECK-NEXT:    vfmat.f32 q5, q0, q7
; CHECK-NEXT:    add.w r11, r10, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q0, [r10]
; CHECK-NEXT:    vfmat.f32 q6, q0, q7
; CHECK-NEXT:    vstrw.32 q6, [sp, #48] @ 16-byte Spill
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q0, [r11]
; CHECK-NEXT:    vfmat.f32 q1, q0, q7
; CHECK-NEXT:    add.w r6, r11, r5
; CHECK-NEXT:    vmov q6, q5
; CHECK-NEXT:    vmov q5, q4
; CHECK-NEXT:    vmov q4, q2
; CHECK-NEXT:    vmov q2, q3
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q0, [r6]
; CHECK-NEXT:    vmov q3, q1
; CHECK-NEXT:    vldrw.u32 q1, [sp, #64] @ 16-byte Reload
; CHECK-NEXT:    adds r7, r6, r5
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q1, q0, q7
; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
; CHECK-NEXT:    vmov q1, q3
; CHECK-NEXT:    vmov q3, q2
; CHECK-NEXT:    vmov q2, q4
; CHECK-NEXT:    vmov q4, q5
; CHECK-NEXT:    vmov q5, q6
; CHECK-NEXT:    vldrw.u32 q6, [sp, #48] @ 16-byte Reload
; CHECK-NEXT:    subs r1, #4
; CHECK-NEXT:    adds r6, r7, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q0, [r7]
; CHECK-NEXT:    vfmat.f32 q3, q0, q7
; CHECK-NEXT:    adds r7, r6, r5
; CHECK-NEXT:    vpstttt
; CHECK-NEXT:    vldrwt.u32 q0, [r6]
; CHECK-NEXT:    vfmat.f32 q4, q0, q7
; CHECK-NEXT:    vldrwt.u32 q0, [r7]
; CHECK-NEXT:    vfmat.f32 q2, q0, q7
; CHECK-NEXT:    le lr, .LBB6_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB6_2 Depth=1
; CHECK-NEXT:    vadd.f32 s0, s26, s27
; CHECK-NEXT:    add.w r1, r2, r8, lsl #2
; CHECK-NEXT:    vadd.f32 s2, s24, s25
; CHECK-NEXT:    vadd.f32 s3, s20, s21
; CHECK-NEXT:    vadd.f32 s1, s22, s23
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    vadd.f32 s20, s10, s11
; CHECK-NEXT:    vadd.f32 s11, s14, s15
; CHECK-NEXT:    vadd.f32 s12, s12, s13
; CHECK-NEXT:    vadd.f32 s14, s6, s7
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    vadd.f32 s0, s2, s0
; CHECK-NEXT:    vadd.f32 s10, s18, s19
; CHECK-NEXT:    vadd.f32 s9, s16, s17
; CHECK-NEXT:    vldrw.u32 q4, [sp, #64] @ 16-byte Reload
; CHECK-NEXT:    vadd.f32 s2, s3, s1
; CHECK-NEXT:    vadd.f32 s6, s18, s19
; CHECK-NEXT:    vadd.f32 s5, s16, s17
; CHECK-NEXT:    vadd.f32 s4, s4, s14
; CHECK-NEXT:    vstr s0, [r1]
; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
; CHECK-NEXT:    vadd.f32 s12, s12, s11
; CHECK-NEXT:    adds r0, #7
; CHECK-NEXT:    vadd.f32 s10, s9, s10
; CHECK-NEXT:    vstr s2, [r1]
; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
; CHECK-NEXT:    vadd.f32 s8, s8, s20
; CHECK-NEXT:    vadd.f32 s6, s5, s6
; CHECK-NEXT:    vstr s4, [r1]
; CHECK-NEXT:    ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s6, [r1]
; CHECK-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s12, [r1]
; CHECK-NEXT:    ldr r1, [sp, #40] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s10, [r1]
; CHECK-NEXT:    ldr r1, [sp, #44] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s8, [r1]
; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT:    add r12, r1
; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    blo.w .LBB6_2
; CHECK-NEXT:  .LBB6_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #88
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -7
  %cmp3176 = icmp ugt i32 %sub, 1
  br i1 %cmp3176, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
  %k2.0177 = phi i32 [ %add70, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0177, %0
  %add = add nuw i32 %k2.0177, 1
  %mul5 = mul i32 %add, %0
  %add6 = add i32 %k2.0177, 2
  %mul7 = mul i32 %add6, %0
  %add8 = add i32 %k2.0177, 3
  %mul9 = mul i32 %add8, %0
  %add10 = add i32 %k2.0177, 4
  %mul11 = mul i32 %add10, %0
  %add12 = add i32 %k2.0177, 5
  %mul13 = mul i32 %add12, %0
  %add14 = add i32 %k2.0177, 6
  %mul15 = mul i32 %add14, %0
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ]
  %vec.phi178 = phi <4 x float> [ zeroinitializer, %for.body ], [ %41, %vector.body ]
  %vec.phi179 = phi <4 x float> [ zeroinitializer, %for.body ], [ %42, %vector.body ]
  %vec.phi180 = phi <4 x float> [ zeroinitializer, %for.body ], [ %43, %vector.body ]
  %vec.phi181 = phi <4 x float> [ zeroinitializer, %for.body ], [ %44, %vector.body ]
  %vec.phi182 = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ]
  %vec.phi183 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load184 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load184, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi179
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load185 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load185, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi181
  %15 = add i32 %index, %mul7
  %16 = getelementptr inbounds float, float* %2, i32 %15
  %17 = bitcast float* %16 to <4 x float>*
  %wide.masked.load186 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %18 = fmul fast <4 x float> %wide.masked.load186, %wide.masked.load
  %19 = fadd fast <4 x float> %18, %vec.phi183
  %20 = add i32 %index, %mul9
  %21 = getelementptr inbounds float, float* %2, i32 %20
  %22 = bitcast float* %21 to <4 x float>*
  %wide.masked.load187 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %23 = fmul fast <4 x float> %wide.masked.load187, %wide.masked.load
  %24 = fadd fast <4 x float> %23, %vec.phi182
  %25 = add i32 %index, %mul11
  %26 = getelementptr inbounds float, float* %2, i32 %25
  %27 = bitcast float* %26 to <4 x float>*
  %wide.masked.load188 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %28 = fmul fast <4 x float> %wide.masked.load188, %wide.masked.load
  %29 = fadd fast <4 x float> %28, %vec.phi180
  %30 = add i32 %index, %mul13
  %31 = getelementptr inbounds float, float* %2, i32 %30
  %32 = bitcast float* %31 to <4 x float>*
  %wide.masked.load189 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %33 = fmul fast <4 x float> %wide.masked.load189, %wide.masked.load
  %34 = fadd fast <4 x float> %33, %vec.phi178
  %35 = add i32 %index, %mul15
  %36 = getelementptr inbounds float, float* %2, i32 %35
  %37 = bitcast float* %36 to <4 x float>*
  %wide.masked.load190 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %38 = fmul fast <4 x float> %wide.masked.load190, %wide.masked.load
  %39 = fadd fast <4 x float> %38, %vec.phi
  %40 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi
  %41 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi178
  %42 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi179
  %43 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi180
  %44 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi181
  %45 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi182
  %46 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi183
  %index.next = add i32 %index, 4
  %47 = icmp eq i32 %index.next, %n.vec
  br i1 %47, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %48 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46)
  %49 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45)
  %50 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %44)
  %51 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %43)
  %52 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %42)
  %53 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %41)
  %54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40)
  %arrayidx56 = getelementptr inbounds float, float* %pOut, i32 %k2.0177
  store float %52, float* %arrayidx56, align 4
  %arrayidx58 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %50, float* %arrayidx58, align 4
  %arrayidx60 = getelementptr inbounds float, float* %pOut, i32 %add6
  store float %48, float* %arrayidx60, align 4
  %arrayidx62 = getelementptr inbounds float, float* %pOut, i32 %add8
  store float %49, float* %arrayidx62, align 4
  %arrayidx64 = getelementptr inbounds float, float* %pOut, i32 %add10
  store float %51, float* %arrayidx64, align 4
  %arrayidx66 = getelementptr inbounds float, float* %pOut, i32 %add12
  store float %53, float* %arrayidx66, align 4
  %arrayidx68 = getelementptr inbounds float, float* %pOut, i32 %add14
  store float %54, float* %arrayidx68, align 4
  %add70 = add i32 %k2.0177, 7
  %cmp3 = icmp ult i32 %add70, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

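; DCT_mve8: the same pattern widened to eight concurrent reductions, the
; highest register pressure of these tests. Two of the eight accumulators
; never get a Q register at all and live in stack slots for the whole inner
; loop (the vldrw.u32/vfmat.f32/vstrw.32 round trips through [sp, #64] and
; [sp, #80] in the CHECK lines below). The outer loop steps k2 by 8 and the
; per-filter body matches the sketch above with 7 replaced by 8; this
; reading of the IR is a hedged summary, not text from the original file.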
define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #104
; CHECK-NEXT:    sub sp, #104
; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    subs r1, #8
; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo.w .LBB7_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r3, [r0, #8]
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    adds r0, r3, #3
; CHECK-NEXT:    str r3, [sp, #20] @ 4-byte Spill
; CHECK-NEXT:    bic r0, r0, #3
; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
; CHECK-NEXT:    subs r1, r0, #4
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    lsls r5, r3, #2
; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT:    lsls r1, r3, #5
; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:  .LBB7_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB7_3 Depth 2
; CHECK-NEXT:    adds r1, r0, #7
; CHECK-NEXT:    str r1, [sp, #44] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #6
; CHECK-NEXT:    ldrd r3, r10, [sp, #16] @ 8-byte Folded Reload
; CHECK-NEXT:    str r1, [sp, #40] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #5
; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #4
; CHECK-NEXT:    str r1, [sp, #32] @ 4-byte Spill
; CHECK-NEXT:    dls lr, r3
; CHECK-NEXT:    ldr.w r12, [sp, #28] @ 4-byte Reload
; CHECK-NEXT:    vmov.i32 q3, #0x0
; CHECK-NEXT:    adds r4, r0, #3
; CHECK-NEXT:    add.w r8, r0, #2
; CHECK-NEXT:    adds r1, r0, #1
; CHECK-NEXT:    mov r3, r9
; CHECK-NEXT:    vmov q5, q3
; CHECK-NEXT:    vmov q6, q3
; CHECK-NEXT:    vmov q4, q3
; CHECK-NEXT:    vmov q7, q3
; CHECK-NEXT:    vmov q2, q3
; CHECK-NEXT:    vstrw.32 q3, [sp, #64] @ 16-byte Spill
; CHECK-NEXT:    vstrw.32 q3, [sp, #80] @ 16-byte Spill
; CHECK-NEXT:  .LBB7_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB7_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    add.w r11, r3, r5
; CHECK-NEXT:    vctp.32 r10
; CHECK-NEXT:    vpsttt
; CHECK-NEXT:    vldrwt.u32 q0, [r12], #16
; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
; CHECK-NEXT:    vfmat.f32 q6, q1, q0
; CHECK-NEXT:    add.w r6, r11, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q1, [r11]
; CHECK-NEXT:    vfmat.f32 q7, q1, q0
; CHECK-NEXT:    vstrw.32 q7, [sp, #48] @ 16-byte Spill
; CHECK-NEXT:    vmov q7, q6
; CHECK-NEXT:    vmov q6, q5
; CHECK-NEXT:    vmov q5, q3
; CHECK-NEXT:    vmov q3, q4
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q1, [r6]
; CHECK-NEXT:    vmov q4, q2
; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
; CHECK-NEXT:    adds r7, r6, r5
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q2, q1, q0
; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q1, [r7]
; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q2, q1, q0
; CHECK-NEXT:    adds r6, r7, r5
; CHECK-NEXT:    vstrw.32 q2, [sp, #80] @ 16-byte Spill
; CHECK-NEXT:    vmov q2, q4
; CHECK-NEXT:    vmov q4, q3
; CHECK-NEXT:    vmov q3, q5
; CHECK-NEXT:    vmov q5, q6
; CHECK-NEXT:    vmov q6, q7
; CHECK-NEXT:    vldrw.u32 q7, [sp, #48] @ 16-byte Reload
; CHECK-NEXT:    adds r7, r6, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q1, [r6]
; CHECK-NEXT:    vfmat.f32 q2, q1, q0
; CHECK-NEXT:    sub.w r10, r10, #4
; CHECK-NEXT:    adds r6, r7, r5
; CHECK-NEXT:    vpstttt
; CHECK-NEXT:    vldrwt.u32 q1, [r7]
; CHECK-NEXT:    vfmat.f32 q4, q1, q0
; CHECK-NEXT:    vldrwt.u32 q1, [r6]
; CHECK-NEXT:    vfmat.f32 q5, q1, q0
; CHECK-NEXT:    add r6, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q1, [r6]
; CHECK-NEXT:    vfmat.f32 q3, q1, q0
; CHECK-NEXT:    le lr, .LBB7_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB7_2 Depth=1
; CHECK-NEXT:    vadd.f32 s0, s30, s31
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vadd.f32 s2, s28, s29
; CHECK-NEXT:    vadd.f32 s12, s12, s13
; CHECK-NEXT:    vadd.f32 s5, s14, s15
; CHECK-NEXT:    vadd.f32 s4, s26, s27
; CHECK-NEXT:    vadd.f32 s6, s24, s25
; CHECK-NEXT:    vadd.f32 s14, s18, s19
; CHECK-NEXT:    vadd.f32 s7, s16, s17
; CHECK-NEXT:    vldrw.u32 q4, [sp, #64] @ 16-byte Reload
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    vadd.f32 s13, s10, s11
; CHECK-NEXT:    vadd.f32 s10, s18, s19
; CHECK-NEXT:    vadd.f32 s9, s16, s17
; CHECK-NEXT:    vldrw.u32 q4, [sp, #80] @ 16-byte Reload
; CHECK-NEXT:    vadd.f32 s0, s2, s0
; CHECK-NEXT:    vadd.f32 s11, s18, s19
; CHECK-NEXT:    vadd.f32 s15, s16, s17
; CHECK-NEXT:    vadd.f32 s2, s6, s4
; CHECK-NEXT:    vadd.f32 s6, s12, s5
; CHECK-NEXT:    vadd.f32 s12, s7, s14
; CHECK-NEXT:    vadd.f32 s10, s9, s10
; CHECK-NEXT:    vstr s0, [r1]
; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
; CHECK-NEXT:    vadd.f32 s8, s8, s13
; CHECK-NEXT:    adds r0, #8
; CHECK-NEXT:    vadd.f32 s14, s15, s11
; CHECK-NEXT:    vstr s2, [r1]
; CHECK-NEXT:    add.w r1, r2, r8, lsl #2
; CHECK-NEXT:    vadd.f32 s1, s22, s23
; CHECK-NEXT:    vadd.f32 s3, s20, s21
; CHECK-NEXT:    vstr s10, [r1]
; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
; CHECK-NEXT:    vstr s14, [r1]
; CHECK-NEXT:    ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT:    vadd.f32 s4, s3, s1
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s8, [r1]
; CHECK-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s12, [r1]
; CHECK-NEXT:    ldr r1, [sp, #40] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s4, [r1]
; CHECK-NEXT:    ldr r1, [sp, #44] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s6, [r1]
; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT:    add r9, r1
; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    blo.w .LBB7_2
; CHECK-NEXT:  .LBB7_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #104
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
  %0 = load i32, i32* %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
  %1 = load i32, i32* %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
  %2 = load float*, float** %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %0, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %1, -8
  %cmp3197 = icmp ugt i32 %sub, 1
  br i1 %cmp3197, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %0, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
  %k2.0198 = phi i32 [ %add79, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0198, %0
  %add = add nuw nsw i32 %k2.0198, 1
  %mul5 = mul i32 %add, %0
  %add6 = add nuw nsw i32 %k2.0198, 2
  %mul7 = mul i32 %add6, %0
  %add8 = add nuw nsw i32 %k2.0198, 3
  %mul9 = mul i32 %add8, %0
  %add10 = add nuw nsw i32 %k2.0198, 4
  %mul11 = mul i32 %add10, %0
  %add12 = add nuw nsw i32 %k2.0198, 5
  %mul13 = mul i32 %add12, %0
  %add14 = add nuw nsw i32 %k2.0198, 6
  %mul15 = mul i32 %add14, %0
  %add16 = add i32 %k2.0198, 7
  %mul17 = mul i32 %add16, %0
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ]
  %vec.phi199 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ]
  %vec.phi200 = phi <4 x float> [ zeroinitializer, %for.body ], [ %47, %vector.body ]
  %vec.phi201 = phi <4 x float> [ zeroinitializer, %for.body ], [ %48, %vector.body ]
  %vec.phi202 = phi <4 x float> [ zeroinitializer, %for.body ], [ %49, %vector.body ]
  %vec.phi203 = phi <4 x float> [ zeroinitializer, %for.body ], [ %50, %vector.body ]
  %vec.phi204 = phi <4 x float> [ zeroinitializer, %for.body ], [ %51, %vector.body ]
  %vec.phi205 = phi <4 x float> [ zeroinitializer, %for.body ], [ %52, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
  %3 = getelementptr inbounds float, float* %pIn, i32 %index
  %4 = bitcast float* %3 to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %5 = add i32 %index, %mul4
  %6 = getelementptr inbounds float, float* %2, i32 %5
  %7 = bitcast float* %6 to <4 x float>*
  %wide.masked.load206 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %8 = fmul fast <4 x float> %wide.masked.load206, %wide.masked.load
  %9 = fadd fast <4 x float> %8, %vec.phi200
  %10 = add i32 %index, %mul5
  %11 = getelementptr inbounds float, float* %2, i32 %10
  %12 = bitcast float* %11 to <4 x float>*
  %wide.masked.load207 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %13 = fmul fast <4 x float> %wide.masked.load207, %wide.masked.load
  %14 = fadd fast <4 x float> %13, %vec.phi202
  %15 = add i32 %index, %mul7
  %16 = getelementptr inbounds float, float* %2, i32 %15
  %17 = bitcast float* %16 to <4 x float>*
  %wide.masked.load208 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %18 = fmul fast <4 x float> %wide.masked.load208, %wide.masked.load
  %19 = fadd fast <4 x float> %18, %vec.phi204
  %20 = add i32 %index, %mul9
  %21 = getelementptr inbounds float, float* %2, i32 %20
  %22 = bitcast float* %21 to <4 x float>*
  %wide.masked.load209 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %23 = fmul fast <4 x float> %wide.masked.load209, %wide.masked.load
  %24 = fadd fast <4 x float> %23, %vec.phi205
  %25 = add i32 %index, %mul11
  %26 = getelementptr inbounds float, float* %2, i32 %25
  %27 = bitcast float* %26 to <4 x float>*
  %wide.masked.load210 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %28 = fmul fast <4 x float> %wide.masked.load210, %wide.masked.load
  %29 = fadd fast <4 x float> %28, %vec.phi203
  %30 = add i32 %index, %mul13
  %31 = getelementptr inbounds float, float* %2, i32 %30
  %32 = bitcast float* %31 to <4 x float>*
  %wide.masked.load211 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %33 = fmul fast <4 x float> %wide.masked.load211, %wide.masked.load
  %34 = fadd fast <4 x float> %33, %vec.phi201
  %35 = add i32 %index, %mul15
  %36 = getelementptr inbounds float, float* %2, i32 %35
  %37 = bitcast float* %36 to <4 x float>*
  %wide.masked.load212 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %38 = fmul fast <4 x float> %wide.masked.load212, %wide.masked.load
  %39 = fadd fast <4 x float> %38, %vec.phi199
  %40 = add i32 %index, %mul17
  %41 = getelementptr inbounds float, float* %2, i32 %40
  %42 = bitcast float* %41 to <4 x float>*
  %wide.masked.load213 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %42, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %43 = fmul fast <4 x float> %wide.masked.load213, %wide.masked.load
  %44 = fadd fast <4 x float> %43, %vec.phi
  %45 = select <4 x i1> %active.lane.mask, <4 x float> %44, <4 x float> %vec.phi
  %46 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi199
  %47 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi200
  %48 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi201
  %49 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi202
  %50 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi203
  %51 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi204
  %52 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi205
  %index.next = add i32 %index, 4
  %53 = icmp eq i32 %index.next, %n.vec
  br i1 %53, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %52)
  %55 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %51)
  %56 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %50)
  %57 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %49)
  %58 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %48)
  %59 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %47)
  %60 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46)
  %61 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45)
  %arrayidx63 = getelementptr inbounds float, float* %pOut, i32 %k2.0198
  store float %59, float* %arrayidx63, align 4
  %arrayidx65 = getelementptr inbounds float, float* %pOut, i32 %add
  store float %57, float* %arrayidx65, align 4
  %arrayidx67 = getelementptr inbounds float, float* %pOut, i32 %add6
  store float %55, float* %arrayidx67, align 4
  %arrayidx69 = getelementptr inbounds float, float* %pOut, i32 %add8
  store float %54, float* %arrayidx69, align 4
  %arrayidx71 = getelementptr inbounds float, float* %pOut, i32 %add10
  store float %56, float* %arrayidx71, align 4
  %arrayidx73 = getelementptr inbounds float, float* %pOut, i32 %add12
  store float %58, float* %arrayidx73, align 4
  %arrayidx75 = getelementptr inbounds float, float* %pOut, i32 %add14
  store float %60, float* %arrayidx75, align 4
  %arrayidx77 = getelementptr inbounds float, float* %pOut, i32 %add16
  store float %61, float* %arrayidx77, align 4
  %add79 = add i32 %k2.0198, 8
  %cmp3 = icmp ult i32 %add79, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

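; Intrinsic declarations for the tests above. As a reminder of their
; standard LLVM semantics: llvm.get.active.lane.mask(%base, %n) returns a
; <4 x i1> whose lane i is true iff %base + i < %n (the IR-level form of
; MVE's vctp.32); llvm.masked.load reads only the lanes enabled by the mask
; and yields the passthru (undef here) in the rest; and a fast-math
; llvm.vector.reduce.fadd with a 0.0 start value is an unordered horizontal
; add, lowered above to trees of pairwise vadd.f32.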
declare void @llvm.assume(i1 noundef)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)