1// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert ROW_TILE >= 1
7$assert ACCUMULATORS >= 1
8$VMULADDQ_LANE_F32 = "vfmaq_lane_f32" if FMA else "vmlaq_lane_f32"
9#include <assert.h>
10
11#include <arm_neon.h>
12
13#include <xnnpack/dwconv.h>
14#include <xnnpack/math.h>
15
16
17void xnn_f32_dwconv2d_chw_ukernel_5x5p2__${"neonfma" if FMA else "neon"}_${ROW_TILE}x4${"_acc%d" % ACCUMULATORS if ACCUMULATORS > 1 else ""}(
18    size_t input_height,
19    size_t input_width,
20    const float* input,
21    const float* weights,
22    const float* zero,
23    float* output,
24    uint32_t padding_top,
25    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
26{
27  assert(input_height != 0);
28  assert(input_width != 0);
29  assert(input_width % sizeof(float) == 0);
30  assert(padding_top == 2);
31
32  const uint32x4_t vmask = vld1q_u32(params->neon.mask);
33  const float32x4_t vmax = vld1q_dup_f32(&params->neon.max);
34  const float32x4_t vmin = vld1q_dup_f32(&params->neon.min);
35
36  const float32x4_t vw0123 = vld1q_f32(weights);
37  const float32x4_t vw4567 = vld1q_f32(weights + 4);
38  const float32x4_t vw89AB = vld1q_f32(weights + 8);
39  const float32x4_t vwCDEF = vld1q_f32(weights + 12);
40  const float32x4_t vwGHIJ = vld1q_f32(weights + 16);
41  const float32x4_t vwKLMN = vld1q_f32(weights + 20);
42  const float32x2_t vwOP = vld1_f32(weights + 24);
43
44  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
45
46  const float* i0 = zero;
47  const float* i1 = zero;
48  const float* i2 = input;
49  $for M in range(3, 4 + ROW_TILE):
50    const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_width);
51
52  float* o0 = output;
53  $for M in range(1, ROW_TILE):
54    float* o${M} = (float*) ((uintptr_t) o${M-1} + input_width);
55
56  size_t output_height = input_height;
57  do {
58    $for M in range(2, 3 + ROW_TILE):
59      if XNN_UNPREDICTABLE(output_height < ${M}) {
60        i${M+1} = zero;
61        $if M <= ROW_TILE:
62          o${M-1} = o${M-2};
63      }
64
65    $for M in range(4 + ROW_TILE):
66      float32x4_t vi${M}x0123 = vmovq_n_f32(0.0f);
67
68    $for M in range(4 + ROW_TILE):
69      float32x4_t vi${M}x4567 = vld1q_f32(i${M}); i${M} += 4;
70
71    size_t w = input_width;
72    for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
73      $for M in range(ROW_TILE):
74        float32x4_t vo${M}p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
75
76      $for M in range(4 + ROW_TILE):
77        const float32x4_t vi${M}x89AB = vld1q_f32(i${M}); i${M} += 4;
78
79      $for M in range(ROW_TILE):
80        $if ACCUMULATORS > 1:
81          float32x4_t vo${M}p1 = vmulq_lane_f32(vi${M}x4567, vget_high_f32(vw0123), 1);
82        $else:
83          vo${M}p0 = ${VMULADDQ_LANE_F32}(vo${M}p0, vi${M}x4567, vget_high_f32(vw0123), 1);
84
85      $for M in range(ROW_TILE):
86        $if ACCUMULATORS > 2:
87          float32x4_t vo${M}p2 = vmulq_lane_f32(vi${M+1}x4567, vget_low_f32(vw89AB), 0);
88        $else:
89          vo${M}p0 = ${VMULADDQ_LANE_F32}(vo${M}p0, vi${M+1}x4567, vget_low_f32(vw89AB), 0);
90
91      $for M in range(ROW_TILE):
92        $if ACCUMULATORS > 3:
93          float32x4_t vo${M}p3 = vmulq_lane_f32(vi${M+2}x4567, vget_low_f32(vwCDEF), 1);
94        $else:
95          vo${M}p${4 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${4 % ACCUMULATORS}, vi${M+2}x4567, vget_low_f32(vwCDEF), 1);
96
97      $for M in range(ROW_TILE):
98        $if ACCUMULATORS > 4:
99          float32x4_t vo${M}p4 = vmulq_lane_f32(vi${M+3}x4567, vget_high_f32(vwGHIJ), 0);
100        $else:
101          vo${M}p${5 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${5 % ACCUMULATORS}, vi${M+3}x4567, vget_high_f32(vwGHIJ), 0);
102
103      $for M in range(ROW_TILE):
104        $if ACCUMULATORS > 6:
105          float32x4_t vo${M}p5 = vmulq_lane_f32(vi${M+4}x4567, vget_high_f32(vwKLMN), 1);
106        $else:
107          vo${M}p${6 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${6 % ACCUMULATORS}, vi${M+4}x4567, vget_high_f32(vwKLMN), 1);
108
109      $for M in range(4 + ROW_TILE):
110        const float32x4_t vi${M}x3456 = vextq_f32(vi${M}x0123, vi${M}x4567, 3);
111
112      $for M in range(ROW_TILE):
113        vo${M}p${7 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${7 % ACCUMULATORS}, vi${M}x3456, vget_high_f32(vw0123), 0);
114
115      $for M in range(ROW_TILE):
116        vo${M}p${8 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${8 % ACCUMULATORS}, vi${M+1}x3456, vget_high_f32(vw4567), 1);
117
118      $for M in range(ROW_TILE):
119        vo${M}p${9 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${9 % ACCUMULATORS}, vi${M+2}x3456, vget_low_f32(vwCDEF), 0);
120
121      $for M in range(ROW_TILE):
122        vo${M}p${10 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${10 % ACCUMULATORS}, vi${M+3}x3456, vget_low_f32(vwGHIJ), 1);
123
124      $for M in range(ROW_TILE):
125        vo${M}p${11 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${11 % ACCUMULATORS}, vi${M+4}x3456, vget_high_f32(vwKLMN), 0);
126
127      $for M in range(4 + ROW_TILE):
128        const float32x4_t vi${M}x2345 = vextq_f32(vi${M}x0123, vi${M}x4567, 2);
129        vi${M}x0123 = vi${M}x4567;
130
131      $for M in range(ROW_TILE):
132        vo${M}p${12 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${12 % ACCUMULATORS}, vi${M}x2345, vget_low_f32(vw0123), 1);
133
134      $for M in range(ROW_TILE):
135        vo${M}p${13 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${13 % ACCUMULATORS}, vi${M+1}x2345, vget_high_f32(vw4567), 0);
136
137      $for M in range(ROW_TILE):
138        vo${M}p${14 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${14 % ACCUMULATORS}, vi${M+2}x2345, vget_high_f32(vw89AB), 1);
139
140      $for M in range(ROW_TILE):
141        vo${M}p${15 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${15 % ACCUMULATORS}, vi${M+3}x2345, vget_low_f32(vwGHIJ), 0);
142
143      $for M in range(ROW_TILE):
144        vo${M}p${16 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${16 % ACCUMULATORS}, vi${M+4}x2345, vget_low_f32(vwKLMN), 1);
145
146      $for M in range(4 + ROW_TILE):
147        const float32x4_t vi${M}x5678 = vextq_f32(vi${M}x4567, vi${M}x89AB, 1);
148
149      $for M in range(ROW_TILE):
150        vo${M}p${17 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${17 % ACCUMULATORS}, vi${M}x5678, vget_low_f32(vw4567), 0);
151
152      $for M in range(ROW_TILE):
153        vo${M}p${18 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${18 % ACCUMULATORS}, vi${M+1}x5678, vget_low_f32(vw89AB), 1);
154
155      $for M in range(ROW_TILE):
156        vo${M}p${19 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${19 % ACCUMULATORS}, vi${M+2}x5678, vget_high_f32(vwCDEF), 0);
157
158      $for M in range(ROW_TILE):
159        vo${M}p${20 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${20 % ACCUMULATORS}, vi${M+3}x5678, vget_high_f32(vwGHIJ), 1);
160
161      $for M in range(ROW_TILE):
162        vo${M}p${21 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${21 % ACCUMULATORS}, vi${M+4}x5678, vwOP, 0);
163
164      $for M in range(4 + ROW_TILE):
165        const float32x4_t vi${M}x6789 = vextq_f32(vi${M}x4567, vi${M}x89AB, 2);
166        vi${M}x4567 = vi${M}x89AB;
167
168      $for M in range(ROW_TILE):
169        vo${M}p${22 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${22 % ACCUMULATORS}, vi${M}x6789, vget_low_f32(vw4567), 1);
170
171      $for M in range(ROW_TILE):
172        vo${M}p${23 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${23 % ACCUMULATORS}, vi${M+1}x6789, vget_high_f32(vw89AB), 0);
173
174      $for M in range(ROW_TILE):
175        vo${M}p${24 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${24 % ACCUMULATORS}, vi${M+2}x6789, vget_high_f32(vwCDEF), 1);
176
177      $for M in range(ROW_TILE):
178        vo${M}p${25 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${25 % ACCUMULATORS}, vi${M+3}x6789, vget_low_f32(vwKLMN), 0);
179
180      $for M in range(ROW_TILE):
181        vo${M}p${26 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${26 % ACCUMULATORS}, vi${M+4}x6789, vwOP, 1);
182
183      $if ACCUMULATORS > 1:
184        $ACC_SLICE = 1
185        $while ACC_SLICE < ACCUMULATORS:
186          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
187            $if A + ACC_SLICE < ACCUMULATORS:
188              $for M in range(ROW_TILE):
189                vo${M}p${A} = vaddq_f32(vo${M}p${A}, vo${M}p${A + ACC_SLICE});
190          $ACC_SLICE *= 2
191
192      $for M in range(ROW_TILE):
193        float32x4_t vo${M} = vmaxq_f32(vo${M}p0, vmin);
194
195      $for M in range(ROW_TILE):
196        vo${M} = vminq_f32(vo${M}, vmax);
197
198      $for M in reversed(range(ROW_TILE)):
199        vst1q_f32(o${M}, vo${M}); o${M} += 4;
200    }
201    // Always process the last block of 5..8 pixels.
202    if XNN_LIKELY(w > 4 * sizeof(float)) {
203      $for M in range(ROW_TILE):
204        float32x4_t vo${M}p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
205
206      $for M in range(4 + ROW_TILE):
207        float32x4_t vi${M}x89AB = vld1q_f32(i${M}); i${M} += 4;
208
209      $for M in range(4 + ROW_TILE):
210        vi${M}x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi${M}x89AB)));
211
212      $for M in range(ROW_TILE):
213        $if ACCUMULATORS > 1:
214          float32x4_t vo${M}p1 = vmulq_lane_f32(vi${M}x4567, vget_high_f32(vw0123), 1);
215        $else:
216          vo${M}p0 = ${VMULADDQ_LANE_F32}(vo${M}p0, vi${M}x4567, vget_high_f32(vw0123), 1);
217
218      $for M in range(ROW_TILE):
219        $if ACCUMULATORS > 2:
220          float32x4_t vo${M}p2 = vmulq_lane_f32(vi${M+1}x4567, vget_low_f32(vw89AB), 0);
221        $else:
222          vo${M}p0 = ${VMULADDQ_LANE_F32}(vo${M}p0, vi${M+1}x4567, vget_low_f32(vw89AB), 0);
223
224      $for M in range(ROW_TILE):
225        $if ACCUMULATORS > 3:
226          float32x4_t vo${M}p3 = vmulq_lane_f32(vi${M+2}x4567, vget_low_f32(vwCDEF), 1);
227        $else:
228          vo${M}p${4 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${4 % ACCUMULATORS}, vi${M+2}x4567, vget_low_f32(vwCDEF), 1);
229
230      $for M in range(ROW_TILE):
231        $if ACCUMULATORS > 4:
232          float32x4_t vo${M}p4 = vmulq_lane_f32(vi${M+3}x4567, vget_high_f32(vwGHIJ), 0);
233        $else:
234          vo${M}p${5 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${5 % ACCUMULATORS}, vi${M+3}x4567, vget_high_f32(vwGHIJ), 0);
235
236      $for M in range(ROW_TILE):
237        $if ACCUMULATORS > 6:
238          float32x4_t vo${M}p5 = vmulq_lane_f32(vi${M+4}x4567, vget_high_f32(vwKLMN), 1);
239        $else:
240          vo${M}p${6 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${6 % ACCUMULATORS}, vi${M+4}x4567, vget_high_f32(vwKLMN), 1);
241
242      $for M in range(4 + ROW_TILE):
243        const float32x4_t vi${M}x3456 = vextq_f32(vi${M}x0123, vi${M}x4567, 3);
244
245      $for M in range(ROW_TILE):
246        vo${M}p${7 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${7 % ACCUMULATORS}, vi${M}x3456, vget_high_f32(vw0123), 0);
247
248      $for M in range(ROW_TILE):
249        vo${M}p${8 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${8 % ACCUMULATORS}, vi${M+1}x3456, vget_high_f32(vw4567), 1);
250
251      $for M in range(ROW_TILE):
252        vo${M}p${9 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${9 % ACCUMULATORS}, vi${M+2}x3456, vget_low_f32(vwCDEF), 0);
253
254      $for M in range(ROW_TILE):
255        vo${M}p${10 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${10 % ACCUMULATORS}, vi${M+3}x3456, vget_low_f32(vwGHIJ), 1);
256
257      $for M in range(ROW_TILE):
258        vo${M}p${11 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${11 % ACCUMULATORS}, vi${M+4}x3456, vget_high_f32(vwKLMN), 0);
259
260      $for M in range(4 + ROW_TILE):
261        const float32x4_t vi${M}x2345 = vextq_f32(vi${M}x0123, vi${M}x4567, 2);
262        vi${M}x0123 = vi${M}x4567;
263
264      $for M in range(ROW_TILE):
265        vo${M}p${12 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${12 % ACCUMULATORS}, vi${M}x2345, vget_low_f32(vw0123), 1);
266
267      $for M in range(ROW_TILE):
268        vo${M}p${13 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${13 % ACCUMULATORS}, vi${M+1}x2345, vget_high_f32(vw4567), 0);
269
270      $for M in range(ROW_TILE):
271        vo${M}p${14 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${14 % ACCUMULATORS}, vi${M+2}x2345, vget_high_f32(vw89AB), 1);
272
273      $for M in range(ROW_TILE):
274        vo${M}p${15 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${15 % ACCUMULATORS}, vi${M+3}x2345, vget_low_f32(vwGHIJ), 0);
275
276      $for M in range(ROW_TILE):
277        vo${M}p${16 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${16 % ACCUMULATORS}, vi${M+4}x2345, vget_low_f32(vwKLMN), 1);
278
279      $for M in range(4 + ROW_TILE):
280        const float32x4_t vi${M}x5678 = vextq_f32(vi${M}x4567, vi${M}x89AB, 1);
281
282      $for M in range(ROW_TILE):
283        vo${M}p${17 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${17 % ACCUMULATORS}, vi${M}x5678, vget_low_f32(vw4567), 0);
284
285      $for M in range(ROW_TILE):
286        vo${M}p${18 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${18 % ACCUMULATORS}, vi${M+1}x5678, vget_low_f32(vw89AB), 1);
287
288      $for M in range(ROW_TILE):
289        vo${M}p${19 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${19 % ACCUMULATORS}, vi${M+2}x5678, vget_high_f32(vwCDEF), 0);
290
291      $for M in range(ROW_TILE):
292        vo${M}p${20 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${20 % ACCUMULATORS}, vi${M+3}x5678, vget_high_f32(vwGHIJ), 1);
293
294      $for M in range(ROW_TILE):
295        vo${M}p${21 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${21 % ACCUMULATORS}, vi${M+4}x5678, vwOP, 0);
296
297      $for M in range(4 + ROW_TILE):
298        const float32x4_t vi${M}x6789 = vextq_f32(vi${M}x4567, vi${M}x89AB, 2);
299        vi${M}x4567 = vi${M}x89AB;
300
301      $for M in range(ROW_TILE):
302        vo${M}p${22 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${22 % ACCUMULATORS}, vi${M}x6789, vget_low_f32(vw4567), 1);
303
304      $for M in range(ROW_TILE):
305        vo${M}p${23 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${23 % ACCUMULATORS}, vi${M+1}x6789, vget_high_f32(vw89AB), 0);
306
307      $for M in range(ROW_TILE):
308        vo${M}p${24 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${24 % ACCUMULATORS}, vi${M+2}x6789, vget_high_f32(vwCDEF), 1);
309
310      $for M in range(ROW_TILE):
311        vo${M}p${25 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${25 % ACCUMULATORS}, vi${M+3}x6789, vget_low_f32(vwKLMN), 0);
312
313      $for M in range(ROW_TILE):
314        vo${M}p${26 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${26 % ACCUMULATORS}, vi${M+4}x6789, vwOP, 1);
315
316      $if ACCUMULATORS > 1:
317        $ACC_SLICE = 1
318        $while ACC_SLICE < ACCUMULATORS:
319          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
320            $if A + ACC_SLICE < ACCUMULATORS:
321              $for M in range(ROW_TILE):
322                vo${M}p${A} = vaddq_f32(vo${M}p${A}, vo${M}p${A + ACC_SLICE});
323          $ACC_SLICE *= 2
324
325      $for M in range(ROW_TILE):
326        float32x4_t vo${M} = vmaxq_f32(vo${M}p0, vmin);
327
328      $for M in range(ROW_TILE):
329        vo${M} = vminq_f32(vo${M}, vmax);
330
331      $for M in reversed(range(ROW_TILE)):
332        vst1q_f32(o${M}, vo${M}); o${M} += 4;
333
334      w -= 4 * sizeof(float);
335    }
336    assert(w >= 1 * sizeof(float));
337    assert(w <= 4 * sizeof(float));
338    {
339      $for M in range(ROW_TILE):
340        float32x4_t vo${M}p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
341
342      $for M in range(4 + ROW_TILE):
343        vi${M}x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi${M}x4567)));
344
345      $for M in range(ROW_TILE):
346        $if ACCUMULATORS > 1:
347          float32x4_t vo${M}p1 = vmulq_lane_f32(vi${M}x4567, vget_high_f32(vw0123), 1);
348        $else:
349          vo${M}p0 = ${VMULADDQ_LANE_F32}(vo${M}p0, vi${M}x4567, vget_high_f32(vw0123), 1);
350
351      $for M in range(ROW_TILE):
352        $if ACCUMULATORS > 2:
353          float32x4_t vo${M}p2 = vmulq_lane_f32(vi${M+1}x4567, vget_low_f32(vw89AB), 0);
354        $else:
355          vo${M}p0 = ${VMULADDQ_LANE_F32}(vo${M}p0, vi${M+1}x4567, vget_low_f32(vw89AB), 0);
356
357      $for M in range(ROW_TILE):
358        $if ACCUMULATORS > 3:
359          float32x4_t vo${M}p3 = vmulq_lane_f32(vi${M+2}x4567, vget_low_f32(vwCDEF), 1);
360        $else:
361          vo${M}p${4 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${4 % ACCUMULATORS}, vi${M+2}x4567, vget_low_f32(vwCDEF), 1);
362
363      $for M in range(ROW_TILE):
364        $if ACCUMULATORS > 4:
365          float32x4_t vo${M}p4 = vmulq_lane_f32(vi${M+3}x4567, vget_high_f32(vwGHIJ), 0);
366        $else:
367          vo${M}p${5 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${5 % ACCUMULATORS}, vi${M+3}x4567, vget_high_f32(vwGHIJ), 0);
368
369      $for M in range(ROW_TILE):
370        $if ACCUMULATORS > 6:
371          float32x4_t vo${M}p5 = vmulq_lane_f32(vi${M+4}x4567, vget_high_f32(vwKLMN), 1);
372        $else:
373          vo${M}p${6 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${6 % ACCUMULATORS}, vi${M+4}x4567, vget_high_f32(vwKLMN), 1);
374
375      $for M in range(4 + ROW_TILE):
376        const float32x4_t vi${M}x3456 = vextq_f32(vi${M}x0123, vi${M}x4567, 3);
377
378      $for M in range(ROW_TILE):
379        vo${M}p${7 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${7 % ACCUMULATORS}, vi${M}x3456, vget_high_f32(vw0123), 0);
380
381      $for M in range(ROW_TILE):
382        vo${M}p${8 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${8 % ACCUMULATORS}, vi${M+1}x3456, vget_high_f32(vw4567), 1);
383
384      $for M in range(ROW_TILE):
385        vo${M}p${9 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${9 % ACCUMULATORS}, vi${M+2}x3456, vget_low_f32(vwCDEF), 0);
386
387      $for M in range(ROW_TILE):
388        vo${M}p${10 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${10 % ACCUMULATORS}, vi${M+3}x3456, vget_low_f32(vwGHIJ), 1);
389
390      $for M in range(ROW_TILE):
391        vo${M}p${11 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${11 % ACCUMULATORS}, vi${M+4}x3456, vget_high_f32(vwKLMN), 0);
392
393      $for M in range(4 + ROW_TILE):
394        const float32x4_t vi${M}x2345 = vextq_f32(vi${M}x0123, vi${M}x4567, 2);
395
396      $for M in range(ROW_TILE):
397        vo${M}p${12 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${12 % ACCUMULATORS}, vi${M}x2345, vget_low_f32(vw0123), 1);
398
399      $for M in range(ROW_TILE):
400        vo${M}p${13 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${13 % ACCUMULATORS}, vi${M+1}x2345, vget_high_f32(vw4567), 0);
401
402      $for M in range(ROW_TILE):
403        vo${M}p${14 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${14 % ACCUMULATORS}, vi${M+2}x2345, vget_high_f32(vw89AB), 1);
404
405      $for M in range(ROW_TILE):
406        vo${M}p${15 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${15 % ACCUMULATORS}, vi${M+3}x2345, vget_low_f32(vwGHIJ), 0);
407
408      $for M in range(ROW_TILE):
409        vo${M}p${16 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${16 % ACCUMULATORS}, vi${M+4}x2345, vget_low_f32(vwKLMN), 1);
410
411      const float32x4_t vzero = vmovq_n_f32(0.0f);
412      $for M in range(4 + ROW_TILE):
413        const float32x4_t vi${M}x5678 = vextq_f32(vi${M}x4567, vzero, 1);
414
415      $for M in range(ROW_TILE):
416        vo${M}p${17 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${17 % ACCUMULATORS}, vi${M}x5678, vget_low_f32(vw4567), 0);
417
418      $for M in range(ROW_TILE):
419        vo${M}p${18 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${18 % ACCUMULATORS}, vi${M+1}x5678, vget_low_f32(vw89AB), 1);
420
421      $for M in range(ROW_TILE):
422        vo${M}p${19 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${19 % ACCUMULATORS}, vi${M+2}x5678, vget_high_f32(vwCDEF), 0);
423
424      $for M in range(ROW_TILE):
425        vo${M}p${20 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${20 % ACCUMULATORS}, vi${M+3}x5678, vget_high_f32(vwGHIJ), 1);
426
427      $for M in range(ROW_TILE):
428        vo${M}p${21 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${21 % ACCUMULATORS}, vi${M+4}x5678, vwOP, 0);
429
430      $for M in range(4 + ROW_TILE):
431        const float32x4_t vi${M}x6789 = vextq_f32(vi${M}x5678, vzero, 1);
432
433      $for M in range(ROW_TILE):
434        vo${M}p${22 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${22 % ACCUMULATORS}, vi${M}x6789, vget_low_f32(vw4567), 1);
435
436      $for M in range(ROW_TILE):
437        vo${M}p${23 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${23 % ACCUMULATORS}, vi${M+1}x6789, vget_high_f32(vw89AB), 0);
438
439      $for M in range(ROW_TILE):
440        vo${M}p${24 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${24 % ACCUMULATORS}, vi${M+2}x6789, vget_high_f32(vwCDEF), 1);
441
442      $for M in range(ROW_TILE):
443        vo${M}p${25 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${25 % ACCUMULATORS}, vi${M+3}x6789, vget_low_f32(vwKLMN), 0);
444
445      $for M in range(ROW_TILE):
446        vo${M}p${26 % ACCUMULATORS} = ${VMULADDQ_LANE_F32}(vo${M}p${26 % ACCUMULATORS}, vi${M+4}x6789, vwOP, 1);
447
448      $if ACCUMULATORS > 1:
449        $ACC_SLICE = 1
450        $while ACC_SLICE < ACCUMULATORS:
451          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
452            $if A + ACC_SLICE < ACCUMULATORS:
453              $for M in range(ROW_TILE):
454                vo${M}p${A} = vaddq_f32(vo${M}p${A}, vo${M}p${A + ACC_SLICE});
455          $ACC_SLICE *= 2
456
457      $for M in range(ROW_TILE):
458        float32x4_t vo${M} = vmaxq_f32(vo${M}p0, vmin);
459
460      $for M in range(ROW_TILE):
461        vo${M} = vminq_f32(vo${M}, vmax);
462
463      if XNN_LIKELY(w & (4 * sizeof(float))) {
464        $for M in reversed(range(ROW_TILE)):
465          vst1q_f32(o${M}, vo${M}); o${M} += 4;
466      } else {
467        $for M in range(ROW_TILE):
468          float32x2_t vo${M}_lo = vget_low_f32(vo${M});
469        if (w & (2 * sizeof(float))) {
470          $for M in reversed(range(ROW_TILE)):
471            vst1_f32(o${M}, vo${M}_lo); o${M} += 2;
472
473          $for M in range(ROW_TILE):
474            vo${M}_lo = vget_high_f32(vo${M});
475        }
476        if (w & (1 * sizeof(float))) {
477          $for M in reversed(range(ROW_TILE)):
478            vst1_lane_f32(o${M}, vo${M}_lo, 0); o${M} += 1;
479        }
480      }
481    }
482
483    i0 = (const float*) ((uintptr_t) i${ROW_TILE} - input_decrement);
484    i1 = (const float*) ((uintptr_t) i${ROW_TILE+1} - input_decrement);
485    $for M in range(2, 4 + ROW_TILE):
486      i${M} = (const float*) ((uintptr_t) i${M-1} + input_width);
487
488    $if ROW_TILE > 1:
489      o0 = o${ROW_TILE - 1};
490      $for M in range(1, ROW_TILE):
491        o${M} = (float*) ((uintptr_t) o${M-1} + input_width);
492
493    $if ROW_TILE > 1:
494      output_height = doz(output_height, ${ROW_TILE});
495  } while (${"--" if ROW_TILE == 1 else ""}output_height != 0);
496}
497