1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-dwconv2d-chw/5x5p2-neon.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <arm_neon.h>
13 
14 #include <xnnpack/dwconv.h>
15 #include <xnnpack/math.h>
16 
17 
18 void xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2(
19     size_t input_height,
20     size_t input_width,
21     const float* input,
22     const float* weights,
23     const float* zero,
24     float* output,
25     uint32_t padding_top,
26     const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
27 {
28   assert(input_height != 0);
29   assert(input_width != 0);
30   assert(input_width % sizeof(float) == 0);
31   assert(padding_top == 2);
32 
33   const uint32x4_t vmask = vld1q_u32(params->neon.mask);
34   const float32x4_t vmax = vld1q_dup_f32(&params->neon.max);
35   const float32x4_t vmin = vld1q_dup_f32(&params->neon.min);
36 
37   const float32x4_t vw0123 = vld1q_f32(weights);
38   const float32x4_t vw4567 = vld1q_f32(weights + 4);
39   const float32x4_t vw89AB = vld1q_f32(weights + 8);
40   const float32x4_t vwCDEF = vld1q_f32(weights + 12);
41   const float32x4_t vwGHIJ = vld1q_f32(weights + 16);
42   const float32x4_t vwKLMN = vld1q_f32(weights + 20);
43   const float32x2_t vwOP = vld1_f32(weights + 24);
44 
45   const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
46 
47   const float* i0 = zero;
48   const float* i1 = zero;
49   const float* i2 = input;
50   const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
51   const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
52   const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
53   const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
54   const float* i7 = (const float*) ((uintptr_t) i6 + input_width);
55 
56   float* o0 = output;
57   float* o1 = (float*) ((uintptr_t) o0 + input_width);
58   float* o2 = (float*) ((uintptr_t) o1 + input_width);
59   float* o3 = (float*) ((uintptr_t) o2 + input_width);
60 
61   size_t output_height = input_height;
62   do {
63     if XNN_UNPREDICTABLE(output_height < 2) {
64       i3 = zero;
65       o1 = o0;
66     }
67     if XNN_UNPREDICTABLE(output_height < 3) {
68       i4 = zero;
69       o2 = o1;
70     }
71     if XNN_UNPREDICTABLE(output_height < 4) {
72       i5 = zero;
73       o3 = o2;
74     }
75     if XNN_UNPREDICTABLE(output_height < 5) {
76       i6 = zero;
77     }
78     if XNN_UNPREDICTABLE(output_height < 6) {
79       i7 = zero;
80     }
81 
82     float32x4_t vi0x0123 = vmovq_n_f32(0.0f);
83     float32x4_t vi1x0123 = vmovq_n_f32(0.0f);
84     float32x4_t vi2x0123 = vmovq_n_f32(0.0f);
85     float32x4_t vi3x0123 = vmovq_n_f32(0.0f);
86     float32x4_t vi4x0123 = vmovq_n_f32(0.0f);
87     float32x4_t vi5x0123 = vmovq_n_f32(0.0f);
88     float32x4_t vi6x0123 = vmovq_n_f32(0.0f);
89     float32x4_t vi7x0123 = vmovq_n_f32(0.0f);
90 
91     float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
92     float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
93     float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
94     float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
95     float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;
96     float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4;
97     float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4;
98     float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4;
99 
100     size_t w = input_width;
101     for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
102       float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
103       float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
104       float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
105       float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
106 
107       const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4;
108       const float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4;
109       const float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4;
110       const float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4;
111       const float32x4_t vi4x89AB = vld1q_f32(i4); i4 += 4;
112       const float32x4_t vi5x89AB = vld1q_f32(i5); i5 += 4;
113       const float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;
114       const float32x4_t vi7x89AB = vld1q_f32(i7); i7 += 4;
115 
116       float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1);
117       float32x4_t vo1p1 = vmulq_lane_f32(vi1x4567, vget_high_f32(vw0123), 1);
118       float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1);
119       float32x4_t vo3p1 = vmulq_lane_f32(vi3x4567, vget_high_f32(vw0123), 1);
120 
121       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
122       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0);
123       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0);
124       vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0);
125 
126       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
127       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1);
128       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1);
129       vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1);
130 
131       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0);
132       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x4567, vget_high_f32(vwGHIJ), 0);
133       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0);
134       vo3p1 = vfmaq_lane_f32(vo3p1, vi6x4567, vget_high_f32(vwGHIJ), 0);
135 
136       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
137       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1);
138       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1);
139       vo3p0 = vfmaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1);
140 
141       const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
142       const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
143       const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
144       const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
145       const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
146       const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3);
147       const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3);
148       const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);
149 
150       vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0);
151       vo1p1 = vfmaq_lane_f32(vo1p1, vi1x3456, vget_high_f32(vw0123), 0);
152       vo2p1 = vfmaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0);
153       vo3p1 = vfmaq_lane_f32(vo3p1, vi3x3456, vget_high_f32(vw0123), 0);
154 
155       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
156       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1);
157       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1);
158       vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1);
159 
160       vo0p1 = vfmaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0);
161       vo1p1 = vfmaq_lane_f32(vo1p1, vi3x3456, vget_low_f32(vwCDEF), 0);
162       vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0);
163       vo3p1 = vfmaq_lane_f32(vo3p1, vi5x3456, vget_low_f32(vwCDEF), 0);
164 
165       vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
166       vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1);
167       vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1);
168       vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1);
169 
170       vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0);
171       vo1p1 = vfmaq_lane_f32(vo1p1, vi5x3456, vget_high_f32(vwKLMN), 0);
172       vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0);
173       vo3p1 = vfmaq_lane_f32(vo3p1, vi7x3456, vget_high_f32(vwKLMN), 0);
174 
175       const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
176       vi0x0123 = vi0x4567;
177       const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
178       vi1x0123 = vi1x4567;
179       const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
180       vi2x0123 = vi2x4567;
181       const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
182       vi3x0123 = vi3x4567;
183       const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
184       vi4x0123 = vi4x4567;
185       const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);
186       vi5x0123 = vi5x4567;
187       const float32x4_t vi6x2345 = vextq_f32(vi6x0123, vi6x4567, 2);
188       vi6x0123 = vi6x4567;
189       const float32x4_t vi7x2345 = vextq_f32(vi7x0123, vi7x4567, 2);
190       vi7x0123 = vi7x4567;
191 
192       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
193       vo1p0 = vfmaq_lane_f32(vo1p0, vi1x2345, vget_low_f32(vw0123), 1);
194       vo2p0 = vfmaq_lane_f32(vo2p0, vi2x2345, vget_low_f32(vw0123), 1);
195       vo3p0 = vfmaq_lane_f32(vo3p0, vi3x2345, vget_low_f32(vw0123), 1);
196 
197       vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0);
198       vo1p1 = vfmaq_lane_f32(vo1p1, vi2x2345, vget_high_f32(vw4567), 0);
199       vo2p1 = vfmaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0);
200       vo3p1 = vfmaq_lane_f32(vo3p1, vi4x2345, vget_high_f32(vw4567), 0);
201 
202       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
203       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x2345, vget_high_f32(vw89AB), 1);
204       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x2345, vget_high_f32(vw89AB), 1);
205       vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);
206 
207       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0);
208       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x2345, vget_low_f32(vwGHIJ), 0);
209       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);
210       vo3p1 = vfmaq_lane_f32(vo3p1, vi6x2345, vget_low_f32(vwGHIJ), 0);
211 
212       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);
213       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);
214       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x2345, vget_low_f32(vwKLMN), 1);
215       vo3p0 = vfmaq_lane_f32(vo3p0, vi7x2345, vget_low_f32(vwKLMN), 1);
216 
217       const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vi0x89AB, 1);
218       const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
219       const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1);
220       const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1);
221       const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
222       const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);
223       const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
224       const float32x4_t vi7x5678 = vextq_f32(vi7x4567, vi7x89AB, 1);
225 
226       vo0p1 = vfmaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0);
227       vo1p1 = vfmaq_lane_f32(vo1p1, vi1x5678, vget_low_f32(vw4567), 0);
228       vo2p1 = vfmaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0);
229       vo3p1 = vfmaq_lane_f32(vo3p1, vi3x5678, vget_low_f32(vw4567), 0);
230 
231       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
232       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x5678, vget_low_f32(vw89AB), 1);
233       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x5678, vget_low_f32(vw89AB), 1);
234       vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_low_f32(vw89AB), 1);
235 
236       vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0);
237       vo1p1 = vfmaq_lane_f32(vo1p1, vi3x5678, vget_high_f32(vwCDEF), 0);
238       vo2p1 = vfmaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0);
239       vo3p1 = vfmaq_lane_f32(vo3p1, vi5x5678, vget_high_f32(vwCDEF), 0);
240 
241       vo0p0 = vfmaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
242       vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1);
243       vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
244       vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1);
245 
246       vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0);
247       vo1p1 = vfmaq_lane_f32(vo1p1, vi5x5678, vwOP, 0);
248       vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0);
249       vo3p1 = vfmaq_lane_f32(vo3p1, vi7x5678, vwOP, 0);
250 
251       const float32x4_t vi0x6789 = vextq_f32(vi0x4567, vi0x89AB, 2);
252       vi0x4567 = vi0x89AB;
253       const float32x4_t vi1x6789 = vextq_f32(vi1x4567, vi1x89AB, 2);
254       vi1x4567 = vi1x89AB;
255       const float32x4_t vi2x6789 = vextq_f32(vi2x4567, vi2x89AB, 2);
256       vi2x4567 = vi2x89AB;
257       const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2);
258       vi3x4567 = vi3x89AB;
259       const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
260       vi4x4567 = vi4x89AB;
261       const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2);
262       vi5x4567 = vi5x89AB;
263       const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
264       vi6x4567 = vi6x89AB;
265       const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2);
266       vi7x4567 = vi7x89AB;
267 
268       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
269       vo1p0 = vfmaq_lane_f32(vo1p0, vi1x6789, vget_low_f32(vw4567), 1);
270       vo2p0 = vfmaq_lane_f32(vo2p0, vi2x6789, vget_low_f32(vw4567), 1);
271       vo3p0 = vfmaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1);
272 
273       vo0p1 = vfmaq_lane_f32(vo0p1, vi1x6789, vget_high_f32(vw89AB), 0);
274       vo1p1 = vfmaq_lane_f32(vo1p1, vi2x6789, vget_high_f32(vw89AB), 0);
275       vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0);
276       vo3p1 = vfmaq_lane_f32(vo3p1, vi4x6789, vget_high_f32(vw89AB), 0);
277 
278       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
279       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1);
280       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x6789, vget_high_f32(vwCDEF), 1);
281       vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1);
282 
283       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0);
284       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x6789, vget_low_f32(vwKLMN), 0);
285       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0);
286       vo3p1 = vfmaq_lane_f32(vo3p1, vi6x6789, vget_low_f32(vwKLMN), 0);
287 
288       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);
289       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1);
290       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x6789, vwOP, 1);
291       vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1);
292 
293       vo0p0 = vaddq_f32(vo0p0, vo0p1);
294       vo1p0 = vaddq_f32(vo1p0, vo1p1);
295       vo2p0 = vaddq_f32(vo2p0, vo2p1);
296       vo3p0 = vaddq_f32(vo3p0, vo3p1);
297 
298       float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
299       float32x4_t vo1 = vmaxq_f32(vo1p0, vmin);
300       float32x4_t vo2 = vmaxq_f32(vo2p0, vmin);
301       float32x4_t vo3 = vmaxq_f32(vo3p0, vmin);
302 
303       vo0 = vminq_f32(vo0, vmax);
304       vo1 = vminq_f32(vo1, vmax);
305       vo2 = vminq_f32(vo2, vmax);
306       vo3 = vminq_f32(vo3, vmax);
307 
308       vst1q_f32(o3, vo3); o3 += 4;
309       vst1q_f32(o2, vo2); o2 += 4;
310       vst1q_f32(o1, vo1); o1 += 4;
311       vst1q_f32(o0, vo0); o0 += 4;
312     }
313     // Always process the last block of 5..8 pixels.
314     if XNN_LIKELY(w > 4 * sizeof(float)) {
315       float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
316       float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
317       float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
318       float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
319 
320       float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4;
321       float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4;
322       float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4;
323       float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4;
324       float32x4_t vi4x89AB = vld1q_f32(i4); i4 += 4;
325       float32x4_t vi5x89AB = vld1q_f32(i5); i5 += 4;
326       float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;
327       float32x4_t vi7x89AB = vld1q_f32(i7); i7 += 4;
328 
329       vi0x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0x89AB)));
330       vi1x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x89AB)));
331       vi2x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x89AB)));
332       vi3x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi3x89AB)));
333       vi4x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi4x89AB)));
334       vi5x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi5x89AB)));
335       vi6x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x89AB)));
336       vi7x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi7x89AB)));
337 
338       float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1);
339       float32x4_t vo1p1 = vmulq_lane_f32(vi1x4567, vget_high_f32(vw0123), 1);
340       float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1);
341       float32x4_t vo3p1 = vmulq_lane_f32(vi3x4567, vget_high_f32(vw0123), 1);
342 
343       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
344       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0);
345       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0);
346       vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0);
347 
348       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
349       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1);
350       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1);
351       vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1);
352 
353       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0);
354       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x4567, vget_high_f32(vwGHIJ), 0);
355       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0);
356       vo3p1 = vfmaq_lane_f32(vo3p1, vi6x4567, vget_high_f32(vwGHIJ), 0);
357 
358       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
359       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1);
360       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1);
361       vo3p0 = vfmaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1);
362 
363       const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
364       const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
365       const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
366       const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
367       const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
368       const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3);
369       const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3);
370       const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);
371 
372       vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0);
373       vo1p1 = vfmaq_lane_f32(vo1p1, vi1x3456, vget_high_f32(vw0123), 0);
374       vo2p1 = vfmaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0);
375       vo3p1 = vfmaq_lane_f32(vo3p1, vi3x3456, vget_high_f32(vw0123), 0);
376 
377       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
378       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1);
379       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1);
380       vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1);
381 
382       vo0p1 = vfmaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0);
383       vo1p1 = vfmaq_lane_f32(vo1p1, vi3x3456, vget_low_f32(vwCDEF), 0);
384       vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0);
385       vo3p1 = vfmaq_lane_f32(vo3p1, vi5x3456, vget_low_f32(vwCDEF), 0);
386 
387       vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
388       vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1);
389       vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1);
390       vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1);
391 
392       vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0);
393       vo1p1 = vfmaq_lane_f32(vo1p1, vi5x3456, vget_high_f32(vwKLMN), 0);
394       vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0);
395       vo3p1 = vfmaq_lane_f32(vo3p1, vi7x3456, vget_high_f32(vwKLMN), 0);
396 
397       const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
398       vi0x0123 = vi0x4567;
399       const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
400       vi1x0123 = vi1x4567;
401       const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
402       vi2x0123 = vi2x4567;
403       const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
404       vi3x0123 = vi3x4567;
405       const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
406       vi4x0123 = vi4x4567;
407       const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);
408       vi5x0123 = vi5x4567;
409       const float32x4_t vi6x2345 = vextq_f32(vi6x0123, vi6x4567, 2);
410       vi6x0123 = vi6x4567;
411       const float32x4_t vi7x2345 = vextq_f32(vi7x0123, vi7x4567, 2);
412       vi7x0123 = vi7x4567;
413 
414       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
415       vo1p0 = vfmaq_lane_f32(vo1p0, vi1x2345, vget_low_f32(vw0123), 1);
416       vo2p0 = vfmaq_lane_f32(vo2p0, vi2x2345, vget_low_f32(vw0123), 1);
417       vo3p0 = vfmaq_lane_f32(vo3p0, vi3x2345, vget_low_f32(vw0123), 1);
418 
419       vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0);
420       vo1p1 = vfmaq_lane_f32(vo1p1, vi2x2345, vget_high_f32(vw4567), 0);
421       vo2p1 = vfmaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0);
422       vo3p1 = vfmaq_lane_f32(vo3p1, vi4x2345, vget_high_f32(vw4567), 0);
423 
424       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
425       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x2345, vget_high_f32(vw89AB), 1);
426       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x2345, vget_high_f32(vw89AB), 1);
427       vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);
428 
429       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0);
430       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x2345, vget_low_f32(vwGHIJ), 0);
431       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);
432       vo3p1 = vfmaq_lane_f32(vo3p1, vi6x2345, vget_low_f32(vwGHIJ), 0);
433 
434       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);
435       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);
436       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x2345, vget_low_f32(vwKLMN), 1);
437       vo3p0 = vfmaq_lane_f32(vo3p0, vi7x2345, vget_low_f32(vwKLMN), 1);
438 
439       const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vi0x89AB, 1);
440       const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
441       const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1);
442       const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1);
443       const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
444       const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);
445       const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
446       const float32x4_t vi7x5678 = vextq_f32(vi7x4567, vi7x89AB, 1);
447 
448       vo0p1 = vfmaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0);
449       vo1p1 = vfmaq_lane_f32(vo1p1, vi1x5678, vget_low_f32(vw4567), 0);
450       vo2p1 = vfmaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0);
451       vo3p1 = vfmaq_lane_f32(vo3p1, vi3x5678, vget_low_f32(vw4567), 0);
452 
453       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
454       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x5678, vget_low_f32(vw89AB), 1);
455       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x5678, vget_low_f32(vw89AB), 1);
456       vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_low_f32(vw89AB), 1);
457 
458       vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0);
459       vo1p1 = vfmaq_lane_f32(vo1p1, vi3x5678, vget_high_f32(vwCDEF), 0);
460       vo2p1 = vfmaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0);
461       vo3p1 = vfmaq_lane_f32(vo3p1, vi5x5678, vget_high_f32(vwCDEF), 0);
462 
463       vo0p0 = vfmaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
464       vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1);
465       vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
466       vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1);
467 
468       vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0);
469       vo1p1 = vfmaq_lane_f32(vo1p1, vi5x5678, vwOP, 0);
470       vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0);
471       vo3p1 = vfmaq_lane_f32(vo3p1, vi7x5678, vwOP, 0);
472 
473       const float32x4_t vi0x6789 = vextq_f32(vi0x4567, vi0x89AB, 2);
474       vi0x4567 = vi0x89AB;
475       const float32x4_t vi1x6789 = vextq_f32(vi1x4567, vi1x89AB, 2);
476       vi1x4567 = vi1x89AB;
477       const float32x4_t vi2x6789 = vextq_f32(vi2x4567, vi2x89AB, 2);
478       vi2x4567 = vi2x89AB;
479       const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2);
480       vi3x4567 = vi3x89AB;
481       const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
482       vi4x4567 = vi4x89AB;
483       const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2);
484       vi5x4567 = vi5x89AB;
485       const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
486       vi6x4567 = vi6x89AB;
487       const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2);
488       vi7x4567 = vi7x89AB;
489 
490       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
491       vo1p0 = vfmaq_lane_f32(vo1p0, vi1x6789, vget_low_f32(vw4567), 1);
492       vo2p0 = vfmaq_lane_f32(vo2p0, vi2x6789, vget_low_f32(vw4567), 1);
493       vo3p0 = vfmaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1);
494 
495       vo0p1 = vfmaq_lane_f32(vo0p1, vi1x6789, vget_high_f32(vw89AB), 0);
496       vo1p1 = vfmaq_lane_f32(vo1p1, vi2x6789, vget_high_f32(vw89AB), 0);
497       vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0);
498       vo3p1 = vfmaq_lane_f32(vo3p1, vi4x6789, vget_high_f32(vw89AB), 0);
499 
500       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
501       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1);
502       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x6789, vget_high_f32(vwCDEF), 1);
503       vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1);
504 
505       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0);
506       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x6789, vget_low_f32(vwKLMN), 0);
507       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0);
508       vo3p1 = vfmaq_lane_f32(vo3p1, vi6x6789, vget_low_f32(vwKLMN), 0);
509 
510       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);
511       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1);
512       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x6789, vwOP, 1);
513       vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1);
514 
515       vo0p0 = vaddq_f32(vo0p0, vo0p1);
516       vo1p0 = vaddq_f32(vo1p0, vo1p1);
517       vo2p0 = vaddq_f32(vo2p0, vo2p1);
518       vo3p0 = vaddq_f32(vo3p0, vo3p1);
519 
520       float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
521       float32x4_t vo1 = vmaxq_f32(vo1p0, vmin);
522       float32x4_t vo2 = vmaxq_f32(vo2p0, vmin);
523       float32x4_t vo3 = vmaxq_f32(vo3p0, vmin);
524 
525       vo0 = vminq_f32(vo0, vmax);
526       vo1 = vminq_f32(vo1, vmax);
527       vo2 = vminq_f32(vo2, vmax);
528       vo3 = vminq_f32(vo3, vmax);
529 
530       vst1q_f32(o3, vo3); o3 += 4;
531       vst1q_f32(o2, vo2); o2 += 4;
532       vst1q_f32(o1, vo1); o1 += 4;
533       vst1q_f32(o0, vo0); o0 += 4;
534 
535       w -= 4 * sizeof(float);
536     }
537     assert(w >= 1 * sizeof(float));
538     assert(w <= 4 * sizeof(float));
539     {
540       float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
541       float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
542       float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
543       float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
544 
545       vi0x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0x4567)));
546       vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
547       vi2x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x4567)));
548       vi3x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi3x4567)));
549       vi4x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi4x4567)));
550       vi5x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi5x4567)));
551       vi6x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x4567)));
552       vi7x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi7x4567)));
553 
554       float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1);
555       float32x4_t vo1p1 = vmulq_lane_f32(vi1x4567, vget_high_f32(vw0123), 1);
556       float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1);
557       float32x4_t vo3p1 = vmulq_lane_f32(vi3x4567, vget_high_f32(vw0123), 1);
558 
559       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
560       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0);
561       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0);
562       vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0);
563 
564       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
565       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1);
566       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1);
567       vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1);
568 
569       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0);
570       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x4567, vget_high_f32(vwGHIJ), 0);
571       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0);
572       vo3p1 = vfmaq_lane_f32(vo3p1, vi6x4567, vget_high_f32(vwGHIJ), 0);
573 
574       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
575       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1);
576       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1);
577       vo3p0 = vfmaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1);
578 
579       const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
580       const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
581       const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
582       const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
583       const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
584       const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3);
585       const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3);
586       const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);
587 
588       vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0);
589       vo1p1 = vfmaq_lane_f32(vo1p1, vi1x3456, vget_high_f32(vw0123), 0);
590       vo2p1 = vfmaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0);
591       vo3p1 = vfmaq_lane_f32(vo3p1, vi3x3456, vget_high_f32(vw0123), 0);
592 
593       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
594       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1);
595       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1);
596       vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1);
597 
598       vo0p1 = vfmaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0);
599       vo1p1 = vfmaq_lane_f32(vo1p1, vi3x3456, vget_low_f32(vwCDEF), 0);
600       vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0);
601       vo3p1 = vfmaq_lane_f32(vo3p1, vi5x3456, vget_low_f32(vwCDEF), 0);
602 
603       vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
604       vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1);
605       vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1);
606       vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1);
607 
608       vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0);
609       vo1p1 = vfmaq_lane_f32(vo1p1, vi5x3456, vget_high_f32(vwKLMN), 0);
610       vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0);
611       vo3p1 = vfmaq_lane_f32(vo3p1, vi7x3456, vget_high_f32(vwKLMN), 0);
612 
613       const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
614       const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
615       const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
616       const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
617       const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
618       const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);
619       const float32x4_t vi6x2345 = vextq_f32(vi6x0123, vi6x4567, 2);
620       const float32x4_t vi7x2345 = vextq_f32(vi7x0123, vi7x4567, 2);
621 
622       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
623       vo1p0 = vfmaq_lane_f32(vo1p0, vi1x2345, vget_low_f32(vw0123), 1);
624       vo2p0 = vfmaq_lane_f32(vo2p0, vi2x2345, vget_low_f32(vw0123), 1);
625       vo3p0 = vfmaq_lane_f32(vo3p0, vi3x2345, vget_low_f32(vw0123), 1);
626 
627       vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0);
628       vo1p1 = vfmaq_lane_f32(vo1p1, vi2x2345, vget_high_f32(vw4567), 0);
629       vo2p1 = vfmaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0);
630       vo3p1 = vfmaq_lane_f32(vo3p1, vi4x2345, vget_high_f32(vw4567), 0);
631 
632       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
633       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x2345, vget_high_f32(vw89AB), 1);
634       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x2345, vget_high_f32(vw89AB), 1);
635       vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);
636 
637       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0);
638       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x2345, vget_low_f32(vwGHIJ), 0);
639       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);
640       vo3p1 = vfmaq_lane_f32(vo3p1, vi6x2345, vget_low_f32(vwGHIJ), 0);
641 
642       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);
643       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);
644       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x2345, vget_low_f32(vwKLMN), 1);
645       vo3p0 = vfmaq_lane_f32(vo3p0, vi7x2345, vget_low_f32(vwKLMN), 1);
646 
647       const float32x4_t vzero = vmovq_n_f32(0.0f);
648       const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vzero, 1);
649       const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);
650       const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vzero, 1);
651       const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vzero, 1);
652       const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1);
653       const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vzero, 1);
654       const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1);
655       const float32x4_t vi7x5678 = vextq_f32(vi7x4567, vzero, 1);
656 
657       vo0p1 = vfmaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0);
658       vo1p1 = vfmaq_lane_f32(vo1p1, vi1x5678, vget_low_f32(vw4567), 0);
659       vo2p1 = vfmaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0);
660       vo3p1 = vfmaq_lane_f32(vo3p1, vi3x5678, vget_low_f32(vw4567), 0);
661 
662       vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
663       vo1p0 = vfmaq_lane_f32(vo1p0, vi2x5678, vget_low_f32(vw89AB), 1);
664       vo2p0 = vfmaq_lane_f32(vo2p0, vi3x5678, vget_low_f32(vw89AB), 1);
665       vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_low_f32(vw89AB), 1);
666 
667       vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0);
668       vo1p1 = vfmaq_lane_f32(vo1p1, vi3x5678, vget_high_f32(vwCDEF), 0);
669       vo2p1 = vfmaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0);
670       vo3p1 = vfmaq_lane_f32(vo3p1, vi5x5678, vget_high_f32(vwCDEF), 0);
671 
672       vo0p0 = vfmaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
673       vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1);
674       vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
675       vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1);
676 
677       vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0);
678       vo1p1 = vfmaq_lane_f32(vo1p1, vi5x5678, vwOP, 0);
679       vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0);
680       vo3p1 = vfmaq_lane_f32(vo3p1, vi7x5678, vwOP, 0);
681 
682       const float32x4_t vi0x6789 = vextq_f32(vi0x5678, vzero, 1);
683       const float32x4_t vi1x6789 = vextq_f32(vi1x5678, vzero, 1);
684       const float32x4_t vi2x6789 = vextq_f32(vi2x5678, vzero, 1);
685       const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1);
686       const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1);
687       const float32x4_t vi5x6789 = vextq_f32(vi5x5678, vzero, 1);
688       const float32x4_t vi6x6789 = vextq_f32(vi6x5678, vzero, 1);
689       const float32x4_t vi7x6789 = vextq_f32(vi7x5678, vzero, 1);
690 
691       vo0p0 = vfmaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
692       vo1p0 = vfmaq_lane_f32(vo1p0, vi1x6789, vget_low_f32(vw4567), 1);
693       vo2p0 = vfmaq_lane_f32(vo2p0, vi2x6789, vget_low_f32(vw4567), 1);
694       vo3p0 = vfmaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1);
695 
696       vo0p1 = vfmaq_lane_f32(vo0p1, vi1x6789, vget_high_f32(vw89AB), 0);
697       vo1p1 = vfmaq_lane_f32(vo1p1, vi2x6789, vget_high_f32(vw89AB), 0);
698       vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0);
699       vo3p1 = vfmaq_lane_f32(vo3p1, vi4x6789, vget_high_f32(vw89AB), 0);
700 
701       vo0p0 = vfmaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
702       vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1);
703       vo2p0 = vfmaq_lane_f32(vo2p0, vi4x6789, vget_high_f32(vwCDEF), 1);
704       vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1);
705 
706       vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0);
707       vo1p1 = vfmaq_lane_f32(vo1p1, vi4x6789, vget_low_f32(vwKLMN), 0);
708       vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0);
709       vo3p1 = vfmaq_lane_f32(vo3p1, vi6x6789, vget_low_f32(vwKLMN), 0);
710 
711       vo0p0 = vfmaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);
712       vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1);
713       vo2p0 = vfmaq_lane_f32(vo2p0, vi6x6789, vwOP, 1);
714       vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1);
715 
716       vo0p0 = vaddq_f32(vo0p0, vo0p1);
717       vo1p0 = vaddq_f32(vo1p0, vo1p1);
718       vo2p0 = vaddq_f32(vo2p0, vo2p1);
719       vo3p0 = vaddq_f32(vo3p0, vo3p1);
720 
721       float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
722       float32x4_t vo1 = vmaxq_f32(vo1p0, vmin);
723       float32x4_t vo2 = vmaxq_f32(vo2p0, vmin);
724       float32x4_t vo3 = vmaxq_f32(vo3p0, vmin);
725 
726       vo0 = vminq_f32(vo0, vmax);
727       vo1 = vminq_f32(vo1, vmax);
728       vo2 = vminq_f32(vo2, vmax);
729       vo3 = vminq_f32(vo3, vmax);
730 
731       if XNN_LIKELY(w & (4 * sizeof(float))) {
732         vst1q_f32(o3, vo3); o3 += 4;
733         vst1q_f32(o2, vo2); o2 += 4;
734         vst1q_f32(o1, vo1); o1 += 4;
735         vst1q_f32(o0, vo0); o0 += 4;
736       } else {
737         float32x2_t vo0_lo = vget_low_f32(vo0);
738         float32x2_t vo1_lo = vget_low_f32(vo1);
739         float32x2_t vo2_lo = vget_low_f32(vo2);
740         float32x2_t vo3_lo = vget_low_f32(vo3);
741         if (w & (2 * sizeof(float))) {
742           vst1_f32(o3, vo3_lo); o3 += 2;
743           vst1_f32(o2, vo2_lo); o2 += 2;
744           vst1_f32(o1, vo1_lo); o1 += 2;
745           vst1_f32(o0, vo0_lo); o0 += 2;
746 
747           vo0_lo = vget_high_f32(vo0);
748           vo1_lo = vget_high_f32(vo1);
749           vo2_lo = vget_high_f32(vo2);
750           vo3_lo = vget_high_f32(vo3);
751         }
752         if (w & (1 * sizeof(float))) {
753           vst1_lane_f32(o3, vo3_lo, 0); o3 += 1;
754           vst1_lane_f32(o2, vo2_lo, 0); o2 += 1;
755           vst1_lane_f32(o1, vo1_lo, 0); o1 += 1;
756           vst1_lane_f32(o0, vo0_lo, 0); o0 += 1;
757         }
758       }
759     }
760 
761     i0 = (const float*) ((uintptr_t) i4 - input_decrement);
762     i1 = (const float*) ((uintptr_t) i5 - input_decrement);
763     i2 = (const float*) ((uintptr_t) i1 + input_width);
764     i3 = (const float*) ((uintptr_t) i2 + input_width);
765     i4 = (const float*) ((uintptr_t) i3 + input_width);
766     i5 = (const float*) ((uintptr_t) i4 + input_width);
767     i6 = (const float*) ((uintptr_t) i5 + input_width);
768     i7 = (const float*) ((uintptr_t) i6 + input_width);
769 
770     o0 = o3;
771     o1 = (float*) ((uintptr_t) o0 + input_width);
772     o2 = (float*) ((uintptr_t) o1 + input_width);
773     o3 = (float*) ((uintptr_t) o2 + input_width);
774 
775     output_height = doz(output_height, 4);
776   } while (output_height != 0);
777 }
778