1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-conv-hwc/3x3s2p1c3-neon-x2.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 
11 #include <assert.h>
12 
13 #include <arm_neon.h>
14 
15 #include <xnnpack/conv.h>
16 #include <xnnpack/math.h>
17 
18 
19 void xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2(
20     size_t input_height,
21     size_t input_width,
22     size_t output_y_start,
23     size_t output_y_end,
24     const float* input,
25     const float* zero,
26     const float* weights,
27     float* output,
28     size_t input_padding_top,
29     size_t output_channels,
30     size_t output_height_stride,
31     size_t output_width_stride,
32     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
33 {
34   assert(input_width != 0);
35   assert(output_y_end > output_y_start);
36   assert(input_padding_top <= 1);
37   assert(output_channels != 0);
38 
39   const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
40   const size_t input_width_decrement = input_width * 3 /* channels */ * sizeof(float);
41   const size_t output_width = (input_width + 1) / 2;
42   const size_t output_channel_decrement = output_width * output_width_stride - 4 * sizeof(float);
43   const size_t output_height_increment = output_height_stride * 2 - round_up_po2(output_channels, 4) * sizeof(float);
44 
45   // Adjustment for padding processed below
46   const float* i0 = (const float*) ((uintptr_t) input +
47     input_height_stride * (output_y_start * 2 /* vertical stride */ - input_padding_top));
48   const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
49   const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
50   const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
51   const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
52   float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
53   float* o1 = (float*) ((uintptr_t) o0 + output_height_stride);
54 
55   if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
56     i0 = zero;
57   }
58 
59 
60   for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
61     const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
62     const size_t input_y4 = input_y2 + 2;
63     if XNN_UNPREDICTABLE(input_y2 > input_height) {
64       i1 = zero;
65     }
66     if XNN_UNPREDICTABLE(input_y2 >= input_height) {
67       i2 = zero;
68     }
69     if XNN_UNPREDICTABLE(input_y4 > input_height) {
70       i3 = zero;
71     }
72     if XNN_UNPREDICTABLE(input_y4 >= input_height) {
73       i4 = zero;
74     }
75     if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
76       o1 = o0;
77     }
78 
79     const float* w = weights;
80     size_t c = output_channels;
81     do {
82       // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
83       float32x4_t vi0x0 = vmovq_n_f32(0.0f);
84       float32x4_t vi1x0 = vmovq_n_f32(0.0f);
85       float32x4_t vi2x0 = vmovq_n_f32(0.0f);
86       float32x4_t vi3x0 = vmovq_n_f32(0.0f);
87       float32x4_t vi4x0 = vmovq_n_f32(0.0f);
88 
89       size_t iw = input_width;
90       for (; iw >= 4; iw -= 4) {
91         float32x4_t vo0x0c0123 = vld1q_f32(w);
92         float32x4_t vo1x0c0123 = vo0x0c0123;
93         float32x4_t vo0x1c0123 = vo0x0c0123;
94         float32x4_t vo1x1c0123 = vo0x0c0123;
95 
96         const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);
97 
98         // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
99         const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
100         const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
101         const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
102         const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
103         const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;
104 
105         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk00c0x0123, vget_low_f32(vi0x0), 1);
106         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c0x0123, vget_low_f32(vi2x0), 1);
107 
108         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk00c0x0123, vget_high_f32(vi0x1), 1);
109         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 1);
110 
111         const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);
112 
113         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk10c0x0123, vget_low_f32(vi1x0), 1);
114         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk10c0x0123, vget_low_f32(vi3x0), 1);
115 
116         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk10c0x0123, vget_high_f32(vi1x1), 1);
117         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk10c0x0123, vget_high_f32(vi3x1), 1);
118 
119         const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);
120 
121         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1);
122         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk20c0x0123, vget_low_f32(vi4x0), 1);
123 
124         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 1);
125         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk20c0x0123, vget_high_f32(vi4x1), 1);
126 
127         const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);
128 
129         // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
130         const float32x4_t vi0x2 = vld1q_f32(i0); i0 += 4;
131         const float32x4_t vi1x2 = vld1q_f32(i1); i1 += 4;
132         const float32x4_t vi2x2 = vld1q_f32(i2); i2 += 4;
133         const float32x4_t vi3x2 = vld1q_f32(i3); i3 += 4;
134         const float32x4_t vi4x2 = vld1q_f32(i4); i4 += 4;
135 
136         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk00c1x0123, vget_high_f32(vi0x0), 0);
137         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c1x0123, vget_high_f32(vi2x0), 0);
138 
139         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk00c1x0123, vget_low_f32(vi0x2), 0);
140         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk00c1x0123, vget_low_f32(vi2x2), 0);
141 
142         const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);
143 
144         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk10c1x0123, vget_high_f32(vi1x0), 0);
145         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk10c1x0123, vget_high_f32(vi3x0), 0);
146 
147         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk10c1x0123, vget_low_f32(vi1x2), 0);
148         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk10c1x0123, vget_low_f32(vi3x2), 0);
149 
150         const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);
151 
152         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0);
153         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk20c1x0123, vget_high_f32(vi4x0), 0);
154 
155         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk20c1x0123, vget_low_f32(vi2x2), 0);
156         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk20c1x0123, vget_low_f32(vi4x2), 0);
157 
158         const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);
159 
160         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk00c2x0123, vget_high_f32(vi0x0), 1);
161         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk00c2x0123, vget_high_f32(vi2x0), 1);
162 
163         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk00c2x0123, vget_low_f32(vi0x2), 1);
164         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk00c2x0123, vget_low_f32(vi2x2), 1);
165 
166         const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);
167 
168         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk10c2x0123, vget_high_f32(vi1x0), 1);
169         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk10c2x0123, vget_high_f32(vi3x0), 1);
170 
171         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk10c2x0123, vget_low_f32(vi1x2), 1);
172         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk10c2x0123, vget_low_f32(vi3x2), 1);
173 
174         const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);
175 
176         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk20c2x0123, vget_high_f32(vi2x0), 1);
177         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk20c2x0123, vget_high_f32(vi4x0), 1);
178 
179         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk20c2x0123, vget_low_f32(vi2x2), 1);
180         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk20c2x0123, vget_low_f32(vi4x2), 1);
181 
182         const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);
183 
184         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk01c0x0123, vget_low_f32(vi0x1), 0);
185         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c0x0123, vget_low_f32(vi2x1), 0);
186 
187         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk01c0x0123, vget_high_f32(vi0x2), 0);
188         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk01c0x0123, vget_high_f32(vi2x2), 0);
189 
190         const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);
191 
192         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk11c0x0123, vget_low_f32(vi1x1), 0);
193         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk11c0x0123, vget_low_f32(vi3x1), 0);
194 
195         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk11c0x0123, vget_high_f32(vi1x2), 0);
196         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk11c0x0123, vget_high_f32(vi3x2), 0);
197 
198         const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);
199 
200         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0);
201         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk21c0x0123, vget_low_f32(vi4x1), 0);
202 
203         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk21c0x0123, vget_high_f32(vi2x2), 0);
204         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk21c0x0123, vget_high_f32(vi4x2), 0);
205 
206         const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);
207 
208         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk01c1x0123, vget_low_f32(vi0x1), 1);
209         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 1);
210 
211         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk01c1x0123, vget_high_f32(vi0x2), 1);
212         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk01c1x0123, vget_high_f32(vi2x2), 1);
213 
214         const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);
215 
216         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk11c1x0123, vget_low_f32(vi1x1), 1);
217         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk11c1x0123, vget_low_f32(vi3x1), 1);
218 
219         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk11c1x0123, vget_high_f32(vi1x2), 1);
220         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk11c1x0123, vget_high_f32(vi3x2), 1);
221 
222         const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);
223 
224         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1);
225         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk21c1x0123, vget_low_f32(vi4x1), 1);
226 
227         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk21c1x0123, vget_high_f32(vi2x2), 1);
228         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk21c1x0123, vget_high_f32(vi4x2), 1);
229 
230         const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);
231 
232         // viMx3 = ( iM4c2, iM4c1, iM4c0, iM3c2 )
233         const float32x4_t vi0x3 = vld1q_f32(i0); i0 += 4;
234         const float32x4_t vi1x3 = vld1q_f32(i1); i1 += 4;
235         const float32x4_t vi2x3 = vld1q_f32(i2); i2 += 4;
236         const float32x4_t vi3x3 = vld1q_f32(i3); i3 += 4;
237         const float32x4_t vi4x3 = vld1q_f32(i4); i4 += 4;
238 
239         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk01c2x0123, vget_high_f32(vi0x1), 0);
240         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c2x0123, vget_high_f32(vi2x1), 0);
241 
242         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk01c2x0123, vget_low_f32(vi0x3), 0);
243         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk01c2x0123, vget_low_f32(vi2x3), 0);
244 
245         const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);
246 
247         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk11c2x0123, vget_high_f32(vi1x1), 0);
248         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk11c2x0123, vget_high_f32(vi3x1), 0);
249 
250         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk11c2x0123, vget_low_f32(vi1x3), 0);
251         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk11c2x0123, vget_low_f32(vi3x3), 0);
252 
253         const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);
254 
255         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0);
256         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk21c2x0123, vget_high_f32(vi4x1), 0);
257 
258         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk21c2x0123, vget_low_f32(vi2x3), 0);
259         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk21c2x0123, vget_low_f32(vi4x3), 0);
260 
261         const float32x4_t vk02c0x0123 = vld1q_f32(w + 76);
262 
263         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk02c0x0123, vget_high_f32(vi0x1), 1);
264         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk02c0x0123, vget_high_f32(vi2x1), 1);
265 
266         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk02c0x0123, vget_low_f32(vi0x3), 1);
267         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk02c0x0123, vget_low_f32(vi2x3), 1);
268 
269         const float32x4_t vk12c0x0123 = vld1q_f32(w + 80);
270 
271         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk12c0x0123, vget_high_f32(vi1x1), 1);
272         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk12c0x0123, vget_high_f32(vi3x1), 1);
273 
274         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk12c0x0123, vget_low_f32(vi1x3), 1);
275         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk12c0x0123, vget_low_f32(vi3x3), 1);
276 
277         const float32x4_t vk22c0x0123 = vld1q_f32(w + 84);
278 
279         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk22c0x0123, vget_high_f32(vi2x1), 1);
280         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk22c0x0123, vget_high_f32(vi4x1), 1);
281 
282         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk22c0x0123, vget_low_f32(vi2x3), 1);
283         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk22c0x0123, vget_low_f32(vi4x3), 1);
284 
285         const float32x4_t vk02c1x0123 = vld1q_f32(w + 88);
286 
287         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk02c1x0123, vget_low_f32(vi0x2), 0);
288         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk02c1x0123, vget_low_f32(vi2x2), 0);
289 
290         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk02c1x0123, vget_high_f32(vi0x3), 0);
291         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk02c1x0123, vget_high_f32(vi2x3), 0);
292 
293         const float32x4_t vk12c1x0123 = vld1q_f32(w + 92);
294 
295         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk12c1x0123, vget_low_f32(vi1x2), 0);
296         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk12c1x0123, vget_low_f32(vi3x2), 0);
297 
298         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk12c1x0123, vget_high_f32(vi1x3), 0);
299         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk12c1x0123, vget_high_f32(vi3x3), 0);
300 
301         const float32x4_t vk22c1x0123 = vld1q_f32(w + 96);
302 
303         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk22c1x0123, vget_low_f32(vi2x2), 0);
304         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk22c1x0123, vget_low_f32(vi4x2), 0);
305 
306         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk22c1x0123, vget_high_f32(vi2x3), 0);
307         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk22c1x0123, vget_high_f32(vi4x3), 0);
308 
309         const float32x4_t vk02c2x0123 = vld1q_f32(w + 100);
310 
311         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk02c2x0123, vget_low_f32(vi0x2), 1);
312         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk02c2x0123, vget_low_f32(vi2x2), 1);
313 
314         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk02c2x0123, vget_high_f32(vi0x3), 1);
315         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk02c2x0123, vget_high_f32(vi2x3), 1);
316 
317         const float32x4_t vk12c2x0123 = vld1q_f32(w + 104);
318 
319         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk12c2x0123, vget_low_f32(vi1x2), 1);
320         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk12c2x0123, vget_low_f32(vi3x2), 1);
321 
322         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk12c2x0123, vget_high_f32(vi1x3), 1);
323         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk12c2x0123, vget_high_f32(vi3x3), 1);
324 
325         const float32x4_t vk22c2x0123 = vld1q_f32(w + 108);
326 
327         vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk22c2x0123, vget_low_f32(vi2x2), 1);
328         vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk22c2x0123, vget_low_f32(vi4x2), 1);
329 
330         vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk22c2x0123, vget_high_f32(vi2x3), 1);
331         vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk22c2x0123, vget_high_f32(vi4x3), 1);
332 
333         vi0x0 = vi0x3;
334         vi1x0 = vi1x3;
335         vi2x0 = vi2x3;
336         vi3x0 = vi3x3;
337         vi4x0 = vi4x3;
338 
339         const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
340         const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
341 
342         vo0x0c0123 = vmaxq_f32(vo0x0c0123, vmin);
343         vo1x0c0123 = vmaxq_f32(vo1x0c0123, vmin);
344 
345         vo0x1c0123 = vmaxq_f32(vo0x1c0123, vmin);
346         vo1x1c0123 = vmaxq_f32(vo1x1c0123, vmin);
347 
348         vo0x0c0123 = vminq_f32(vo0x0c0123, vmax);
349         vo1x0c0123 = vminq_f32(vo1x0c0123, vmax);
350 
351         vo0x1c0123 = vminq_f32(vo0x1c0123, vmax);
352         vo1x1c0123 = vminq_f32(vo1x1c0123, vmax);
353 
354         if XNN_LIKELY(c >= 4) {
355           vst1q_f32(o1, vo1x0c0123);
356           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
357           vst1q_f32(o0, vo0x0c0123);
358           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
359 
360           vst1q_f32(o1, vo1x1c0123);
361           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
362           vst1q_f32(o0, vo0x1c0123);
363           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
364         } else {
365           float* o0_tmp = o0;
366           float* o1_tmp = o1;
367           float32x2_t vo0x0c01 = vget_low_f32(vo0x0c0123);
368           float32x2_t vo1x0c01 = vget_low_f32(vo1x0c0123);
369           float32x2_t vo0x1c01 = vget_low_f32(vo0x1c0123);
370           float32x2_t vo1x1c01 = vget_low_f32(vo1x1c0123);
371           if (c & 2) {
372             vst1_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c01);
373             vo1x1c01 = vget_high_f32(vo1x1c0123);
374             vst1_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c01);
375             vo0x1c01 = vget_high_f32(vo0x1c0123);
376 
377             vst1_f32(o1_tmp, vo1x0c01); o1_tmp += 2;
378             vo1x0c01 = vget_high_f32(vo1x0c0123);
379             vst1_f32(o0_tmp, vo0x0c01); o0_tmp += 2;
380             vo0x0c01 = vget_high_f32(vo0x0c0123);
381           }
382           if (c & 1) {
383             vst1_lane_f32(o1_tmp, vo1x0c01, 0);
384             vst1_lane_f32(o0_tmp, vo0x0c01, 0);
385 
386             vst1_lane_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c01, 0);
387             vst1_lane_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c01, 0);
388           }
389 
390           o0 = (float*) ((uintptr_t) o0 + output_width_stride * 2);
391           o1 = (float*) ((uintptr_t) o1 + output_width_stride * 2);
392         }
393       }
394       assert(iw < 4);
395       if XNN_UNLIKELY(iw & 2) {
396         float32x4_t vo0c0123 = vld1q_f32(w);
397         float32x4_t vo1c0123 = vo0c0123;
398 
399         const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);
400 
401         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 1);
402         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 1);
403 
404         const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);
405 
406         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 1);
407         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 1);
408 
409         const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);
410 
411         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1);
412         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 1);
413 
414         const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);
415 
416         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c1x0123, vget_high_f32(vi0x0), 0);
417         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c1x0123, vget_high_f32(vi2x0), 0);
418 
419         const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);
420 
421         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c1x0123, vget_high_f32(vi1x0), 0);
422         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c1x0123, vget_high_f32(vi3x0), 0);
423 
424         const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);
425 
426         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0);
427         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c1x0123, vget_high_f32(vi4x0), 0);
428 
429         const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);
430 
431         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 1);
432         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 1);
433 
434         const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);
435 
436         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 1);
437         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 1);
438 
439         const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);
440 
441         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 1);
442         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 1);
443 
444         // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
445         const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
446         const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
447         const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
448         const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
449         const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;
450 
451         const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);
452 
453         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c0x0123, vget_low_f32(vi0x1), 0);
454         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c0x0123, vget_low_f32(vi2x1), 0);
455 
456         const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);
457 
458         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c0x0123, vget_low_f32(vi1x1), 0);
459         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c0x0123, vget_low_f32(vi3x1), 0);
460 
461         const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);
462 
463         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0);
464         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c0x0123, vget_low_f32(vi4x1), 0);
465 
466         const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);
467 
468         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 1);
469         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 1);
470 
471         const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);
472 
473         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 1);
474         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 1);
475 
476         const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);
477 
478         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1);
479         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 1);
480 
481         const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);
482 
483         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c2x0123, vget_high_f32(vi0x1), 0);
484         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c2x0123, vget_high_f32(vi2x1), 0);
485 
486         const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);
487 
488         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c2x0123, vget_high_f32(vi1x1), 0);
489         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c2x0123, vget_high_f32(vi3x1), 0);
490 
491         const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);
492 
493         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0);
494         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c2x0123, vget_high_f32(vi4x1), 0);
495 
496         const float32x4_t vk02c0x0123 = vld1q_f32(w + 76);
497 
498         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk02c0x0123, vget_high_f32(vi0x1), 1);
499         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 1);
500 
501         const float32x4_t vk12c0x0123 = vld1q_f32(w + 80);
502 
503         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk12c0x0123, vget_high_f32(vi1x1), 1);
504         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk12c0x0123, vget_high_f32(vi3x1), 1);
505 
506         const float32x4_t vk22c0x0123 = vld1q_f32(w + 84);
507 
508         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk22c0x0123, vget_high_f32(vi2x1), 1);
509         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk22c0x0123, vget_high_f32(vi4x1), 1);
510 
511         // viMx2 = ( iM2c2, iM2c1 )
512         const float32x2_t vi0x2 = vld1_f32(i0); i0 += 2;
513         const float32x2_t vi1x2 = vld1_f32(i1); i1 += 2;
514         const float32x2_t vi2x2 = vld1_f32(i2); i2 += 2;
515         const float32x2_t vi3x2 = vld1_f32(i3); i3 += 2;
516         const float32x2_t vi4x2 = vld1_f32(i4); i4 += 2;
517 
518         const float32x4_t vk02c1x0123 = vld1q_f32(w + 88);
519 
520         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk02c1x0123, vi0x2, 0);
521         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk02c1x0123, vi2x2, 0);
522 
523         const float32x4_t vk12c1x0123 = vld1q_f32(w + 92);
524 
525         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk12c1x0123, vi1x2, 0);
526         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk12c1x0123, vi3x2, 0);
527 
528         const float32x4_t vk22c1x0123 = vld1q_f32(w + 96);
529 
530         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk22c1x0123, vi2x2, 0);
531         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk22c1x0123, vi4x2, 0);
532 
533         const float32x4_t vk02c2x0123 = vld1q_f32(w + 100);
534 
535         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk02c2x0123, vi0x2, 1);
536         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk02c2x0123, vi2x2, 1);
537 
538         const float32x4_t vk12c2x0123 = vld1q_f32(w + 104);
539 
540         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk12c2x0123, vi1x2, 1);
541         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk12c2x0123, vi3x2, 1);
542 
543         const float32x4_t vk22c2x0123 = vld1q_f32(w + 108);
544 
545         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk22c2x0123, vi2x2, 1);
546         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk22c2x0123, vi4x2, 1);
547 
548         vi0x0 = vcombine_f32(vget_high_f32(vi0x1), vi0x2);
549         vi1x0 = vcombine_f32(vget_high_f32(vi1x1), vi1x2);
550         vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2);
551         vi3x0 = vcombine_f32(vget_high_f32(vi3x1), vi3x2);
552         vi4x0 = vcombine_f32(vget_high_f32(vi4x1), vi4x2);
553 
554         const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
555         const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
556 
557         vo0c0123 = vmaxq_f32(vo0c0123, vmin);
558         vo1c0123 = vmaxq_f32(vo1c0123, vmin);
559 
560         vo0c0123 = vminq_f32(vo0c0123, vmax);
561         vo1c0123 = vminq_f32(vo1c0123, vmax);
562 
563         if XNN_LIKELY(c >= 4) {
564           vst1q_f32(o1, vo1c0123);
565           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
566           vst1q_f32(o0, vo0c0123);
567           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
568         } else {
569           float* o0_tmp = o0;
570           float* o1_tmp = o1;
571           float32x2_t vo0c01 = vget_low_f32(vo0c0123);
572           float32x2_t vo1c01 = vget_low_f32(vo1c0123);
573           if (c & 2) {
574             vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
575             vo1c01 = vget_high_f32(vo1c0123);
576             vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
577             vo0c01 = vget_high_f32(vo0c0123);
578           }
579           if (c & 1) {
580             vst1_lane_f32(o1_tmp, vo1c01, 0);
581             vst1_lane_f32(o0_tmp, vo0c01, 0);
582           }
583 
584           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
585           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
586         }
587       }
588       if XNN_UNLIKELY(iw & 1) {
589         float32x4_t vo0c0123 = vld1q_f32(w);
590         float32x4_t vo1c0123 = vo0c0123;
591 
592         const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);
593 
594         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 1);
595         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 1);
596 
597         const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);
598 
599         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 1);
600         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 1);
601 
602         const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);
603 
604         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1);
605         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 1);
606 
607         const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);
608 
609         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c1x0123, vget_high_f32(vi0x0), 0);
610         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c1x0123, vget_high_f32(vi2x0), 0);
611 
612         const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);
613 
614         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c1x0123, vget_high_f32(vi1x0), 0);
615         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c1x0123, vget_high_f32(vi3x0), 0);
616 
617         const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);
618 
619         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0);
620         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c1x0123, vget_high_f32(vi4x0), 0);
621 
622         const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);
623 
624         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 1);
625         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 1);
626 
627         const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);
628 
629         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 1);
630         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 1);
631 
632         const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);
633 
634         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 1);
635         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 1);
636 
637         // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
638         const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 3;
639         const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 3;
640         const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 3;
641         const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 3;
642         const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 3;
643 
644         const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);
645 
646         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c0x0123, vget_low_f32(vi0x1), 0);
647         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c0x0123, vget_low_f32(vi2x1), 0);
648 
649         const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);
650 
651         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c0x0123, vget_low_f32(vi1x1), 0);
652         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c0x0123, vget_low_f32(vi3x1), 0);
653 
654         const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);
655 
656         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0);
657         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c0x0123, vget_low_f32(vi4x1), 0);
658 
659         const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);
660 
661         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 1);
662         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 1);
663 
664         const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);
665 
666         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 1);
667         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 1);
668 
669         const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);
670 
671         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1);
672         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 1);
673 
674         const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);
675 
676         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c2x0123, vget_high_f32(vi0x1), 0);
677         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c2x0123, vget_high_f32(vi2x1), 0);
678 
679         const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);
680 
681         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c2x0123, vget_high_f32(vi1x1), 0);
682         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c2x0123, vget_high_f32(vi3x1), 0);
683 
684         const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);
685 
686         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0);
687         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c2x0123, vget_high_f32(vi4x1), 0);
688 
689         const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
690         const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
691 
692         vo0c0123 = vmaxq_f32(vo0c0123, vmin);
693         vo1c0123 = vmaxq_f32(vo1c0123, vmin);
694 
695         vo0c0123 = vminq_f32(vo0c0123, vmax);
696         vo1c0123 = vminq_f32(vo1c0123, vmax);
697 
698         if XNN_LIKELY(c >= 4) {
699           vst1q_f32(o1, vo1c0123);
700           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
701           vst1q_f32(o0, vo0c0123);
702           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
703         } else {
704           float* o0_tmp = o0;
705           float* o1_tmp = o1;
706           float32x2_t vo0c01 = vget_low_f32(vo0c0123);
707           float32x2_t vo1c01 = vget_low_f32(vo1c0123);
708           if (c & 2) {
709             vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
710             vo1c01 = vget_high_f32(vo1c0123);
711             vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
712             vo0c01 = vget_high_f32(vo0c0123);
713           }
714           if (c & 1) {
715             vst1_lane_f32(o1_tmp, vo1c01, 0);
716             vst1_lane_f32(o0_tmp, vo0c01, 0);
717           }
718           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
719           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
720         }
721       }
722       // Move output pointers back to the position of the first pixel in a row,
723       // and forward to the next block of output channels
724       o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
725       o1 = (float*) ((uintptr_t) o1 - output_channel_decrement);
726       // Revert input pointers to the position of the first pixel in a row
727       i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
728       i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
729       i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
730       i3 = (const float*) ((uintptr_t) i3 - input_width_decrement);
731       i4 = (const float*) ((uintptr_t) i4 - input_width_decrement);
732       // Move to the block of weights for the next 4 output channels
733       w += 112;
734       c = doz(c, 4);
735     } while (c != 0);
736     // Move output pointers back to the position of the first channel, and forward to the next block of rows
737     o0 = (float*) ((uintptr_t) o0 + output_height_increment);
738     o1 = (float*) ((uintptr_t) o1 + output_height_increment);
739     // Move input pointers forward to the next four rows
740     i0 = i4;
741     i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
742     i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
743     i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
744     i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
745   }
746 }
747