// Auto-generated file. Do not edit!
//   Template: src/f32-conv-hwc/3x3s2p1c3-neon-x1.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/conv.h>
#include <xnnpack/math.h>

void xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1(
    size_t input_height,
    size_t input_width,
    size_t output_y_start,
    size_t output_y_end,
    const float* input,
    const float* zero,
    const float* weights,
    float* output,
    size_t input_padding_top,
    size_t output_channels,
    size_t output_height_stride,
    size_t output_width_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_width != 0);
  assert(output_y_end > output_y_start);
  assert(input_padding_top <= 1);
  assert(output_channels != 0);

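  // The strides below are in bytes. An input row spans input_width pixels x 3
  // channels. With horizontal stride 2 and one column of left padding, an
  // output row spans (input_width + 1) / 2 pixels. After one row of a
  // 4-channel output block, output_channel_decrement rewinds to the first
  // pixel of the row while advancing 4 channels (4 floats), and
  // output_height_increment advances two rows while undoing the accumulated
  // per-block channel advances.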
  const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
  const size_t input_width_decrement = input_width * 3 /* channels */ * sizeof(float);
  const size_t output_width = (input_width + 1) / 2;
  const size_t output_channel_decrement = output_width * output_width_stride - 4 * sizeof(float);
  const size_t output_height_increment = output_height_stride * 2 - round_up_po2(output_channels, 4) * sizeof(float);

  // Adjustment for padding processed below
  const float* i0 = (const float*) ((uintptr_t) input +
    input_height_stride * (output_y_start * 2 /* vertical stride */ - input_padding_top));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
  const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
  const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
  float* o1 = (float*) ((uintptr_t) o0 + output_height_stride);
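  // Each iteration of the outer loop below produces two output rows. With a
  // 3x3 kernel and vertical stride 2, these read from five consecutive input
  // rows (i0-i4); rows that fall into the top or bottom padding are redirected
  // to the caller-provided zero row.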

  if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
    i0 = zero;
  }

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

  for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
    const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
    const size_t input_y4 = input_y2 + 2;
    if XNN_UNPREDICTABLE(input_y2 > input_height) {
      i1 = zero;
    }
    if XNN_UNPREDICTABLE(input_y2 >= input_height) {
      i2 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 > input_height) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 >= input_height) {
      i4 = zero;
    }
    if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
      o1 = o0;
    }

    const float* w = weights;
    size_t c = output_channels;
    do {
      // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
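      // The x0 registers carry the rightmost pixel consumed by the previous
      // loop iteration; starting them at zero implements the single column of
      // left padding.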
      float32x4_t vi0x0 = vmovq_n_f32(0.0f);
      float32x4_t vi1x0 = vmovq_n_f32(0.0f);
      float32x4_t vi2x0 = vmovq_n_f32(0.0f);
      float32x4_t vi3x0 = vmovq_n_f32(0.0f);
      float32x4_t vi4x0 = vmovq_n_f32(0.0f);

      size_t iw = input_width;
      for (; iw >= 2; iw -= 2) {
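        // w[0..3] is the bias of the current 4 output channels; both output
        // rows accumulate on top of the same bias vector.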
        float32x4_t vo0c0123 = vld1q_f32(w);
        float32x4_t vo1c0123 = vo0c0123;

        const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 1);

        const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 1);

        const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 1);

        const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_high_f32(vi0x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_high_f32(vi2x0), 0);

        const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_high_f32(vi1x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_high_f32(vi3x0), 0);

        const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_high_f32(vi4x0), 0);

        const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 1);

        const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 1);

        const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 1);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;

        const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_low_f32(vi0x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_low_f32(vi2x1), 0);

        const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_low_f32(vi1x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_low_f32(vi3x1), 0);

        const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_low_f32(vi4x1), 0);

        const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 1);

        const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 1);

        const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 1);

        const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vget_high_f32(vi0x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_high_f32(vi2x1), 0);

        const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vget_high_f32(vi1x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vget_high_f32(vi3x1), 0);

        const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vget_high_f32(vi4x1), 0);

        const float32x4_t vk02c0x0123 = vld1q_f32(w + 76);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c0x0123, vget_high_f32(vi0x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 1);

        const float32x4_t vk12c0x0123 = vld1q_f32(w + 80);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c0x0123, vget_high_f32(vi1x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c0x0123, vget_high_f32(vi3x1), 1);

        const float32x4_t vk22c0x0123 = vld1q_f32(w + 84);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c0x0123, vget_high_f32(vi2x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c0x0123, vget_high_f32(vi4x1), 1);

        // viMx2 = ( iM2c2, iM2c1 )
        const float32x2_t vi0x2 = vld1_f32(i0); i0 += 2;
        const float32x2_t vi1x2 = vld1_f32(i1); i1 += 2;
        const float32x2_t vi2x2 = vld1_f32(i2); i2 += 2;
        const float32x2_t vi3x2 = vld1_f32(i3); i3 += 2;
        const float32x2_t vi4x2 = vld1_f32(i4); i4 += 2;

        const float32x4_t vk02c1x0123 = vld1q_f32(w + 88);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c1x0123, vi0x2, 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c1x0123, vi2x2, 0);

        const float32x4_t vk12c1x0123 = vld1q_f32(w + 92);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c1x0123, vi1x2, 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c1x0123, vi3x2, 0);

        const float32x4_t vk22c1x0123 = vld1q_f32(w + 96);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c1x0123, vi2x2, 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c1x0123, vi4x2, 0);

        const float32x4_t vk02c2x0123 = vld1q_f32(w + 100);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c2x0123, vi0x2, 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c2x0123, vi2x2, 1);

        const float32x4_t vk12c2x0123 = vld1q_f32(w + 104);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c2x0123, vi1x2, 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c2x0123, vi3x2, 1);

        const float32x4_t vk22c2x0123 = vld1q_f32(w + 108);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c2x0123, vi2x2, 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c2x0123, vi4x2, 1);

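        // Slide the window: the rightmost pixel just consumed (iM2, channels
        // c0-c2 in lanes 1-3) becomes the x0 register of the next iteration.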
        vi0x0 = vcombine_f32(vget_high_f32(vi0x1), vi0x2);
        vi1x0 = vcombine_f32(vget_high_f32(vi1x1), vi1x2);
        vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2);
        vi3x0 = vcombine_f32(vget_high_f32(vi3x1), vi3x2);
        vi4x0 = vcombine_f32(vget_high_f32(vi4x1), vi4x2);


        vo0c0123 = vmaxq_f32(vo0c0123, vmin);
        vo1c0123 = vmaxq_f32(vo1c0123, vmin);

        vo0c0123 = vminq_f32(vo0c0123, vmax);
        vo1c0123 = vminq_f32(vo1c0123, vmax);

        if XNN_LIKELY(c >= 4) {
          vst1q_f32(o1, vo1c0123);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
          vst1q_f32(o0, vo0c0123);
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
        } else {
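          // Fewer than 4 output channels remain in the last block: store the
          // accumulators two lanes and/or one lane at a time so that nothing
          // is written beyond output_channels.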
          float* o0_tmp = o0;
          float* o1_tmp = o1;
          float32x2_t vo0c01 = vget_low_f32(vo0c0123);
          float32x2_t vo1c01 = vget_low_f32(vo1c0123);
          if (c & 2) {
            vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
            vo1c01 = vget_high_f32(vo1c0123);
            vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
            vo0c01 = vget_high_f32(vo0c0123);
          }
          if (c & 1) {
            vst1_lane_f32(o1_tmp, vo1c01, 0);
            vst1_lane_f32(o0_tmp, vo0c01, 0);
          }

          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
        }
      }
      assert(iw < 2);
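      // Odd input width: the last output pixel sees only kernel columns 0 and
      // 1, since column 2 falls into the right padding; its taps are skipped
      // entirely below.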
      if XNN_UNLIKELY(iw & 1) {
        float32x4_t vo0c0123 = vld1q_f32(w);
        float32x4_t vo1c0123 = vo0c0123;

        const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 1);

        const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 1);

        const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 1);

        const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_high_f32(vi0x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_high_f32(vi2x0), 0);

        const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_high_f32(vi1x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_high_f32(vi3x0), 0);

        const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_high_f32(vi4x0), 0);

        const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 1);

        const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 1);

        const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 1);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 3;
        const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 3;
        const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 3;
        const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 3;
        const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 3;

        const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_low_f32(vi0x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_low_f32(vi2x1), 0);

        const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_low_f32(vi1x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_low_f32(vi3x1), 0);

        const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_low_f32(vi4x1), 0);

        const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 1);

        const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 1);

        const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 1);

        const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vget_high_f32(vi0x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_high_f32(vi2x1), 0);

        const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vget_high_f32(vi1x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vget_high_f32(vi3x1), 0);

        const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vget_high_f32(vi4x1), 0);


        vo0c0123 = vmaxq_f32(vo0c0123, vmin);
        vo1c0123 = vmaxq_f32(vo1c0123, vmin);

        vo0c0123 = vminq_f32(vo0c0123, vmax);
        vo1c0123 = vminq_f32(vo1c0123, vmax);

        if XNN_LIKELY(c >= 4) {
          vst1q_f32(o1, vo1c0123);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
          vst1q_f32(o0, vo0c0123);
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
        } else {
          float* o0_tmp = o0;
          float* o1_tmp = o1;
          float32x2_t vo0c01 = vget_low_f32(vo0c0123);
          float32x2_t vo1c01 = vget_low_f32(vo1c0123);
          if (c & 2) {
            vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
            vo1c01 = vget_high_f32(vo1c0123);
            vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
            vo0c01 = vget_high_f32(vo0c0123);
          }
          if (c & 1) {
            vst1_lane_f32(o1_tmp, vo1c01, 0);
            vst1_lane_f32(o0_tmp, vo0c01, 0);
          }
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
        }
      }
      // Move output pointers back to the position of the first pixel in a row,
      // and forward to the next block of output channels
      o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
      o1 = (float*) ((uintptr_t) o1 - output_channel_decrement);
      // Revert input pointers to the position of the first pixel in a row
      i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
      i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
      i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
      i3 = (const float*) ((uintptr_t) i3 - input_width_decrement);
      i4 = (const float*) ((uintptr_t) i4 - input_width_decrement);
      // Move to the block of weights for the next 4 output channels
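      // Each block is 112 floats: 4 bias values plus 3x3 taps x 3 input
      // channels x 4 output channels (108 filter values).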
      w += 112;
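      // doz() is a saturating (decrement-or-zero) subtraction, so a final
      // partial block of 1-3 channels drives c exactly to zero.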
      c = doz(c, 4);
    } while (c != 0);
    // Move output pointers back to the position of the first channel, and forward to the next block of rows
    o0 = (float*) ((uintptr_t) o0 + output_height_increment);
    o1 = (float*) ((uintptr_t) o1 + output_height_increment);
    // Move input pointers forward to the next four rows
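    // The bottom row of the current five-row window (i4) doubles as the top
    // row of the next one: consecutive window positions overlap by one row.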
    i0 = i4;
    i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
    i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
    i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
    i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  }
}
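
// Illustrative usage (a minimal sketch, not part of the generated kernel):
// the shapes, buffer sizes, and padding below are assumptions chosen for the
// example. In practice XNNPACK's conv2d operator selects this ukernel and
// prepares the packed weights, the zero row, and the tail-padded buffers
// itself, so the kernel is not normally called directly.
//
//   // 5x5 RGB input, 3x3 stride-2 convolution, padding 1 -> 3x3 output.
//   const size_t input_height = 5, input_width = 5, output_channels = 8;
//   const size_t output_width = (input_width + 1) / 2;  // = 3
//   const size_t output_width_stride =
//       round_up_po2(output_channels, 4) * sizeof(float);  // bytes per pixel
//   const size_t output_height_stride = output_width * output_width_stride;
//   float input[5 * 5 * 3 + 4];    // HWC, with slack for vector over-reads
//   float zero[5 * 3 + 4];         // one zero-filled input row for padding
//   float weights[2 * 112];        // two packed blocks of 4 output channels
//   float output[3 * 3 * 8];
//   union xnn_f32_minmax_params params;  // min/max set by the init function
//   xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1(
//       input_height, input_width,
//       /*output_y_start=*/0, /*output_y_end=*/3,
//       input, zero, weights, output,
//       /*input_padding_top=*/1, output_channels,
//       output_height_stride, output_width_stride, &params);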