1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-conv-hwc/3x3s2p0p1c3-neon-x2.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 
11 #include <assert.h>
12 
13 #include <arm_neon.h>
14 
15 #include <xnnpack/conv.h>
16 #include <xnnpack/math.h>
17 
18 
19 void xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2(
20     size_t input_height,
21     size_t input_width,
22     size_t output_y_start,
23     size_t output_y_end,
24     const float* input,
25     const float* zero,
26     const float* weights,
27     float* output,
28     size_t input_padding_top,
29     size_t output_channels,
30     size_t output_height_stride,
31     size_t output_width_stride,
32     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
33 {
34   assert(input_width != 0);
35   assert(output_y_end > output_y_start);
36   assert(input_padding_top <= 1);
37   assert(output_channels != 0);
38 
39   const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
40   const size_t input_width_decrement = (4 + ((input_width - 1) & 1) * 2 + (round_down_po2(input_width - 1, 2) * 3 /* channels */)) * sizeof(float);
41   const size_t output_width = input_width / 2;
42   const size_t output_channel_decrement = output_width * output_width_stride - 4 * sizeof(float);
43   const size_t output_height_increment = output_height_stride * 2 - round_up_po2(output_channels, 4) * sizeof(float);
44 
45   // Adjustment for padding processed below
46   const float* i0 = (const float*) ((uintptr_t) input +
47     input_height_stride * (output_y_start * 2 /* vertical stride */ - input_padding_top));
48   const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
49   const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
50   const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
51   const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
52   float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
53   float* o1 = (float*) ((uintptr_t) o0 + output_height_stride);
54 
55   if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
56     i0 = zero;
57   }
58 
59   const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
60   const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
61 
62   for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
63     const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
64     const size_t input_y4 = input_y2 + 2;
65     if XNN_UNPREDICTABLE(input_y2 > input_height) {
66       i1 = zero;
67     }
68     if XNN_UNPREDICTABLE(input_y2 >= input_height) {
69       i2 = zero;
70     }
71     if XNN_UNPREDICTABLE(input_y4 > input_height) {
72       i3 = zero;
73     }
74     if XNN_UNPREDICTABLE(input_y4 >= input_height) {
75       i4 = zero;
76     }
77     if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
78       o1 = o0;
79     }
80 
81     const float* w = weights;
82     size_t c = output_channels;
83     do {
84       // viMx0 = ( iM1c0, iM0c2, iM0c1, iM0c0 )
85       float32x4_t vi0x0 = vld1q_f32(i0); i0 += 4;
86       float32x4_t vi1x0 = vld1q_f32(i1); i1 += 4;
87       float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4;
88       float32x4_t vi3x0 = vld1q_f32(i3); i3 += 4;
89       float32x4_t vi4x0 = vld1q_f32(i4); i4 += 4;
90 
91       size_t iw = input_width - 1;
92       for (; iw >= 4; iw -= 4) {
93         float32x4_t vo0x0c0123 = vld1q_f32(w);
94         float32x4_t vo1x0c0123 = vo0x0c0123;
95         float32x4_t vo0x1c0123 = vo0x0c0123;
96         float32x4_t vo1x1c0123 = vo0x0c0123;
97 
98         const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);
99 
100         // viMx1 = ( iM2c1, iM2c0, iM1c2, iM1c1 )
101         const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
102         const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
103         const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
104         const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
105         const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;
106 
107         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
108         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);
109 
110         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk00c0x0123, vget_high_f32(vi0x1), 0);
111         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 0);
112 
113         const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);
114 
115         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
116         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);
117 
118         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk10c0x0123, vget_high_f32(vi1x1), 0);
119         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk10c0x0123, vget_high_f32(vi3x1), 0);
120 
121         const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);
122 
123         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
124         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);
125 
126         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 0);
127         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk20c0x0123, vget_high_f32(vi4x1), 0);
128 
129         const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);
130 
131         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
132         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);
133 
134         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk00c1x0123, vget_high_f32(vi0x1), 1);
135         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c1x0123, vget_high_f32(vi2x1), 1);
136 
137         const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);
138 
139         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
140         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);
141 
142         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk10c1x0123, vget_high_f32(vi1x1), 1);
143         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk10c1x0123, vget_high_f32(vi3x1), 1);
144 
145         const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);
146 
147         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
148         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);
149 
150         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c1x0123, vget_high_f32(vi2x1), 1);
151         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk20c1x0123, vget_high_f32(vi4x1), 1);
152 
153         const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);
154 
155         // viMx2 = ( iM3c2, iM3c1, iM3c0, iM2c2 )
156         const float32x4_t vi0x2 = vld1q_f32(i0); i0 += 4;
157         const float32x4_t vi1x2 = vld1q_f32(i1); i1 += 4;
158         const float32x4_t vi2x2 = vld1q_f32(i2); i2 += 4;
159         const float32x4_t vi3x2 = vld1q_f32(i3); i3 += 4;
160         const float32x4_t vi4x2 = vld1q_f32(i4); i4 += 4;
161 
162         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
163         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);
164 
165         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk00c2x0123, vget_low_f32(vi0x2), 0);
166         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c2x0123, vget_low_f32(vi2x2), 0);
167 
168         const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);
169 
170         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
171         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);
172 
173         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk10c2x0123, vget_low_f32(vi1x2), 0);
174         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk10c2x0123, vget_low_f32(vi3x2), 0);
175 
176         const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);
177 
178         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
179         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);
180 
181         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c2x0123, vget_low_f32(vi2x2), 0);
182         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk20c2x0123, vget_low_f32(vi4x2), 0);
183 
184         const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);
185 
186         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
187         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);
188 
189         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk01c0x0123, vget_low_f32(vi0x2), 1);
190         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk01c0x0123, vget_low_f32(vi2x2), 1);
191 
192         const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);
193 
194         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
195         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);
196 
197         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk11c0x0123, vget_low_f32(vi1x2), 1);
198         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk11c0x0123, vget_low_f32(vi3x2), 1);
199 
200         const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);
201 
202         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
203         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);
204 
205         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk21c0x0123, vget_low_f32(vi2x2), 1);
206         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk21c0x0123, vget_low_f32(vi4x2), 1);
207 
208         const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);
209 
210         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk01c1x0123, vget_low_f32(vi0x1), 0);
211         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 0);
212 
213         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk01c1x0123, vget_high_f32(vi0x2), 0);
214         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk01c1x0123, vget_high_f32(vi2x2), 0);
215 
216         const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);
217 
218         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk11c1x0123, vget_low_f32(vi1x1), 0);
219         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk11c1x0123, vget_low_f32(vi3x1), 0);
220 
221         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk11c1x0123, vget_high_f32(vi1x2), 0);
222         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk11c1x0123, vget_high_f32(vi3x2), 0);
223 
224         const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);
225 
226         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c1x0123, vget_low_f32(vi2x1), 0);
227         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk21c1x0123, vget_low_f32(vi4x1), 0);
228 
229         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk21c1x0123, vget_high_f32(vi2x2), 0);
230         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk21c1x0123, vget_high_f32(vi4x2), 0);
231 
232         const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);
233 
234         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk01c2x0123, vget_low_f32(vi0x1), 1);
235         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c2x0123, vget_low_f32(vi2x1), 1);
236 
237         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk01c2x0123, vget_high_f32(vi0x2), 1);
238         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk01c2x0123, vget_high_f32(vi2x2), 1);
239 
240         const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);
241 
242         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk11c2x0123, vget_low_f32(vi1x1), 1);
243         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk11c2x0123, vget_low_f32(vi3x1), 1);
244 
245         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk11c2x0123, vget_high_f32(vi1x2), 1);
246         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk11c2x0123, vget_high_f32(vi3x2), 1);
247 
248         const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);
249 
250         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c2x0123, vget_low_f32(vi2x1), 1);
251         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk21c2x0123, vget_low_f32(vi4x1), 1);
252 
253         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk21c2x0123, vget_high_f32(vi2x2), 1);
254         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk21c2x0123, vget_high_f32(vi4x2), 1);
255 
256         const float32x4_t vk02c0x0123 = vld1q_f32(w + 76);
257 
258         // viMx3 = ( iM5c0, iM4c2, iM4c1, iM4c0 )
259         const float32x4_t vi0x3 = vld1q_f32(i0); i0 += 4;
260         const float32x4_t vi1x3 = vld1q_f32(i1); i1 += 4;
261         const float32x4_t vi2x3 = vld1q_f32(i2); i2 += 4;
262         const float32x4_t vi3x3 = vld1q_f32(i3); i3 += 4;
263         const float32x4_t vi4x3 = vld1q_f32(i4); i4 += 4;
264 
265         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk02c0x0123, vget_high_f32(vi0x1), 0);
266         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c0x0123, vget_high_f32(vi2x1), 0);
267 
268         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk02c0x0123, vget_low_f32(vi0x3), 0);
269         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk02c0x0123, vget_low_f32(vi2x3), 0);
270 
271         const float32x4_t vk12c0x0123 = vld1q_f32(w + 80);
272 
273         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk12c0x0123, vget_high_f32(vi1x1), 0);
274         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk12c0x0123, vget_high_f32(vi3x1), 0);
275 
276         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk12c0x0123, vget_low_f32(vi1x3), 0);
277         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk12c0x0123, vget_low_f32(vi3x3), 0);
278 
279         const float32x4_t vk22c0x0123 = vld1q_f32(w + 84);
280 
281         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk22c0x0123, vget_high_f32(vi2x1), 0);
282         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk22c0x0123, vget_high_f32(vi4x1), 0);
283 
284         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk22c0x0123, vget_low_f32(vi2x3), 0);
285         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk22c0x0123, vget_low_f32(vi4x3), 0);
286 
287         const float32x4_t vk02c1x0123 = vld1q_f32(w + 88);
288 
289         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk02c1x0123, vget_high_f32(vi0x1), 1);
290         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c1x0123, vget_high_f32(vi2x1), 1);
291 
292         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk02c1x0123, vget_low_f32(vi0x3), 1);
293         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk02c1x0123, vget_low_f32(vi2x3), 1);
294 
295         const float32x4_t vk12c1x0123 = vld1q_f32(w + 92);
296 
297         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk12c1x0123, vget_high_f32(vi1x1), 1);
298         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk12c1x0123, vget_high_f32(vi3x1), 1);
299 
300         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk12c1x0123, vget_low_f32(vi1x3), 1);
301         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk12c1x0123, vget_low_f32(vi3x3), 1);
302 
303         const float32x4_t vk22c1x0123 = vld1q_f32(w + 96);
304 
305         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk22c1x0123, vget_high_f32(vi2x1), 1);
306         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk22c1x0123, vget_high_f32(vi4x1), 1);
307 
308         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk22c1x0123, vget_low_f32(vi2x3), 1);
309         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk22c1x0123, vget_low_f32(vi4x3), 1);
310 
311         const float32x4_t vk02c2x0123 = vld1q_f32(w + 100);
312 
313         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk02c2x0123, vget_low_f32(vi0x2), 0);
314         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c2x0123, vget_low_f32(vi2x2), 0);
315 
316         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk02c2x0123, vget_high_f32(vi0x3), 0);
317         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk02c2x0123, vget_high_f32(vi2x3), 0);
318 
319         const float32x4_t vk12c2x0123 = vld1q_f32(w + 104);
320 
321         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk12c2x0123, vget_low_f32(vi1x2), 0);
322         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk12c2x0123, vget_low_f32(vi3x2), 0);
323 
324         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk12c2x0123, vget_high_f32(vi1x3), 0);
325         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk12c2x0123, vget_high_f32(vi3x3), 0);
326 
327         const float32x4_t vk22c2x0123 = vld1q_f32(w + 108);
328 
329         vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk22c2x0123, vget_low_f32(vi2x2), 0);
330         vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk22c2x0123, vget_low_f32(vi4x2), 0);
331 
332         vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk22c2x0123, vget_high_f32(vi2x3), 0);
333         vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk22c2x0123, vget_high_f32(vi4x3), 0);
334 
335         vi0x0 = vi0x3;
336         vi1x0 = vi1x3;
337         vi2x0 = vi2x3;
338         vi3x0 = vi3x3;
339         vi4x0 = vi4x3;
340 
341 
342         vo0x0c0123 = vmaxq_f32(vo0x0c0123, vmin);
343         vo1x0c0123 = vmaxq_f32(vo1x0c0123, vmin);
344 
345         vo0x1c0123 = vmaxq_f32(vo0x1c0123, vmin);
346         vo1x1c0123 = vmaxq_f32(vo1x1c0123, vmin);
347 
348         vo0x0c0123 = vminq_f32(vo0x0c0123, vmax);
349         vo1x0c0123 = vminq_f32(vo1x0c0123, vmax);
350 
351         vo0x1c0123 = vminq_f32(vo0x1c0123, vmax);
352         vo1x1c0123 = vminq_f32(vo1x1c0123, vmax);
353 
354         if XNN_LIKELY(c >= 4) {
355           vst1q_f32(o1, vo1x0c0123);
356           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
357           vst1q_f32(o0, vo0x0c0123);
358           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
359 
360           vst1q_f32(o1, vo1x1c0123);
361           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
362           vst1q_f32(o0, vo0x1c0123);
363           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
364         } else {
365           float* o0_tmp = o0;
366           float* o1_tmp = o1;
367           float32x2_t vo0x0c01 = vget_low_f32(vo0x0c0123);
368           float32x2_t vo1x0c01 = vget_low_f32(vo1x0c0123);
369           float32x2_t vo0x1c01 = vget_low_f32(vo0x1c0123);
370           float32x2_t vo1x1c01 = vget_low_f32(vo1x1c0123);
371           if (c & 2) {
372             vst1_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c01);
373             vo1x1c01 = vget_high_f32(vo1x1c0123);
374             vst1_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c01);
375             vo0x1c01 = vget_high_f32(vo0x1c0123);
376 
377             vst1_f32(o1_tmp, vo1x0c01); o1_tmp += 2;
378             vo1x0c01 = vget_high_f32(vo1x0c0123);
379             vst1_f32(o0_tmp, vo0x0c01); o0_tmp += 2;
380             vo0x0c01 = vget_high_f32(vo0x0c0123);
381           }
382           if (c & 1) {
383             vst1_lane_f32(o1_tmp, vo1x0c01, 0);
384             vst1_lane_f32(o0_tmp, vo0x0c01, 0);
385 
386             vst1_lane_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c01, 0);
387             vst1_lane_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c01, 0);
388           }
389 
390           o0 = (float*) ((uintptr_t) o0 + output_width_stride * 2);
391           o1 = (float*) ((uintptr_t) o1 + output_width_stride * 2);
392         }
393       }
394       assert(iw < 4);
395       if XNN_LIKELY(iw & 2) {
396         float32x4_t vo0c0123 = vld1q_f32(w);
397         float32x4_t vo1c0123 = vo0c0123;
398 
399         const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);
400 
401         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
402         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);
403 
404         const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);
405 
406         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
407         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);
408 
409         const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);
410 
411         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
412         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);
413 
414         const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);
415 
416         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
417         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);
418 
419         const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);
420 
421         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
422         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);
423 
424         const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);
425 
426         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
427         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);
428 
429         const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);
430 
431         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
432         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);
433 
434         const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);
435 
436         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
437         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);
438 
439         const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);
440 
441         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
442         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);
443 
444         const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);
445 
446         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
447         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);
448 
449         const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);
450 
451         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
452         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);
453 
454         const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);
455 
456         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
457         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);
458 
459         const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);
460 
461         // viMx1 = ( iM2c1, iM2c0, iM1c2, iM1c1 )
462         const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
463         const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
464         const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
465         const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
466         const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;
467 
468         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 0);
469         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 0);
470 
471         const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);
472 
473         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 0);
474         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 0);
475 
476         const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);
477 
478         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 0);
479         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 0);
480 
481         const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);
482 
483         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vget_low_f32(vi0x1), 1);
484         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_low_f32(vi2x1), 1);
485 
486         const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);
487 
488         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vget_low_f32(vi1x1), 1);
489         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vget_low_f32(vi3x1), 1);
490 
491         const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);
492 
493         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vget_low_f32(vi2x1), 1);
494         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vget_low_f32(vi4x1), 1);
495 
496         const float32x4_t vk02c0x0123 = vld1q_f32(w + 76);
497 
498         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c0x0123, vget_high_f32(vi0x1), 0);
499         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 0);
500 
501         const float32x4_t vk12c0x0123 = vld1q_f32(w + 80);
502 
503         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c0x0123, vget_high_f32(vi1x1), 0);
504         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c0x0123, vget_high_f32(vi3x1), 0);
505 
506         const float32x4_t vk22c0x0123 = vld1q_f32(w + 84);
507 
508         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c0x0123, vget_high_f32(vi2x1), 0);
509         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c0x0123, vget_high_f32(vi4x1), 0);
510 
511         const float32x4_t vk02c1x0123 = vld1q_f32(w + 88);
512 
513         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c1x0123, vget_high_f32(vi0x1), 1);
514         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c1x0123, vget_high_f32(vi2x1), 1);
515 
516         const float32x4_t vk12c1x0123 = vld1q_f32(w + 92);
517 
518         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c1x0123, vget_high_f32(vi1x1), 1);
519         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c1x0123, vget_high_f32(vi3x1), 1);
520 
521         const float32x4_t vk22c1x0123 = vld1q_f32(w + 96);
522 
523         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c1x0123, vget_high_f32(vi2x1), 1);
524         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c1x0123, vget_high_f32(vi4x1), 1);
525 
526         const float32x4_t vk02c2x0123 = vld1q_f32(w + 100);
527 
528         // viMx2 = ( iM3c0, iM2c2 )
529         const float32x2_t vi0x2 = vld1_f32(i0); i0 += 2;
530         const float32x2_t vi1x2 = vld1_f32(i1); i1 += 2;
531         const float32x2_t vi2x2 = vld1_f32(i2); i2 += 2;
532         const float32x2_t vi3x2 = vld1_f32(i3); i3 += 2;
533         const float32x2_t vi4x2 = vld1_f32(i4); i4 += 2;
534 
535         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c2x0123, vi0x2, 0);
536         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c2x0123, vi2x2, 0);
537 
538         const float32x4_t vk12c2x0123 = vld1q_f32(w + 104);
539 
540         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c2x0123, vi1x2, 0);
541         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c2x0123, vi3x2, 0);
542 
543         const float32x4_t vk22c2x0123 = vld1q_f32(w + 108);
544 
545         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c2x0123, vi2x2, 0);
546         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c2x0123, vi4x2, 0);
547 
548         vi0x0 = vcombine_f32(vget_high_f32(vi0x1), vi0x2);
549         vi1x0 = vcombine_f32(vget_high_f32(vi1x1), vi1x2);
550         vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2);
551         vi3x0 = vcombine_f32(vget_high_f32(vi3x1), vi3x2);
552         vi4x0 = vcombine_f32(vget_high_f32(vi4x1), vi4x2);
553 
554 
555         vo0c0123 = vmaxq_f32(vo0c0123, vmin);
556         vo1c0123 = vmaxq_f32(vo1c0123, vmin);
557 
558         vo0c0123 = vminq_f32(vo0c0123, vmax);
559         vo1c0123 = vminq_f32(vo1c0123, vmax);
560 
561         if XNN_LIKELY(c >= 4) {
562           vst1q_f32(o1, vo1c0123);
563           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
564           vst1q_f32(o0, vo0c0123);
565           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
566         } else {
567           float* o0_tmp = o0;
568           float* o1_tmp = o1;
569           float32x2_t vo0c01 = vget_low_f32(vo0c0123);
570           float32x2_t vo1c01 = vget_low_f32(vo1c0123);
571           if (c & 2) {
572             vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
573             vo1c01 = vget_high_f32(vo1c0123);
574             vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
575             vo0c01 = vget_high_f32(vo0c0123);
576           }
577           if (c & 1) {
578             vst1_lane_f32(o1_tmp, vo1c01, 0);
579             vst1_lane_f32(o0_tmp, vo0c01, 0);
580           }
581 
582           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
583           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
584         }
585       }
586       if XNN_LIKELY(iw & 1) {
587         float32x4_t vo0c0123 = vld1q_f32(w);
588         float32x4_t vo1c0123 = vo0c0123;
589 
590         const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);
591 
592         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
593         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);
594 
595         const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);
596 
597         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
598         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);
599 
600         const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);
601 
602         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
603         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);
604 
605         const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);
606 
607         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
608         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);
609 
610         const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);
611 
612         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
613         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);
614 
615         const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);
616 
617         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
618         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);
619 
620         const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);
621 
622         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
623         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);
624 
625         const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);
626 
627         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
628         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);
629 
630         const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);
631 
632         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
633         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);
634 
635         const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);
636 
637         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
638         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);
639 
640         const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);
641 
642         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
643         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);
644 
645         const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);
646 
647         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
648         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);
649 
650         const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);
651 
652         // viMx1 = ( iM1c2, iM1c1 )
653         const float32x2_t vi0x1 = vld1_f32(i0); i0 += 2;
654         const float32x2_t vi1x1 = vld1_f32(i1); i1 += 2;
655         const float32x2_t vi2x1 = vld1_f32(i2); i2 += 2;
656         const float32x2_t vi3x1 = vld1_f32(i3); i3 += 2;
657         const float32x2_t vi4x1 = vld1_f32(i4); i4 += 2;
658 
659         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vi0x1, 0);
660         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vi2x1, 0);
661 
662         const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);
663 
664         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vi1x1, 0);
665         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vi3x1, 0);
666 
667         const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);
668 
669         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vi2x1, 0);
670         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vi4x1, 0);
671 
672         const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);
673 
674         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vi0x1, 1);
675         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vi2x1, 1);
676 
677         const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);
678 
679         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vi1x1, 1);
680         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vi3x1, 1);
681 
682         const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);
683 
684         vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vi2x1, 1);
685         vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vi4x1, 1);
686 
687 
688         vo0c0123 = vmaxq_f32(vo0c0123, vmin);
689         vo1c0123 = vmaxq_f32(vo1c0123, vmin);
690 
691         vo0c0123 = vminq_f32(vo0c0123, vmax);
692         vo1c0123 = vminq_f32(vo1c0123, vmax);
693 
694         if XNN_LIKELY(c >= 4) {
695           vst1q_f32(o1, vo1c0123);
696           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
697           vst1q_f32(o0, vo0c0123);
698           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
699         } else {
700           float* o0_tmp = o0;
701           float* o1_tmp = o1;
702           float32x2_t vo0c01 = vget_low_f32(vo0c0123);
703           float32x2_t vo1c01 = vget_low_f32(vo1c0123);
704           if (c & 2) {
705             vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
706             vo1c01 = vget_high_f32(vo1c0123);
707             vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
708             vo0c01 = vget_high_f32(vo0c0123);
709           }
710           if (c & 1) {
711             vst1_lane_f32(o1_tmp, vo1c01, 0);
712             vst1_lane_f32(o0_tmp, vo0c01, 0);
713           }
714           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
715           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
716         }
717       }
718       // Move output pointers back to the position of the first pixel in a row,
719       // and forward to the next block of output channels
720       o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
721       o1 = (float*) ((uintptr_t) o1 - output_channel_decrement);
722       // Revert input pointers to the position of the first pixel in a row
723       i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
724       i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
725       i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
726       i3 = (const float*) ((uintptr_t) i3 - input_width_decrement);
727       i4 = (const float*) ((uintptr_t) i4 - input_width_decrement);
728       // Move to the block of weights for the next 4 output channels
729       w += 112;
730       c = doz(c, 4);
731     } while (c != 0);
732     // Move output pointers back to the position of the first channel, and forward to the next block of rows
733     o0 = (float*) ((uintptr_t) o0 + output_height_increment);
734     o1 = (float*) ((uintptr_t) o1 + output_height_increment);
735     // Move input pointers forward to the next four rows
736     i0 = i4;
737     i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
738     i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
739     i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
740     i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
741   }
742 }
743