// Auto-generated file. Do not edit!
//   Template: src/f32-conv-hwc/3x3s2p1c3-neon-x2.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/conv.h>
#include <xnnpack/math.h>

// Direct 3x3 stride-2 pad-1 convolution over HWC-layout f32 input with 3 input
// channels, producing output channels in groups of 4 (c3x4), processing a
// 2x2 tile of output pixels per inner iteration (2 output rows x 2 output
// columns) using NEON FMA intrinsics.
//
// Arguments:
//   input_height/input_width  - input spatial dimensions (pixels).
//   output_y_start/output_y_end - half-open range of output rows to compute.
//   input  - input pixels, HWC layout, 3 channels.
//   zero   - pointer to a zero-initialized row used for rows in the padding.
//   weights - packed weights: per group of 4 output channels, 4 bias floats
//             followed by 108 kernel floats (3x3x3x4), 112 floats total.
//   output - output pixels, strided by output_height_stride per row and
//            output_width_stride per pixel.
//   input_padding_top - implicit zero padding above the input (0 or 1).
//   output_channels   - number of output channels (handled 4 at a time).
//   params - scalar min/max clamping parameters.
void xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2(
    size_t input_height,
    size_t input_width,
    size_t output_y_start,
    size_t output_y_end,
    const float* input,
    const float* zero,
    const float* weights,
    float* output,
    size_t input_padding_top,
    size_t output_channels,
    size_t output_height_stride,
    size_t output_width_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_width != 0);
  assert(output_y_end > output_y_start);
  assert(input_padding_top <= 1);
  assert(output_channels != 0);

  const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
  const size_t input_width_decrement = input_width * 3 /* channels */ * sizeof(float);
  const size_t output_width = (input_width + 1) / 2;
  const size_t output_channel_decrement = output_width * output_width_stride - 4 * sizeof(float);
  const size_t output_height_increment = output_height_stride * 2 - round_up_po2(output_channels, 4) * sizeof(float);

  // Adjustment for padding processed below
  const float* i0 = (const float*) ((uintptr_t) input +
    input_height_stride * (output_y_start * 2 /* vertical stride */ - input_padding_top));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
  const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
  const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
  float* o1 = (float*) ((uintptr_t) o0 + output_height_stride);

  if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
    i0 = zero;
  }

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

  for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
    const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
    const size_t input_y4 = input_y2 + 2;
    if XNN_UNPREDICTABLE(input_y2 > input_height) {
      i1 = zero;
    }
    if XNN_UNPREDICTABLE(input_y2 >= input_height) {
      i2 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 > input_height) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 >= input_height) {
      i4 = zero;
    }
    if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
      // The last output row is processed alone: direct o1 stores at o0's row.
      o1 = o0;
    }

    const float* w = weights;
    size_t c = output_channels;
    do {
      // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
      float32x4_t vi0x0 = vmovq_n_f32(0.0f);
      float32x4_t vi1x0 = vmovq_n_f32(0.0f);
      float32x4_t vi2x0 = vmovq_n_f32(0.0f);
      float32x4_t vi3x0 = vmovq_n_f32(0.0f);
      float32x4_t vi4x0 = vmovq_n_f32(0.0f);

      size_t iw = input_width;
      for (; iw >= 4; iw -= 4) {
        // Initialize accumulators with bias (first 4 floats of the weights).
        float32x4_t vo0x0c0123 = vld1q_f32(w);
        float32x4_t vo1x0c0123 = vo0x0c0123;
        float32x4_t vo0x1c0123 = vo0x0c0123;
        float32x4_t vo1x1c0123 = vo0x0c0123;

        const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk00c0x0123, vget_low_f32(vi0x0), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c0x0123, vget_low_f32(vi2x0), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk00c0x0123, vget_high_f32(vi0x1), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 1);

        const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk10c0x0123, vget_low_f32(vi1x0), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk10c0x0123, vget_low_f32(vi3x0), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk10c0x0123, vget_high_f32(vi1x1), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk10c0x0123, vget_high_f32(vi3x1), 1);

        const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk20c0x0123, vget_low_f32(vi4x0), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk20c0x0123, vget_high_f32(vi4x1), 1);

        const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);

        // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
        const float32x4_t vi0x2 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1x2 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2x2 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3x2 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4x2 = vld1q_f32(i4); i4 += 4;

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk00c1x0123, vget_high_f32(vi0x0), 0);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c1x0123, vget_high_f32(vi2x0), 0);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk00c1x0123, vget_low_f32(vi0x2), 0);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c1x0123, vget_low_f32(vi2x2), 0);

        const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk10c1x0123, vget_high_f32(vi1x0), 0);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk10c1x0123, vget_high_f32(vi3x0), 0);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk10c1x0123, vget_low_f32(vi1x2), 0);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk10c1x0123, vget_low_f32(vi3x2), 0);

        const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk20c1x0123, vget_high_f32(vi4x0), 0);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c1x0123, vget_low_f32(vi2x2), 0);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk20c1x0123, vget_low_f32(vi4x2), 0);

        const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk00c2x0123, vget_high_f32(vi0x0), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk00c2x0123, vget_high_f32(vi2x0), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk00c2x0123, vget_low_f32(vi0x2), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c2x0123, vget_low_f32(vi2x2), 1);

        const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk10c2x0123, vget_high_f32(vi1x0), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk10c2x0123, vget_high_f32(vi3x0), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk10c2x0123, vget_low_f32(vi1x2), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk10c2x0123, vget_low_f32(vi3x2), 1);

        const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk20c2x0123, vget_high_f32(vi2x0), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk20c2x0123, vget_high_f32(vi4x0), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c2x0123, vget_low_f32(vi2x2), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk20c2x0123, vget_low_f32(vi4x2), 1);

        const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk01c0x0123, vget_low_f32(vi0x1), 0);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c0x0123, vget_low_f32(vi2x1), 0);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk01c0x0123, vget_high_f32(vi0x2), 0);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk01c0x0123, vget_high_f32(vi2x2), 0);

        const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk11c0x0123, vget_low_f32(vi1x1), 0);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk11c0x0123, vget_low_f32(vi3x1), 0);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk11c0x0123, vget_high_f32(vi1x2), 0);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk11c0x0123, vget_high_f32(vi3x2), 0);

        const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk21c0x0123, vget_low_f32(vi4x1), 0);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk21c0x0123, vget_high_f32(vi2x2), 0);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk21c0x0123, vget_high_f32(vi4x2), 0);

        const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk01c1x0123, vget_low_f32(vi0x1), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk01c1x0123, vget_high_f32(vi0x2), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk01c1x0123, vget_high_f32(vi2x2), 1);

        const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk11c1x0123, vget_low_f32(vi1x1), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk11c1x0123, vget_low_f32(vi3x1), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk11c1x0123, vget_high_f32(vi1x2), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk11c1x0123, vget_high_f32(vi3x2), 1);

        const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk21c1x0123, vget_low_f32(vi4x1), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk21c1x0123, vget_high_f32(vi2x2), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk21c1x0123, vget_high_f32(vi4x2), 1);

        const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);

        // viMx3 = ( iM4c2, iM4c1, iM4c0, iM3c2 )
        const float32x4_t vi0x3 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1x3 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2x3 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3x3 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4x3 = vld1q_f32(i4); i4 += 4;

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk01c2x0123, vget_high_f32(vi0x1), 0);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c2x0123, vget_high_f32(vi2x1), 0);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk01c2x0123, vget_low_f32(vi0x3), 0);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk01c2x0123, vget_low_f32(vi2x3), 0);

        const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk11c2x0123, vget_high_f32(vi1x1), 0);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk11c2x0123, vget_high_f32(vi3x1), 0);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk11c2x0123, vget_low_f32(vi1x3), 0);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk11c2x0123, vget_low_f32(vi3x3), 0);

        const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk21c2x0123, vget_high_f32(vi4x1), 0);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk21c2x0123, vget_low_f32(vi2x3), 0);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk21c2x0123, vget_low_f32(vi4x3), 0);

        const float32x4_t vk02c0x0123 = vld1q_f32(w + 76);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk02c0x0123, vget_high_f32(vi0x1), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c0x0123, vget_high_f32(vi2x1), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk02c0x0123, vget_low_f32(vi0x3), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk02c0x0123, vget_low_f32(vi2x3), 1);

        const float32x4_t vk12c0x0123 = vld1q_f32(w + 80);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk12c0x0123, vget_high_f32(vi1x1), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk12c0x0123, vget_high_f32(vi3x1), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk12c0x0123, vget_low_f32(vi1x3), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk12c0x0123, vget_low_f32(vi3x3), 1);

        const float32x4_t vk22c0x0123 = vld1q_f32(w + 84);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk22c0x0123, vget_high_f32(vi2x1), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk22c0x0123, vget_high_f32(vi4x1), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk22c0x0123, vget_low_f32(vi2x3), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk22c0x0123, vget_low_f32(vi4x3), 1);

        const float32x4_t vk02c1x0123 = vld1q_f32(w + 88);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk02c1x0123, vget_low_f32(vi0x2), 0);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c1x0123, vget_low_f32(vi2x2), 0);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk02c1x0123, vget_high_f32(vi0x3), 0);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk02c1x0123, vget_high_f32(vi2x3), 0);

        const float32x4_t vk12c1x0123 = vld1q_f32(w + 92);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk12c1x0123, vget_low_f32(vi1x2), 0);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk12c1x0123, vget_low_f32(vi3x2), 0);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk12c1x0123, vget_high_f32(vi1x3), 0);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk12c1x0123, vget_high_f32(vi3x3), 0);

        const float32x4_t vk22c1x0123 = vld1q_f32(w + 96);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk22c1x0123, vget_low_f32(vi2x2), 0);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk22c1x0123, vget_low_f32(vi4x2), 0);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk22c1x0123, vget_high_f32(vi2x3), 0);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk22c1x0123, vget_high_f32(vi4x3), 0);

        const float32x4_t vk02c2x0123 = vld1q_f32(w + 100);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk02c2x0123, vget_low_f32(vi0x2), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c2x0123, vget_low_f32(vi2x2), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk02c2x0123, vget_high_f32(vi0x3), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk02c2x0123, vget_high_f32(vi2x3), 1);

        const float32x4_t vk12c2x0123 = vld1q_f32(w + 104);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk12c2x0123, vget_low_f32(vi1x2), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk12c2x0123, vget_low_f32(vi3x2), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk12c2x0123, vget_high_f32(vi1x3), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk12c2x0123, vget_high_f32(vi3x3), 1);

        const float32x4_t vk22c2x0123 = vld1q_f32(w + 108);

        vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk22c2x0123, vget_low_f32(vi2x2), 1);
        vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk22c2x0123, vget_low_f32(vi4x2), 1);

        vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk22c2x0123, vget_high_f32(vi2x3), 1);
        vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk22c2x0123, vget_high_f32(vi4x3), 1);

        // Rotate: the last loaded quad becomes the left context of the next
        // 4-pixel group.
        vi0x0 = vi0x3;
        vi1x0 = vi1x3;
        vi2x0 = vi2x3;
        vi3x0 = vi3x3;
        vi4x0 = vi4x3;


        vo0x0c0123 = vmaxq_f32(vo0x0c0123, vmin);
        vo1x0c0123 = vmaxq_f32(vo1x0c0123, vmin);

        vo0x1c0123 = vmaxq_f32(vo0x1c0123, vmin);
        vo1x1c0123 = vmaxq_f32(vo1x1c0123, vmin);

        vo0x0c0123 = vminq_f32(vo0x0c0123, vmax);
        vo1x0c0123 = vminq_f32(vo1x0c0123, vmax);

        vo0x1c0123 = vminq_f32(vo0x1c0123, vmax);
        vo1x1c0123 = vminq_f32(vo1x1c0123, vmax);

        if XNN_LIKELY(c >= 4) {
          vst1q_f32(o1, vo1x0c0123);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
          vst1q_f32(o0, vo0x0c0123);
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);

          vst1q_f32(o1, vo1x1c0123);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
          vst1q_f32(o0, vo0x1c0123);
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
        } else {
          // Remainder (c < 4): store 2 and/or 1 channels per output pixel.
          float* o0_tmp = o0;
          float* o1_tmp = o1;
          float32x2_t vo0x0c01 = vget_low_f32(vo0x0c0123);
          float32x2_t vo1x0c01 = vget_low_f32(vo1x0c0123);
          float32x2_t vo0x1c01 = vget_low_f32(vo0x1c0123);
          float32x2_t vo1x1c01 = vget_low_f32(vo1x1c0123);
          if (c & 2) {
            vst1_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c01);
            vo1x1c01 = vget_high_f32(vo1x1c0123);
            vst1_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c01);
            vo0x1c01 = vget_high_f32(vo0x1c0123);

            vst1_f32(o1_tmp, vo1x0c01); o1_tmp += 2;
            vo1x0c01 = vget_high_f32(vo1x0c0123);
            vst1_f32(o0_tmp, vo0x0c01); o0_tmp += 2;
            vo0x0c01 = vget_high_f32(vo0x0c0123);
          }
          if (c & 1) {
            vst1_lane_f32(o1_tmp, vo1x0c01, 0);
            vst1_lane_f32(o0_tmp, vo0x0c01, 0);

            vst1_lane_f32((float*) ((uintptr_t) o1_tmp + output_width_stride), vo1x1c01, 0);
            vst1_lane_f32((float*) ((uintptr_t) o0_tmp + output_width_stride), vo0x1c01, 0);
          }

          o0 = (float*) ((uintptr_t) o0 + output_width_stride * 2);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride * 2);
        }
      }
      assert(iw < 4);
      // Remainder: 2 or 3 input pixels left -> one more output pixel.
      if XNN_UNLIKELY(iw & 2) {
        float32x4_t vo0c0123 = vld1q_f32(w);
        float32x4_t vo1c0123 = vo0c0123;

        const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 1);

        const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 1);

        const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 1);

        const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_high_f32(vi0x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_high_f32(vi2x0), 0);

        const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_high_f32(vi1x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_high_f32(vi3x0), 0);

        const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_high_f32(vi4x0), 0);

        const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 1);

        const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 1);

        const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 1);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;

        const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_low_f32(vi0x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_low_f32(vi2x1), 0);

        const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_low_f32(vi1x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_low_f32(vi3x1), 0);

        const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_low_f32(vi4x1), 0);

        const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 1);

        const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 1);

        const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 1);

        const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vget_high_f32(vi0x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_high_f32(vi2x1), 0);

        const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vget_high_f32(vi1x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vget_high_f32(vi3x1), 0);

        const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vget_high_f32(vi4x1), 0);

        const float32x4_t vk02c0x0123 = vld1q_f32(w + 76);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c0x0123, vget_high_f32(vi0x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 1);

        const float32x4_t vk12c0x0123 = vld1q_f32(w + 80);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c0x0123, vget_high_f32(vi1x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c0x0123, vget_high_f32(vi3x1), 1);

        const float32x4_t vk22c0x0123 = vld1q_f32(w + 84);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c0x0123, vget_high_f32(vi2x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c0x0123, vget_high_f32(vi4x1), 1);

        // viMx2 = ( iM2c2, iM2c1 )
        const float32x2_t vi0x2 = vld1_f32(i0); i0 += 2;
        const float32x2_t vi1x2 = vld1_f32(i1); i1 += 2;
        const float32x2_t vi2x2 = vld1_f32(i2); i2 += 2;
        const float32x2_t vi3x2 = vld1_f32(i3); i3 += 2;
        const float32x2_t vi4x2 = vld1_f32(i4); i4 += 2;

        const float32x4_t vk02c1x0123 = vld1q_f32(w + 88);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c1x0123, vi0x2, 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c1x0123, vi2x2, 0);

        const float32x4_t vk12c1x0123 = vld1q_f32(w + 92);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c1x0123, vi1x2, 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c1x0123, vi3x2, 0);

        const float32x4_t vk22c1x0123 = vld1q_f32(w + 96);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c1x0123, vi2x2, 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c1x0123, vi4x2, 0);

        const float32x4_t vk02c2x0123 = vld1q_f32(w + 100);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c2x0123, vi0x2, 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c2x0123, vi2x2, 1);

        const float32x4_t vk12c2x0123 = vld1q_f32(w + 104);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c2x0123, vi1x2, 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c2x0123, vi3x2, 1);

        const float32x4_t vk22c2x0123 = vld1q_f32(w + 108);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c2x0123, vi2x2, 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c2x0123, vi4x2, 1);

        vi0x0 = vcombine_f32(vget_high_f32(vi0x1), vi0x2);
        vi1x0 = vcombine_f32(vget_high_f32(vi1x1), vi1x2);
        vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2);
        vi3x0 = vcombine_f32(vget_high_f32(vi3x1), vi3x2);
        vi4x0 = vcombine_f32(vget_high_f32(vi4x1), vi4x2);


        vo0c0123 = vmaxq_f32(vo0c0123, vmin);
        vo1c0123 = vmaxq_f32(vo1c0123, vmin);

        vo0c0123 = vminq_f32(vo0c0123, vmax);
        vo1c0123 = vminq_f32(vo1c0123, vmax);

        if XNN_LIKELY(c >= 4) {
          vst1q_f32(o1, vo1c0123);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
          vst1q_f32(o0, vo0c0123);
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
        } else {
          float* o0_tmp = o0;
          float* o1_tmp = o1;
          float32x2_t vo0c01 = vget_low_f32(vo0c0123);
          float32x2_t vo1c01 = vget_low_f32(vo1c0123);
          if (c & 2) {
            vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
            vo1c01 = vget_high_f32(vo1c0123);
            vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
            vo0c01 = vget_high_f32(vo0c0123);
          }
          if (c & 1) {
            vst1_lane_f32(o1_tmp, vo1c01, 0);
            vst1_lane_f32(o0_tmp, vo0c01, 0);
          }

          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
        }
      }
      // Remainder: a single input pixel left (right edge, padded column).
      if XNN_UNLIKELY(iw & 1) {
        float32x4_t vo0c0123 = vld1q_f32(w);
        float32x4_t vo1c0123 = vo0c0123;

        const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 1);

        const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 1);

        const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 1);

        const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_high_f32(vi0x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_high_f32(vi2x0), 0);

        const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_high_f32(vi1x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_high_f32(vi3x0), 0);

        const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_high_f32(vi4x0), 0);

        const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 1);

        const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 1);

        const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 1);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 3;
        const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 3;
        const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 3;
        const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 3;
        const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 3;

        const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_low_f32(vi0x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_low_f32(vi2x1), 0);

        const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_low_f32(vi1x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_low_f32(vi3x1), 0);

        const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_low_f32(vi4x1), 0);

        const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 1);

        const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 1);

        const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 1);

        const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vget_high_f32(vi0x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_high_f32(vi2x1), 0);

        const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vget_high_f32(vi1x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vget_high_f32(vi3x1), 0);

        const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vget_high_f32(vi4x1), 0);


        vo0c0123 = vmaxq_f32(vo0c0123, vmin);
        vo1c0123 = vmaxq_f32(vo1c0123, vmin);

        vo0c0123 = vminq_f32(vo0c0123, vmax);
        vo1c0123 = vminq_f32(vo1c0123, vmax);

        if XNN_LIKELY(c >= 4) {
          vst1q_f32(o1, vo1c0123);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
          vst1q_f32(o0, vo0c0123);
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
        } else {
          float* o0_tmp = o0;
          float* o1_tmp = o1;
          float32x2_t vo0c01 = vget_low_f32(vo0c0123);
          float32x2_t vo1c01 = vget_low_f32(vo1c0123);
          if (c & 2) {
            vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
            vo1c01 = vget_high_f32(vo1c0123);
            vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
            vo0c01 = vget_high_f32(vo0c0123);
          }
          if (c & 1) {
            vst1_lane_f32(o1_tmp, vo1c01, 0);
            vst1_lane_f32(o0_tmp, vo0c01, 0);
          }
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
        }
      }
      // Move output pointers back to the position of the first pixel in a row,
      // and forward to the next block of output channels
      o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
      o1 = (float*) ((uintptr_t) o1 - output_channel_decrement);
      // Revert input pointers to the position of the first pixel in a row
      i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
      i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
      i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
      i3 = (const float*) ((uintptr_t) i3 - input_width_decrement);
      i4 = (const float*) ((uintptr_t) i4 - input_width_decrement);
      // Move to the block of weights for the next 4 output channels
      w += 112;
      c = doz(c, 4);
    } while (c != 0);
    // Move output pointers back to the position of the first channel, and forward to the next block of rows
    o0 = (float*) ((uintptr_t) o0 + output_height_increment);
    o1 = (float*) ((uintptr_t) o1 + output_height_increment);
    // Move input pointers forward to the next four rows
    i0 = i4;
    i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
    i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
    i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
    i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  }
}
743