// Auto-generated file. Do not edit!
//   Template: src/f32-conv-hwc/3x3s2p1c3-neon-x1.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/conv.h>
#include <xnnpack/math.h>

// 3x3 stride-2 pad-1 HWC (channels-last) convolution over 3 input channels,
// computing output channels in groups of 4 and two output rows per outer
// iteration ("2x1"), using AArch64 NEON FMA (vfmaq_lane_f32).
//
// Weight layout (per group of 4 output channels, 112 floats total):
//   4 biases, then 27 kernel taps (3 rows x 3 cols x 3 channels), each tap a
//   vector of 4 floats — matching the w + 4*k offsets loaded below.
//
// `zero` points to a zero-filled row substituted for rows that fall into the
// implicit top/bottom padding. `output_height_stride`/`output_width_stride`
// and the derived decrements are in bytes, hence the uintptr_t arithmetic.
void xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1(
    size_t input_height,
    size_t input_width,
    size_t output_y_start,
    size_t output_y_end,
    const float* input,
    const float* zero,
    const float* weights,
    float* output,
    size_t input_padding_top,
    size_t output_channels,
    size_t output_height_stride,
    size_t output_width_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_width != 0);
  assert(output_y_end > output_y_start);
  assert(input_padding_top <= 1);
  assert(output_channels != 0);

  const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
  const size_t input_width_decrement = input_width * 3 /* channels */ * sizeof(float);
  const size_t output_width = (input_width + 1) / 2;
  const size_t output_channel_decrement = output_width * output_width_stride - 4 * sizeof(float);
  const size_t output_height_increment = output_height_stride * 2 - round_up_po2(output_channels, 4) * sizeof(float);

  // Adjustment for padding processed below
  const float* i0 = (const float*) ((uintptr_t) input +
    input_height_stride * (output_y_start * 2 /* vertical stride */ - input_padding_top));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
  const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
  const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
  float* o1 = (float*) ((uintptr_t) o0 + output_height_stride);

  if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
    i0 = zero;
  }

  const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
  const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

  for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
    const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
    const size_t input_y4 = input_y2 + 2;
    // Substitute the zero row for any input row past the bottom edge.
    if XNN_UNPREDICTABLE(input_y2 > input_height) {
      i1 = zero;
    }
    if XNN_UNPREDICTABLE(input_y2 >= input_height) {
      i2 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 > input_height) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 >= input_height) {
      i4 = zero;
    }
    // When only one output row remains, alias o1 to o0 so the second row's
    // stores land harmlessly on top of the first row's values.
    if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
      o1 = o0;
    }

    const float* w = weights;
    size_t c = output_channels;
    do {
      // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
      float32x4_t vi0x0 = vmovq_n_f32(0.0f);
      float32x4_t vi1x0 = vmovq_n_f32(0.0f);
      float32x4_t vi2x0 = vmovq_n_f32(0.0f);
      float32x4_t vi3x0 = vmovq_n_f32(0.0f);
      float32x4_t vi4x0 = vmovq_n_f32(0.0f);

      size_t iw = input_width;
      for (; iw >= 2; iw -= 2) {
        // Initialize accumulators with biases; row 0 and row 1 share them.
        float32x4_t vo0c0123 = vld1q_f32(w);
        float32x4_t vo1c0123 = vo0c0123;

        const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 1);

        const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 1);

        const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 1);

        const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_high_f32(vi0x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_high_f32(vi2x0), 0);

        const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_high_f32(vi1x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_high_f32(vi3x0), 0);

        const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_high_f32(vi4x0), 0);

        const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 1);

        const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 1);

        const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 1);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;

        const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_low_f32(vi0x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_low_f32(vi2x1), 0);

        const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_low_f32(vi1x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_low_f32(vi3x1), 0);

        const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_low_f32(vi4x1), 0);

        const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 1);

        const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 1);

        const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 1);

        const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vget_high_f32(vi0x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_high_f32(vi2x1), 0);

        const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vget_high_f32(vi1x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vget_high_f32(vi3x1), 0);

        const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vget_high_f32(vi4x1), 0);

        const float32x4_t vk02c0x0123 = vld1q_f32(w + 76);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c0x0123, vget_high_f32(vi0x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 1);

        const float32x4_t vk12c0x0123 = vld1q_f32(w + 80);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c0x0123, vget_high_f32(vi1x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c0x0123, vget_high_f32(vi3x1), 1);

        const float32x4_t vk22c0x0123 = vld1q_f32(w + 84);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c0x0123, vget_high_f32(vi2x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c0x0123, vget_high_f32(vi4x1), 1);

        // viMx2 = ( iM2c2, iM2c1 )
        const float32x2_t vi0x2 = vld1_f32(i0); i0 += 2;
        const float32x2_t vi1x2 = vld1_f32(i1); i1 += 2;
        const float32x2_t vi2x2 = vld1_f32(i2); i2 += 2;
        const float32x2_t vi3x2 = vld1_f32(i3); i3 += 2;
        const float32x2_t vi4x2 = vld1_f32(i4); i4 += 2;

        const float32x4_t vk02c1x0123 = vld1q_f32(w + 88);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c1x0123, vi0x2, 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c1x0123, vi2x2, 0);

        const float32x4_t vk12c1x0123 = vld1q_f32(w + 92);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c1x0123, vi1x2, 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c1x0123, vi3x2, 0);

        const float32x4_t vk22c1x0123 = vld1q_f32(w + 96);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c1x0123, vi2x2, 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c1x0123, vi4x2, 0);

        const float32x4_t vk02c2x0123 = vld1q_f32(w + 100);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk02c2x0123, vi0x2, 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c2x0123, vi2x2, 1);

        const float32x4_t vk12c2x0123 = vld1q_f32(w + 104);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk12c2x0123, vi1x2, 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk12c2x0123, vi3x2, 1);

        const float32x4_t vk22c2x0123 = vld1q_f32(w + 108);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c2x0123, vi2x2, 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk22c2x0123, vi4x2, 1);

        // Carry the last 4 channels-worth of input into the next iteration's
        // "column 0" vector: viMx0 = ( iM2c2, iM2c1, iM2c0, iM1c2 ).
        vi0x0 = vcombine_f32(vget_high_f32(vi0x1), vi0x2);
        vi1x0 = vcombine_f32(vget_high_f32(vi1x1), vi1x2);
        vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2);
        vi3x0 = vcombine_f32(vget_high_f32(vi3x1), vi3x2);
        vi4x0 = vcombine_f32(vget_high_f32(vi4x1), vi4x2);

        // Clamp to [min, max].
        vo0c0123 = vmaxq_f32(vo0c0123, vmin);
        vo1c0123 = vmaxq_f32(vo1c0123, vmin);

        vo0c0123 = vminq_f32(vo0c0123, vmax);
        vo1c0123 = vminq_f32(vo1c0123, vmax);

        if XNN_LIKELY(c >= 4) {
          vst1q_f32(o1, vo1c0123);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
          vst1q_f32(o0, vo0c0123);
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
        } else {
          // Partial store for a remainder group of 1-3 output channels.
          float* o0_tmp = o0;
          float* o1_tmp = o1;
          float32x2_t vo0c01 = vget_low_f32(vo0c0123);
          float32x2_t vo1c01 = vget_low_f32(vo1c0123);
          if (c & 2) {
            vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
            vo1c01 = vget_high_f32(vo1c0123);
            vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
            vo0c01 = vget_high_f32(vo0c0123);
          }
          if (c & 1) {
            vst1_lane_f32(o1_tmp, vo1c01, 0);
            vst1_lane_f32(o0_tmp, vo0c01, 0);
          }

          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
        }
      }
      assert(iw < 2);
      if XNN_UNLIKELY(iw & 1) {
        // Trailing output pixel: the rightmost input column is implicit
        // zero padding, so only the first two kernel columns contribute.
        float32x4_t vo0c0123 = vld1q_f32(w);
        float32x4_t vo1c0123 = vo0c0123;

        const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 1);

        const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 1);

        const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 1);

        const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c1x0123, vget_high_f32(vi0x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c1x0123, vget_high_f32(vi2x0), 0);

        const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c1x0123, vget_high_f32(vi1x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c1x0123, vget_high_f32(vi3x0), 0);

        const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c1x0123, vget_high_f32(vi2x0), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c1x0123, vget_high_f32(vi4x0), 0);

        const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 1);

        const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 1);

        const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 1);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 3;
        const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 3;
        const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 3;
        const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 3;
        const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 3;

        const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c0x0123, vget_low_f32(vi0x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_low_f32(vi2x1), 0);

        const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c0x0123, vget_low_f32(vi1x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c0x0123, vget_low_f32(vi3x1), 0);

        const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c0x0123, vget_low_f32(vi4x1), 0);

        const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 1);

        const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 1);

        const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 1);

        const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk01c2x0123, vget_high_f32(vi0x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_high_f32(vi2x1), 0);

        const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk11c2x0123, vget_high_f32(vi1x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk11c2x0123, vget_high_f32(vi3x1), 0);

        const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);

        vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0);
        vo1c0123 = vfmaq_lane_f32(vo1c0123, vk21c2x0123, vget_high_f32(vi4x1), 0);

        // Clamp to [min, max].
        vo0c0123 = vmaxq_f32(vo0c0123, vmin);
        vo1c0123 = vmaxq_f32(vo1c0123, vmin);

        vo0c0123 = vminq_f32(vo0c0123, vmax);
        vo1c0123 = vminq_f32(vo1c0123, vmax);

        if XNN_LIKELY(c >= 4) {
          vst1q_f32(o1, vo1c0123);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
          vst1q_f32(o0, vo0c0123);
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
        } else {
          // Partial store for a remainder group of 1-3 output channels.
          float* o0_tmp = o0;
          float* o1_tmp = o1;
          float32x2_t vo0c01 = vget_low_f32(vo0c0123);
          float32x2_t vo1c01 = vget_low_f32(vo1c0123);
          if (c & 2) {
            vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
            vo1c01 = vget_high_f32(vo1c0123);
            vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
            vo0c01 = vget_high_f32(vo0c0123);
          }
          if (c & 1) {
            vst1_lane_f32(o1_tmp, vo1c01, 0);
            vst1_lane_f32(o0_tmp, vo0c01, 0);
          }
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
        }
      }
      // Move output pointers back to the position of the first pixel in a row,
      // and forward to the next block of output channels
      o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
      o1 = (float*) ((uintptr_t) o1 - output_channel_decrement);
      // Revert input pointers to the position of the first pixel in a row
      i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
      i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
      i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
      i3 = (const float*) ((uintptr_t) i3 - input_width_decrement);
      i4 = (const float*) ((uintptr_t) i4 - input_width_decrement);
      // Move to the block of weights for the next 4 output channels
      w += 112;
      c = doz(c, 4);  // saturating subtraction: c = max(c - 4, 0)
    } while (c != 0);
    // Move output pointers back to the position of the first channel, and forward to the next block of rows
    o0 = (float*) ((uintptr_t) o0 + output_height_increment);
    o1 = (float*) ((uintptr_t) o1 + output_height_increment);
    // Move input pointers forward to the next four rows
    i0 = i4;
    i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
    i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
    i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
    i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  }
}