// Auto-generated file. Do not edit!
//   Template: src/f32-conv-hwc/3x3s2p0p1c3-neon-x1.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/conv.h>
#include <xnnpack/math.h>

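// 3x3 convolution with stride 2 on HWC-layout input with 3 channels,
// computing 4 output channels at a time ("c3x4"). There is no left padding
// and one column of implicit right zero-padding ("p0p1"); vertical padding
// is realized by substituting the `zero` row for out-of-bounds input rows.
// Each inner-loop iteration produces a 2-row x 1-pixel output tile ("2x1").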
void xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x1(
    size_t input_height,
    size_t input_width,
    size_t output_y_start,
    size_t output_y_end,
    const float* input,
    const float* zero,
    const float* weights,
    float* output,
    size_t input_padding_top,
    size_t output_channels,
    size_t output_height_stride,
    size_t output_width_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_width != 0);
  assert(output_y_end > output_y_start);
  assert(input_padding_top <= 1);
  assert(output_channels != 0);

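  // Derived strides, in bytes:
  // - input_width_decrement rewinds an input row pointer over everything the
  //   width loop consumed: a 4-float preload, 6 floats per full iteration,
  //   and 2 floats for an odd remainder column.
  // - output_channel_decrement rewinds an output pointer to the start of the
  //   row, then advances it by 4 floats to the next block of output channels.
  // - output_height_increment moves an output pointer two rows down and from
  //   the last channel block back to channel 0.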
  const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
  const size_t input_width_decrement = (4 + ((input_width - 1) & 1) * 2 + (round_down_po2(input_width - 1, 2) * 3 /* channels */)) * sizeof(float);
  const size_t output_width = input_width / 2;
  const size_t output_channel_decrement = output_width * output_width_stride - 4 * sizeof(float);
  const size_t output_height_increment = output_height_stride * 2 - round_up_po2(output_channels, 4) * sizeof(float);

  // Adjustment for padding processed below
  const float* i0 = (const float*) ((uintptr_t) input +
    input_height_stride * (output_y_start * 2 /* vertical stride */ - input_padding_top));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
  const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
  const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
  float* o1 = (float*) ((uintptr_t) o0 + output_height_stride);

  if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
    i0 = zero;
  }

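  // Each pass of this loop produces two output rows. Output rows y and y+1
  // read the five input rows i0..i4 (i2 is shared between the two 3x3
  // windows); rows that fall outside the input are redirected to `zero`.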
  for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
    const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
    const size_t input_y4 = input_y2 + 2;
    if XNN_UNPREDICTABLE(input_y2 > input_height) {
      i1 = zero;
    }
    if XNN_UNPREDICTABLE(input_y2 >= input_height) {
      i2 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 > input_height) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 >= input_height) {
      i4 = zero;
    }
    if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
      o1 = o0;
    }

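    // Weights are packed per block of 4 output channels: 4 bias values, then
    // the 27 filter taps ordered by kernel column, input channel, and kernel
    // row, each tap holding 4 consecutive per-output-channel floats. That is
    // 4 + 3*3*3*4 = 112 floats per block, matching `w += 112` below.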
    const float* w = weights;
    size_t c = output_channels;
    do {
      // viMx0 = ( iM1c0, iM0c2, iM0c1, iM0c0 )
      float32x4_t vi0x0 = vld1q_f32(i0); i0 += 4;
      float32x4_t vi1x0 = vld1q_f32(i1); i1 += 4;
      float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4;
      float32x4_t vi3x0 = vld1q_f32(i3); i3 += 4;
      float32x4_t vi4x0 = vld1q_f32(i4); i4 += 4;

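      // Main width loop: each iteration consumes two input pixels (6 floats)
      // and produces one output pixel in each of the two output rows. viMx0
      // carries 4 floats (pixel 0 plus channel 0 of pixel 1) between
      // iterations; the remaining 6 floats are loaded as viMx1 and viMx2.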
      size_t iw = input_width - 1;
      for (; iw >= 2; iw -= 2) {
        float32x4_t vo0c0123 = vld1q_f32(w);
        float32x4_t vo1c0123 = vo0c0123;

        const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);

        const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);

        const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);

        const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);

        const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);

        const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);

        const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);

        const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);

        const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);

        const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);

        const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);

        const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);

        const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);

        // viMx1 = ( iM2c1, iM2c0, iM1c2, iM1c1 )
        const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
        const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
        const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
        const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
        const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 0);

        const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 0);

        const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 0);

        const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c2x0123, vget_low_f32(vi0x1), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c2x0123, vget_low_f32(vi2x1), 1);

        const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c2x0123, vget_low_f32(vi1x1), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c2x0123, vget_low_f32(vi3x1), 1);

        const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c2x0123, vget_low_f32(vi2x1), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c2x0123, vget_low_f32(vi4x1), 1);

        const float32x4_t vk02c0x0123 = vld1q_f32(w + 76);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk02c0x0123, vget_high_f32(vi0x1), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 0);

        const float32x4_t vk12c0x0123 = vld1q_f32(w + 80);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk12c0x0123, vget_high_f32(vi1x1), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk12c0x0123, vget_high_f32(vi3x1), 0);

        const float32x4_t vk22c0x0123 = vld1q_f32(w + 84);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk22c0x0123, vget_high_f32(vi2x1), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk22c0x0123, vget_high_f32(vi4x1), 0);

        const float32x4_t vk02c1x0123 = vld1q_f32(w + 88);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk02c1x0123, vget_high_f32(vi0x1), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk02c1x0123, vget_high_f32(vi2x1), 1);

        const float32x4_t vk12c1x0123 = vld1q_f32(w + 92);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk12c1x0123, vget_high_f32(vi1x1), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk12c1x0123, vget_high_f32(vi3x1), 1);

        const float32x4_t vk22c1x0123 = vld1q_f32(w + 96);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk22c1x0123, vget_high_f32(vi2x1), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk22c1x0123, vget_high_f32(vi4x1), 1);

        const float32x4_t vk02c2x0123 = vld1q_f32(w + 100);

        // viMx2 = ( iM3c0, iM2c2 )
        const float32x2_t vi0x2 = vld1_f32(i0); i0 += 2;
        const float32x2_t vi1x2 = vld1_f32(i1); i1 += 2;
        const float32x2_t vi2x2 = vld1_f32(i2); i2 += 2;
        const float32x2_t vi3x2 = vld1_f32(i3); i3 += 2;
        const float32x2_t vi4x2 = vld1_f32(i4); i4 += 2;

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk02c2x0123, vi0x2, 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk02c2x0123, vi2x2, 0);

        const float32x4_t vk12c2x0123 = vld1q_f32(w + 104);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk12c2x0123, vi1x2, 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk12c2x0123, vi3x2, 0);

        const float32x4_t vk22c2x0123 = vld1q_f32(w + 108);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk22c2x0123, vi2x2, 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk22c2x0123, vi4x2, 0);

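        // Rotate the sliding window: because the horizontal stride is 2,
        // input pixel 2 of this iteration becomes pixel 0 of the next one.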
        vi0x0 = vcombine_f32(vget_high_f32(vi0x1), vi0x2);
        vi1x0 = vcombine_f32(vget_high_f32(vi1x1), vi1x2);
        vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2);
        vi3x0 = vcombine_f32(vget_high_f32(vi3x1), vi3x2);
        vi4x0 = vcombine_f32(vget_high_f32(vi4x1), vi4x2);

        const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
        const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

        vo0c0123 = vmaxq_f32(vo0c0123, vmin);
        vo1c0123 = vmaxq_f32(vo1c0123, vmin);

        vo0c0123 = vminq_f32(vo0c0123, vmax);
        vo1c0123 = vminq_f32(vo1c0123, vmax);

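        // Store a full group of 4 output channels directly; for the last
        // group of 1-3 channels, fall back to 2- and 1-lane stores.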
        if XNN_LIKELY(c >= 4) {
          vst1q_f32(o1, vo1c0123);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
          vst1q_f32(o0, vo0c0123);
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
        } else {
          float* o0_tmp = o0;
          float* o1_tmp = o1;
          float32x2_t vo0c01 = vget_low_f32(vo0c0123);
          float32x2_t vo1c01 = vget_low_f32(vo1c0123);
          if (c & 2) {
            vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
            vo1c01 = vget_high_f32(vo1c0123);
            vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
            vo0c01 = vget_high_f32(vo0c0123);
          }
          if (c & 1) {
            vst1_lane_f32(o1_tmp, vo1c01, 0);
            vst1_lane_f32(o0_tmp, vo0c01, 0);
          }

          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
        }
      }
      assert(iw < 2);
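      // Remainder output pixel: its 3x3 window extends one column past the
      // input (the implicit right zero-padding), so the column-2 taps
      // (vk*2c*) are skipped and only two input columns are loaded.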
      if XNN_LIKELY(iw & 1) {
        float32x4_t vo0c0123 = vld1q_f32(w);
        float32x4_t vo1c0123 = vo0c0123;

        const float32x4_t vk00c0x0123 = vld1q_f32(w + 4);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);

        const float32x4_t vk10c0x0123 = vld1q_f32(w + 8);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);

        const float32x4_t vk20c0x0123 = vld1q_f32(w + 12);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);

        const float32x4_t vk00c1x0123 = vld1q_f32(w + 16);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);

        const float32x4_t vk10c1x0123 = vld1q_f32(w + 20);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);

        const float32x4_t vk20c1x0123 = vld1q_f32(w + 24);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);

        const float32x4_t vk00c2x0123 = vld1q_f32(w + 28);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);

        const float32x4_t vk10c2x0123 = vld1q_f32(w + 32);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);

        const float32x4_t vk20c2x0123 = vld1q_f32(w + 36);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);

        const float32x4_t vk01c0x0123 = vld1q_f32(w + 40);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);

        const float32x4_t vk11c0x0123 = vld1q_f32(w + 44);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);

        const float32x4_t vk21c0x0123 = vld1q_f32(w + 48);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);

        const float32x4_t vk01c1x0123 = vld1q_f32(w + 52);

        // viMx1 = ( iM1c2, iM1c1 )
        const float32x2_t vi0x1 = vld1_f32(i0); i0 += 2;
        const float32x2_t vi1x1 = vld1_f32(i1); i1 += 2;
        const float32x2_t vi2x1 = vld1_f32(i2); i2 += 2;
        const float32x2_t vi3x1 = vld1_f32(i3); i3 += 2;
        const float32x2_t vi4x1 = vld1_f32(i4); i4 += 2;

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c1x0123, vi0x1, 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c1x0123, vi2x1, 0);

        const float32x4_t vk11c1x0123 = vld1q_f32(w + 56);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c1x0123, vi1x1, 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c1x0123, vi3x1, 0);

        const float32x4_t vk21c1x0123 = vld1q_f32(w + 60);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c1x0123, vi2x1, 0);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c1x0123, vi4x1, 0);

        const float32x4_t vk01c2x0123 = vld1q_f32(w + 64);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c2x0123, vi0x1, 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c2x0123, vi2x1, 1);

        const float32x4_t vk11c2x0123 = vld1q_f32(w + 68);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c2x0123, vi1x1, 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c2x0123, vi3x1, 1);

        const float32x4_t vk21c2x0123 = vld1q_f32(w + 72);

        vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c2x0123, vi2x1, 1);
        vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c2x0123, vi4x1, 1);

        const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
        const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);

        vo0c0123 = vmaxq_f32(vo0c0123, vmin);
        vo1c0123 = vmaxq_f32(vo1c0123, vmin);

        vo0c0123 = vminq_f32(vo0c0123, vmax);
        vo1c0123 = vminq_f32(vo1c0123, vmax);

        if XNN_LIKELY(c >= 4) {
          vst1q_f32(o1, vo1c0123);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
          vst1q_f32(o0, vo0c0123);
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
        } else {
          float* o0_tmp = o0;
          float* o1_tmp = o1;
          float32x2_t vo0c01 = vget_low_f32(vo0c0123);
          float32x2_t vo1c01 = vget_low_f32(vo1c0123);
          if (c & 2) {
            vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
            vo1c01 = vget_high_f32(vo1c0123);
            vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
            vo0c01 = vget_high_f32(vo0c0123);
          }
          if (c & 1) {
            vst1_lane_f32(o1_tmp, vo1c01, 0);
            vst1_lane_f32(o0_tmp, vo0c01, 0);
          }
          o0 = (float*) ((uintptr_t) o0 + output_width_stride);
          o1 = (float*) ((uintptr_t) o1 + output_width_stride);
        }
      }
      // Move output pointers back to the position of the first pixel in a row,
      // and forward to the next block of output channels
      o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
      o1 = (float*) ((uintptr_t) o1 - output_channel_decrement);
      // Revert input pointers to the position of the first pixel in a row
      i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
      i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
      i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
      i3 = (const float*) ((uintptr_t) i3 - input_width_decrement);
      i4 = (const float*) ((uintptr_t) i4 - input_width_decrement);
      // Move to the block of weights for the next 4 output channels
      w += 112;
      c = doz(c, 4);
    } while (c != 0);
    // Move output pointers back to the position of the first channel, and forward to the next block of rows
    o0 = (float*) ((uintptr_t) o0 + output_height_increment);
    o1 = (float*) ((uintptr_t) o1 + output_height_increment);
    // Move input pointers forward to the next four rows
    i0 = i4;
    i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
    i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
    i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
    i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  }
}
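
// Illustrative invocation (a sketch under assumed setup; the variables below
// are hypothetical, and initializing the params fields directly bypasses
// XNNPACK's usual init helpers). The `zero` argument must point to a
// zero-initialized buffer of at least one input row (input_width * 3 floats):
//
//   union xnn_f32_minmax_params params;
//   params.scalar.min = 0.0f;  // e.g. fused ReLU lower bound
//   params.scalar.max = 6.0f;  // e.g. fused ReLU6 upper bound
//   xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x1(
//       input_height, input_width,
//       0 /* output_y_start */, output_height /* output_y_end */,
//       input, zero_row, packed_weights, output,
//       1 /* input_padding_top */, output_channels,
//       output_height_stride, output_width_stride, &params);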