1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-conv-hwc/3x3s2p0p1c3-neon-x1.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 
11 #include <assert.h>
12 
13 #include <arm_neon.h>
14 
15 #include <xnnpack/conv.h>
16 #include <xnnpack/math.h>
17 
18 
xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1(size_t input_height,size_t input_width,size_t output_y_start,size_t output_y_end,const float * input,const float * zero,const float * weights,float * output,size_t input_padding_top,size_t output_channels,size_t output_height_stride,size_t output_width_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])19 void xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1(
20     size_t input_height,
21     size_t input_width,
22     size_t output_y_start,
23     size_t output_y_end,
24     const float* input,
25     const float* zero,
26     const float* weights,
27     float* output,
28     size_t input_padding_top,
29     size_t output_channels,
30     size_t output_height_stride,
31     size_t output_width_stride,
32     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
33 {
34   assert(input_width != 0);
35   assert(output_y_end > output_y_start);
36   assert(input_padding_top <= 1);
37   assert(output_channels != 0);
38 
39   const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
40   const size_t input_width_decrement = (4 + ((input_width - 1) & 1) * 2 + (round_down_po2(input_width - 1, 2) * 3 /* channels */)) * sizeof(float);
41   const size_t output_width = input_width / 2;
42   const size_t output_channel_decrement = output_width * output_width_stride - 8 * sizeof(float);
43   const size_t output_height_increment = output_height_stride * 2 - round_up_po2(output_channels, 8) * sizeof(float);
44 
45   // Adjustment for padding processed below
46   const float* i0 = (const float*) ((uintptr_t) input +
47     input_height_stride * (output_y_start * 2 /* vertical stride */ - input_padding_top));
48   const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
49   const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
50   const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
51   const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
52   float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
53   float* o1 = (float*) ((uintptr_t) o0 + output_height_stride);
54 
55   if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
56     i0 = zero;
57   }
58 
59 
60   for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
61     const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
62     const size_t input_y4 = input_y2 + 2;
63     if XNN_UNPREDICTABLE(input_y2 > input_height) {
64       i1 = zero;
65     }
66     if XNN_UNPREDICTABLE(input_y2 >= input_height) {
67       i2 = zero;
68     }
69     if XNN_UNPREDICTABLE(input_y4 > input_height) {
70       i3 = zero;
71     }
72     if XNN_UNPREDICTABLE(input_y4 >= input_height) {
73       i4 = zero;
74     }
75     if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
76       o1 = o0;
77     }
78 
79     const float* w = weights;
80     size_t c = output_channels;
81     do {
82       // viMx0 = ( iM1c0, iM0c2, iM0c1, iM0c0 )
83       float32x4_t vi0x0 = vld1q_f32(i0); i0 += 4;
84       float32x4_t vi1x0 = vld1q_f32(i1); i1 += 4;
85       float32x4_t vi2x0 = vld1q_f32(i2); i2 += 4;
86       float32x4_t vi3x0 = vld1q_f32(i3); i3 += 4;
87       float32x4_t vi4x0 = vld1q_f32(i4); i4 += 4;
88 
89       size_t iw = input_width - 1;
90       for (; iw >= 2; iw -= 2) {
91         float32x4_t vo0c0123 = vld1q_f32(w);
92         float32x4_t vo0c4567 = vld1q_f32(w + 4);
93         float32x4_t vo1c0123 = vo0c0123;
94         float32x4_t vo1c4567 = vo0c4567;
95 
96         const float32x4_t vk00c0x0123 = vld1q_f32(w + 8);
97         const float32x4_t vk00c0x4567 = vld1q_f32(w + 12);
98 
99         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
100         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);
101         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk00c0x4567, vget_low_f32(vi0x0), 0);
102         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk00c0x4567, vget_low_f32(vi2x0), 0);
103 
104         const float32x4_t vk10c0x0123 = vld1q_f32(w + 16);
105         const float32x4_t vk10c0x4567 = vld1q_f32(w + 20);
106 
107         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
108         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);
109         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk10c0x4567, vget_low_f32(vi1x0), 0);
110         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk10c0x4567, vget_low_f32(vi3x0), 0);
111 
112         const float32x4_t vk20c0x0123 = vld1q_f32(w + 24);
113         const float32x4_t vk20c0x4567 = vld1q_f32(w + 28);
114 
115         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
116         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);
117         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk20c0x4567, vget_low_f32(vi2x0), 0);
118         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk20c0x4567, vget_low_f32(vi4x0), 0);
119 
120         const float32x4_t vk00c1x0123 = vld1q_f32(w + 32);
121         const float32x4_t vk00c1x4567 = vld1q_f32(w + 36);
122 
123         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
124         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);
125         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk00c1x4567, vget_low_f32(vi0x0), 1);
126         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk00c1x4567, vget_low_f32(vi2x0), 1);
127 
128         const float32x4_t vk10c1x0123 = vld1q_f32(w + 40);
129         const float32x4_t vk10c1x4567 = vld1q_f32(w + 44);
130 
131         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
132         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);
133         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk10c1x4567, vget_low_f32(vi1x0), 1);
134         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk10c1x4567, vget_low_f32(vi3x0), 1);
135 
136         const float32x4_t vk20c1x0123 = vld1q_f32(w + 48);
137         const float32x4_t vk20c1x4567 = vld1q_f32(w + 52);
138 
139         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
140         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);
141         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk20c1x4567, vget_low_f32(vi2x0), 1);
142         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk20c1x4567, vget_low_f32(vi4x0), 1);
143 
144         const float32x4_t vk00c2x0123 = vld1q_f32(w + 56);
145         const float32x4_t vk00c2x4567 = vld1q_f32(w + 60);
146 
147         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
148         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);
149         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk00c2x4567, vget_high_f32(vi0x0), 0);
150         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk00c2x4567, vget_high_f32(vi2x0), 0);
151 
152         const float32x4_t vk10c2x0123 = vld1q_f32(w + 64);
153         const float32x4_t vk10c2x4567 = vld1q_f32(w + 68);
154 
155         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
156         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);
157         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk10c2x4567, vget_high_f32(vi1x0), 0);
158         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk10c2x4567, vget_high_f32(vi3x0), 0);
159 
160         const float32x4_t vk20c2x0123 = vld1q_f32(w + 72);
161         const float32x4_t vk20c2x4567 = vld1q_f32(w + 76);
162 
163         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
164         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);
165         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk20c2x4567, vget_high_f32(vi2x0), 0);
166         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk20c2x4567, vget_high_f32(vi4x0), 0);
167 
168         const float32x4_t vk01c0x0123 = vld1q_f32(w + 80);
169         const float32x4_t vk01c0x4567 = vld1q_f32(w + 84);
170 
171         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
172         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);
173         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk01c0x4567, vget_high_f32(vi0x0), 1);
174         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk01c0x4567, vget_high_f32(vi2x0), 1);
175 
176         const float32x4_t vk11c0x0123 = vld1q_f32(w + 88);
177         const float32x4_t vk11c0x4567 = vld1q_f32(w + 92);
178 
179         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
180         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);
181         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk11c0x4567, vget_high_f32(vi1x0), 1);
182         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk11c0x4567, vget_high_f32(vi3x0), 1);
183 
184         const float32x4_t vk21c0x0123 = vld1q_f32(w + 96);
185         const float32x4_t vk21c0x4567 = vld1q_f32(w + 100);
186 
187         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
188         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);
189         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk21c0x4567, vget_high_f32(vi2x0), 1);
190         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk21c0x4567, vget_high_f32(vi4x0), 1);
191 
192         const float32x4_t vk01c1x0123 = vld1q_f32(w + 104);
193         const float32x4_t vk01c1x4567 = vld1q_f32(w + 108);
194 
195         // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
196         const float32x4_t vi0x1 = vld1q_f32(i0); i0 += 4;
197         const float32x4_t vi1x1 = vld1q_f32(i1); i1 += 4;
198         const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4;
199         const float32x4_t vi3x1 = vld1q_f32(i3); i3 += 4;
200         const float32x4_t vi4x1 = vld1q_f32(i4); i4 += 4;
201 
202         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c1x0123, vget_low_f32(vi0x1), 0);
203         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 0);
204         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk01c1x4567, vget_low_f32(vi0x1), 0);
205         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk01c1x4567, vget_low_f32(vi2x1), 0);
206 
207         const float32x4_t vk11c1x0123 = vld1q_f32(w + 112);
208         const float32x4_t vk11c1x4567 = vld1q_f32(w + 116);
209 
210         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c1x0123, vget_low_f32(vi1x1), 0);
211         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c1x0123, vget_low_f32(vi3x1), 0);
212         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk11c1x4567, vget_low_f32(vi1x1), 0);
213         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk11c1x4567, vget_low_f32(vi3x1), 0);
214 
215         const float32x4_t vk21c1x0123 = vld1q_f32(w + 120);
216         const float32x4_t vk21c1x4567 = vld1q_f32(w + 124);
217 
218         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 0);
219         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c1x0123, vget_low_f32(vi4x1), 0);
220         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk21c1x4567, vget_low_f32(vi2x1), 0);
221         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk21c1x4567, vget_low_f32(vi4x1), 0);
222 
223         const float32x4_t vk01c2x0123 = vld1q_f32(w + 128);
224         const float32x4_t vk01c2x4567 = vld1q_f32(w + 132);
225 
226         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c2x0123, vget_low_f32(vi0x1), 1);
227         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c2x0123, vget_low_f32(vi2x1), 1);
228         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk01c2x4567, vget_low_f32(vi0x1), 1);
229         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk01c2x4567, vget_low_f32(vi2x1), 1);
230 
231         const float32x4_t vk11c2x0123 = vld1q_f32(w + 136);
232         const float32x4_t vk11c2x4567 = vld1q_f32(w + 140);
233 
234         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c2x0123, vget_low_f32(vi1x1), 1);
235         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c2x0123, vget_low_f32(vi3x1), 1);
236         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk11c2x4567, vget_low_f32(vi1x1), 1);
237         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk11c2x4567, vget_low_f32(vi3x1), 1);
238 
239         const float32x4_t vk21c2x0123 = vld1q_f32(w + 144);
240         const float32x4_t vk21c2x4567 = vld1q_f32(w + 148);
241 
242         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c2x0123, vget_low_f32(vi2x1), 1);
243         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c2x0123, vget_low_f32(vi4x1), 1);
244         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk21c2x4567, vget_low_f32(vi2x1), 1);
245         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk21c2x4567, vget_low_f32(vi4x1), 1);
246 
247         const float32x4_t vk02c0x0123 = vld1q_f32(w + 152);
248         const float32x4_t vk02c0x4567 = vld1q_f32(w + 156);
249 
250         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk02c0x0123, vget_high_f32(vi0x1), 0);
251         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 0);
252         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk02c0x4567, vget_high_f32(vi0x1), 0);
253         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk02c0x4567, vget_high_f32(vi2x1), 0);
254 
255         const float32x4_t vk12c0x0123 = vld1q_f32(w + 160);
256         const float32x4_t vk12c0x4567 = vld1q_f32(w + 164);
257 
258         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk12c0x0123, vget_high_f32(vi1x1), 0);
259         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk12c0x0123, vget_high_f32(vi3x1), 0);
260         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk12c0x4567, vget_high_f32(vi1x1), 0);
261         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk12c0x4567, vget_high_f32(vi3x1), 0);
262 
263         const float32x4_t vk22c0x0123 = vld1q_f32(w + 168);
264         const float32x4_t vk22c0x4567 = vld1q_f32(w + 172);
265 
266         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk22c0x0123, vget_high_f32(vi2x1), 0);
267         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk22c0x0123, vget_high_f32(vi4x1), 0);
268         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk22c0x4567, vget_high_f32(vi2x1), 0);
269         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk22c0x4567, vget_high_f32(vi4x1), 0);
270 
271         const float32x4_t vk02c1x0123 = vld1q_f32(w + 176);
272         const float32x4_t vk02c1x4567 = vld1q_f32(w + 180);
273 
274         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk02c1x0123, vget_high_f32(vi0x1), 1);
275         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk02c1x0123, vget_high_f32(vi2x1), 1);
276         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk02c1x4567, vget_high_f32(vi0x1), 1);
277         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk02c1x4567, vget_high_f32(vi2x1), 1);
278 
279         const float32x4_t vk12c1x0123 = vld1q_f32(w + 184);
280         const float32x4_t vk12c1x4567 = vld1q_f32(w + 188);
281 
282         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk12c1x0123, vget_high_f32(vi1x1), 1);
283         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk12c1x0123, vget_high_f32(vi3x1), 1);
284         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk12c1x4567, vget_high_f32(vi1x1), 1);
285         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk12c1x4567, vget_high_f32(vi3x1), 1);
286 
287         const float32x4_t vk22c1x0123 = vld1q_f32(w + 192);
288         const float32x4_t vk22c1x4567 = vld1q_f32(w + 196);
289 
290         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk22c1x0123, vget_high_f32(vi2x1), 1);
291         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk22c1x0123, vget_high_f32(vi4x1), 1);
292         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk22c1x4567, vget_high_f32(vi2x1), 1);
293         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk22c1x4567, vget_high_f32(vi4x1), 1);
294 
295         const float32x4_t vk02c2x0123 = vld1q_f32(w + 200);
296         const float32x4_t vk02c2x4567 = vld1q_f32(w + 204);
297 
298         // viMx2 = ( iM2c2, iM2c1 )
299         const float32x2_t vi0x2 = vld1_f32(i0); i0 += 2;
300         const float32x2_t vi1x2 = vld1_f32(i1); i1 += 2;
301         const float32x2_t vi2x2 = vld1_f32(i2); i2 += 2;
302         const float32x2_t vi3x2 = vld1_f32(i3); i3 += 2;
303         const float32x2_t vi4x2 = vld1_f32(i4); i4 += 2;
304 
305         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk02c2x0123, vi0x2, 0);
306         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk02c2x0123, vi2x2, 0);
307         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk02c2x4567, vi0x2, 0);
308         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk02c2x4567, vi2x2, 0);
309 
310         const float32x4_t vk12c2x0123 = vld1q_f32(w + 208);
311         const float32x4_t vk12c2x4567 = vld1q_f32(w + 212);
312 
313         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk12c2x0123, vi1x2, 0);
314         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk12c2x0123, vi3x2, 0);
315         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk12c2x4567, vi1x2, 0);
316         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk12c2x4567, vi3x2, 0);
317 
318         const float32x4_t vk22c2x0123 = vld1q_f32(w + 216);
319         const float32x4_t vk22c2x4567 = vld1q_f32(w + 220);
320 
321         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk22c2x0123, vi2x2, 0);
322         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk22c2x0123, vi4x2, 0);
323         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk22c2x4567, vi2x2, 0);
324         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk22c2x4567, vi4x2, 0);
325 
326         vi0x0 = vcombine_f32(vget_high_f32(vi0x1), vi0x2);
327         vi1x0 = vcombine_f32(vget_high_f32(vi1x1), vi1x2);
328         vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2);
329         vi3x0 = vcombine_f32(vget_high_f32(vi3x1), vi3x2);
330         vi4x0 = vcombine_f32(vget_high_f32(vi4x1), vi4x2);
331 
332         const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
333         const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
334 
335         vo0c0123 = vmaxq_f32(vo0c0123, vmin);
336         vo1c0123 = vmaxq_f32(vo1c0123, vmin);
337         vo0c4567 = vmaxq_f32(vo0c4567, vmin);
338         vo1c4567 = vmaxq_f32(vo1c4567, vmin);
339 
340         vo0c0123 = vminq_f32(vo0c0123, vmax);
341         vo1c0123 = vminq_f32(vo1c0123, vmax);
342         vo0c4567 = vminq_f32(vo0c4567, vmax);
343         vo1c4567 = vminq_f32(vo1c4567, vmax);
344 
345         if XNN_LIKELY(c >= 8) {
346           vst1q_f32(o1, vo1c0123);
347           vst1q_f32(o1 + 4, vo1c4567);
348           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
349           vst1q_f32(o0, vo0c0123);
350           vst1q_f32(o0 + 4, vo0c4567);
351           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
352         } else {
353           float* o0_tmp = o0;
354           float* o1_tmp = o1;
355           if (c & 4) {
356             vst1q_f32(o1_tmp, vo1c0123); o1_tmp += 4;
357             vo1c0123 = vo1c4567;
358             vst1q_f32(o0_tmp, vo0c0123); o0_tmp += 4;
359             vo0c0123 = vo0c4567;
360           }
361           float32x2_t vo0c01 = vget_low_f32(vo0c0123);
362           float32x2_t vo1c01 = vget_low_f32(vo1c0123);
363           if (c & 2) {
364             vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
365             vo1c01 = vget_high_f32(vo1c0123);
366             vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
367             vo0c01 = vget_high_f32(vo0c0123);
368           }
369           if (c & 1) {
370             vst1_lane_f32(o1_tmp, vo1c01, 0);
371             vst1_lane_f32(o0_tmp, vo0c01, 0);
372           }
373 
374           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
375           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
376         }
377       }
378       assert(iw < 2);
379       if XNN_LIKELY(iw & 1) {
380         float32x4_t vo0c0123 = vld1q_f32(w);
381         float32x4_t vo0c4567 = vld1q_f32(w + 4);
382         float32x4_t vo1c0123 = vo0c0123;
383         float32x4_t vo1c4567 = vo0c4567;
384 
385         const float32x4_t vk00c0x0123 = vld1q_f32(w + 8);
386         const float32x4_t vk00c0x4567 = vld1q_f32(w + 12);
387 
388         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c0x0123, vget_low_f32(vi0x0), 0);
389         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c0x0123, vget_low_f32(vi2x0), 0);
390         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk00c0x4567, vget_low_f32(vi0x0), 0);
391         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk00c0x4567, vget_low_f32(vi2x0), 0);
392 
393         const float32x4_t vk10c0x0123 = vld1q_f32(w + 16);
394         const float32x4_t vk10c0x4567 = vld1q_f32(w + 20);
395 
396         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c0x0123, vget_low_f32(vi1x0), 0);
397         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c0x0123, vget_low_f32(vi3x0), 0);
398         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk10c0x4567, vget_low_f32(vi1x0), 0);
399         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk10c0x4567, vget_low_f32(vi3x0), 0);
400 
401         const float32x4_t vk20c0x0123 = vld1q_f32(w + 24);
402         const float32x4_t vk20c0x4567 = vld1q_f32(w + 28);
403 
404         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c0x0123, vget_low_f32(vi2x0), 0);
405         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c0x0123, vget_low_f32(vi4x0), 0);
406         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk20c0x4567, vget_low_f32(vi2x0), 0);
407         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk20c0x4567, vget_low_f32(vi4x0), 0);
408 
409         const float32x4_t vk00c1x0123 = vld1q_f32(w + 32);
410         const float32x4_t vk00c1x4567 = vld1q_f32(w + 36);
411 
412         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c1x0123, vget_low_f32(vi0x0), 1);
413         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c1x0123, vget_low_f32(vi2x0), 1);
414         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk00c1x4567, vget_low_f32(vi0x0), 1);
415         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk00c1x4567, vget_low_f32(vi2x0), 1);
416 
417         const float32x4_t vk10c1x0123 = vld1q_f32(w + 40);
418         const float32x4_t vk10c1x4567 = vld1q_f32(w + 44);
419 
420         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c1x0123, vget_low_f32(vi1x0), 1);
421         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c1x0123, vget_low_f32(vi3x0), 1);
422         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk10c1x4567, vget_low_f32(vi1x0), 1);
423         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk10c1x4567, vget_low_f32(vi3x0), 1);
424 
425         const float32x4_t vk20c1x0123 = vld1q_f32(w + 48);
426         const float32x4_t vk20c1x4567 = vld1q_f32(w + 52);
427 
428         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c1x0123, vget_low_f32(vi2x0), 1);
429         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c1x0123, vget_low_f32(vi4x0), 1);
430         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk20c1x4567, vget_low_f32(vi2x0), 1);
431         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk20c1x4567, vget_low_f32(vi4x0), 1);
432 
433         const float32x4_t vk00c2x0123 = vld1q_f32(w + 56);
434         const float32x4_t vk00c2x4567 = vld1q_f32(w + 60);
435 
436         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk00c2x0123, vget_high_f32(vi0x0), 0);
437         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk00c2x0123, vget_high_f32(vi2x0), 0);
438         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk00c2x4567, vget_high_f32(vi0x0), 0);
439         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk00c2x4567, vget_high_f32(vi2x0), 0);
440 
441         const float32x4_t vk10c2x0123 = vld1q_f32(w + 64);
442         const float32x4_t vk10c2x4567 = vld1q_f32(w + 68);
443 
444         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk10c2x0123, vget_high_f32(vi1x0), 0);
445         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk10c2x0123, vget_high_f32(vi3x0), 0);
446         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk10c2x4567, vget_high_f32(vi1x0), 0);
447         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk10c2x4567, vget_high_f32(vi3x0), 0);
448 
449         const float32x4_t vk20c2x0123 = vld1q_f32(w + 72);
450         const float32x4_t vk20c2x4567 = vld1q_f32(w + 76);
451 
452         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk20c2x0123, vget_high_f32(vi2x0), 0);
453         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk20c2x0123, vget_high_f32(vi4x0), 0);
454         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk20c2x4567, vget_high_f32(vi2x0), 0);
455         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk20c2x4567, vget_high_f32(vi4x0), 0);
456 
457         const float32x4_t vk01c0x0123 = vld1q_f32(w + 80);
458         const float32x4_t vk01c0x4567 = vld1q_f32(w + 84);
459 
460         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c0x0123, vget_high_f32(vi0x0), 1);
461         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c0x0123, vget_high_f32(vi2x0), 1);
462         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk01c0x4567, vget_high_f32(vi0x0), 1);
463         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk01c0x4567, vget_high_f32(vi2x0), 1);
464 
465         const float32x4_t vk11c0x0123 = vld1q_f32(w + 88);
466         const float32x4_t vk11c0x4567 = vld1q_f32(w + 92);
467 
468         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c0x0123, vget_high_f32(vi1x0), 1);
469         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c0x0123, vget_high_f32(vi3x0), 1);
470         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk11c0x4567, vget_high_f32(vi1x0), 1);
471         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk11c0x4567, vget_high_f32(vi3x0), 1);
472 
473         const float32x4_t vk21c0x0123 = vld1q_f32(w + 96);
474         const float32x4_t vk21c0x4567 = vld1q_f32(w + 100);
475 
476         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c0x0123, vget_high_f32(vi2x0), 1);
477         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c0x0123, vget_high_f32(vi4x0), 1);
478         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk21c0x4567, vget_high_f32(vi2x0), 1);
479         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk21c0x4567, vget_high_f32(vi4x0), 1);
480 
481         const float32x4_t vk01c1x0123 = vld1q_f32(w + 104);
482         const float32x4_t vk01c1x4567 = vld1q_f32(w + 108);
483 
484         // viMx1 = ( iM1c2, iM1c1 )
485         const float32x2_t vi0x1 = vld1_f32(i0); i0 += 2;
486         const float32x2_t vi1x1 = vld1_f32(i1); i1 += 2;
487         const float32x2_t vi2x1 = vld1_f32(i2); i2 += 2;
488         const float32x2_t vi3x1 = vld1_f32(i3); i3 += 2;
489         const float32x2_t vi4x1 = vld1_f32(i4); i4 += 2;
490 
491         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c1x0123, vi0x1, 0);
492         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c1x0123, vi2x1, 0);
493         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk01c1x4567, vi0x1, 0);
494         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk01c1x4567, vi2x1, 0);
495 
496         const float32x4_t vk11c1x0123 = vld1q_f32(w + 112);
497         const float32x4_t vk11c1x4567 = vld1q_f32(w + 116);
498 
499         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c1x0123, vi1x1, 0);
500         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c1x0123, vi3x1, 0);
501         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk11c1x4567, vi1x1, 0);
502         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk11c1x4567, vi3x1, 0);
503 
504         const float32x4_t vk21c1x0123 = vld1q_f32(w + 120);
505         const float32x4_t vk21c1x4567 = vld1q_f32(w + 124);
506 
507         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c1x0123, vi2x1, 0);
508         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c1x0123, vi4x1, 0);
509         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk21c1x4567, vi2x1, 0);
510         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk21c1x4567, vi4x1, 0);
511 
512         const float32x4_t vk01c2x0123 = vld1q_f32(w + 128);
513         const float32x4_t vk01c2x4567 = vld1q_f32(w + 132);
514 
515         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk01c2x0123, vi0x1, 1);
516         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c2x0123, vi2x1, 1);
517         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk01c2x4567, vi0x1, 1);
518         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk01c2x4567, vi2x1, 1);
519 
520         const float32x4_t vk11c2x0123 = vld1q_f32(w + 136);
521         const float32x4_t vk11c2x4567 = vld1q_f32(w + 140);
522 
523         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk11c2x0123, vi1x1, 1);
524         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk11c2x0123, vi3x1, 1);
525         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk11c2x4567, vi1x1, 1);
526         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk11c2x4567, vi3x1, 1);
527 
528         const float32x4_t vk21c2x0123 = vld1q_f32(w + 144);
529         const float32x4_t vk21c2x4567 = vld1q_f32(w + 148);
530 
531         vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c2x0123, vi2x1, 1);
532         vo1c0123 = vmlaq_lane_f32(vo1c0123, vk21c2x0123, vi4x1, 1);
533         vo0c4567 = vmlaq_lane_f32(vo0c4567, vk21c2x4567, vi2x1, 1);
534         vo1c4567 = vmlaq_lane_f32(vo1c4567, vk21c2x4567, vi4x1, 1);
535 
536         const float32x4_t vmin = vld1q_dup_f32(&params->scalar.min);
537         const float32x4_t vmax = vld1q_dup_f32(&params->scalar.max);
538 
539         vo0c0123 = vmaxq_f32(vo0c0123, vmin);
540         vo1c0123 = vmaxq_f32(vo1c0123, vmin);
541         vo0c4567 = vmaxq_f32(vo0c4567, vmin);
542         vo1c4567 = vmaxq_f32(vo1c4567, vmin);
543 
544         vo0c0123 = vminq_f32(vo0c0123, vmax);
545         vo1c0123 = vminq_f32(vo1c0123, vmax);
546         vo0c4567 = vminq_f32(vo0c4567, vmax);
547         vo1c4567 = vminq_f32(vo1c4567, vmax);
548 
549         if XNN_LIKELY(c >= 8) {
550           vst1q_f32(o1, vo1c0123);
551           vst1q_f32(o1 + 4, vo1c4567);
552           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
553           vst1q_f32(o0, vo0c0123);
554           vst1q_f32(o0 + 4, vo0c4567);
555           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
556         } else {
557           float* o0_tmp = o0;
558           float* o1_tmp = o1;
559           if (c & 4) {
560             vst1q_f32(o1_tmp, vo1c0123); o1_tmp += 4;
561             vo1c0123 = vo1c4567;
562             vst1q_f32(o0_tmp, vo0c0123); o0_tmp += 4;
563             vo0c0123 = vo0c4567;
564           }
565           float32x2_t vo0c01 = vget_low_f32(vo0c0123);
566           float32x2_t vo1c01 = vget_low_f32(vo1c0123);
567           if (c & 2) {
568             vst1_f32(o1_tmp, vo1c01); o1_tmp += 2;
569             vo1c01 = vget_high_f32(vo1c0123);
570             vst1_f32(o0_tmp, vo0c01); o0_tmp += 2;
571             vo0c01 = vget_high_f32(vo0c0123);
572           }
573           if (c & 1) {
574             vst1_lane_f32(o1_tmp, vo1c01, 0);
575             vst1_lane_f32(o0_tmp, vo0c01, 0);
576           }
577           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
578           o1 = (float*) ((uintptr_t) o1 + output_width_stride);
579         }
580       }
581       // Move output pointers back to the position of the first pixel in a row,
582       // and forward to the next block of output channels
583       o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
584       o1 = (float*) ((uintptr_t) o1 - output_channel_decrement);
585       // Revert input pointers to the position of the first pixel in a row
586       i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
587       i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
588       i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
589       i3 = (const float*) ((uintptr_t) i3 - input_width_decrement);
590       i4 = (const float*) ((uintptr_t) i4 - input_width_decrement);
591       // Move to the block of weights for the next 8 output channels
592       w += 224;
593       c = doz(c, 8);
594     } while (c != 0);
595     // Move output pointers back to the position of the first channel, and forward to the next block of rows
596     o0 = (float*) ((uintptr_t) o0 + output_height_increment);
597     o1 = (float*) ((uintptr_t) o1 + output_height_increment);
598     // Move input pointers forward to the next four rows
599     i0 = i4;
600     i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
601     i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
602     i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
603     i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
604   }
605 }
606