1 // Auto-generated file. Do not edit!
2 // Template: src/f32-dwconv2d-chw/5x5p2-neon.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <arm_neon.h>
13
14 #include <xnnpack/dwconv.h>
15 #include <xnnpack/math.h>
16
17
void xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2(
19 size_t input_height,
20 size_t input_width,
21 const float* input,
22 const float* weights,
23 const float* zero,
24 float* output,
25 uint32_t padding_top,
26 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
27 {
28 assert(input_height != 0);
29 assert(input_width != 0);
30 assert(input_width % sizeof(float) == 0);
31 assert(padding_top == 2);
32
33 const uint32x4_t vmask = vld1q_u32(params->neon.mask);
  const float32x4_t vmax = vld1q_dup_f32(&params->neon.max);
  const float32x4_t vmin = vld1q_dup_f32(&params->neon.min);
36
37 const float32x4_t vw0123 = vld1q_f32(weights);
38 const float32x4_t vw4567 = vld1q_f32(weights + 4);
39 const float32x4_t vw89AB = vld1q_f32(weights + 8);
40 const float32x4_t vwCDEF = vld1q_f32(weights + 12);
41 const float32x4_t vwGHIJ = vld1q_f32(weights + 16);
42 const float32x4_t vwKLMN = vld1q_f32(weights + 20);
43 const float32x2_t vwOP = vld1_f32(weights + 24);
44
45 const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
46
47 const float* i0 = zero;
48 const float* i1 = zero;
49 const float* i2 = input;
50 const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
51 const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
52 const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
53 const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
54 const float* i7 = (const float*) ((uintptr_t) i6 + input_width);
55
56 float* o0 = output;
57 float* o1 = (float*) ((uintptr_t) o0 + input_width);
58 float* o2 = (float*) ((uintptr_t) o1 + input_width);
59 float* o3 = (float*) ((uintptr_t) o2 + input_width);
60
61 size_t output_height = input_height;
62 do {
63 if XNN_UNPREDICTABLE(output_height < 2) {
64 i3 = zero;
65 o1 = o0;
66 }
67 if XNN_UNPREDICTABLE(output_height < 3) {
68 i4 = zero;
69 o2 = o1;
70 }
71 if XNN_UNPREDICTABLE(output_height < 4) {
72 i5 = zero;
73 o3 = o2;
74 }
75 if XNN_UNPREDICTABLE(output_height < 5) {
76 i6 = zero;
77 }
78 if XNN_UNPREDICTABLE(output_height < 6) {
79 i7 = zero;
80 }
81
82 float32x4_t vi0x0123 = vmovq_n_f32(0.0f);
83 float32x4_t vi1x0123 = vmovq_n_f32(0.0f);
84 float32x4_t vi2x0123 = vmovq_n_f32(0.0f);
85 float32x4_t vi3x0123 = vmovq_n_f32(0.0f);
86 float32x4_t vi4x0123 = vmovq_n_f32(0.0f);
87 float32x4_t vi5x0123 = vmovq_n_f32(0.0f);
88 float32x4_t vi6x0123 = vmovq_n_f32(0.0f);
89 float32x4_t vi7x0123 = vmovq_n_f32(0.0f);
90
91 float32x4_t vi0x4567 = vld1q_f32(i0); i0 += 4;
92 float32x4_t vi1x4567 = vld1q_f32(i1); i1 += 4;
93 float32x4_t vi2x4567 = vld1q_f32(i2); i2 += 4;
94 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4;
95 float32x4_t vi4x4567 = vld1q_f32(i4); i4 += 4;
96 float32x4_t vi5x4567 = vld1q_f32(i5); i5 += 4;
97 float32x4_t vi6x4567 = vld1q_f32(i6); i6 += 4;
98 float32x4_t vi7x4567 = vld1q_f32(i7); i7 += 4;
99
100 size_t w = input_width;
101 for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
102 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
103 float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
104 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
105 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
106
107 const float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4;
108 const float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4;
109 const float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4;
110 const float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4;
111 const float32x4_t vi4x89AB = vld1q_f32(i4); i4 += 4;
112 const float32x4_t vi5x89AB = vld1q_f32(i5); i5 += 4;
113 const float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;
114 const float32x4_t vi7x89AB = vld1q_f32(i7); i7 += 4;
115
116 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1);
117 float32x4_t vo1p1 = vmulq_lane_f32(vi1x4567, vget_high_f32(vw0123), 1);
118 float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1);
119 float32x4_t vo3p1 = vmulq_lane_f32(vi3x4567, vget_high_f32(vw0123), 1);
120
121 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
122 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0);
123 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0);
124 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0);
125
126 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
127 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1);
128 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1);
129 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1);
130
131 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0);
132 vo1p1 = vfmaq_lane_f32(vo1p1, vi4x4567, vget_high_f32(vwGHIJ), 0);
133 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0);
134 vo3p1 = vfmaq_lane_f32(vo3p1, vi6x4567, vget_high_f32(vwGHIJ), 0);
135
136 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
137 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1);
138 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1);
139 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1);
140
141 const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
142 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
143 const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
144 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
145 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
146 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3);
147 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3);
148 const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);
149
150 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0);
151 vo1p1 = vfmaq_lane_f32(vo1p1, vi1x3456, vget_high_f32(vw0123), 0);
152 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0);
153 vo3p1 = vfmaq_lane_f32(vo3p1, vi3x3456, vget_high_f32(vw0123), 0);
154
155 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
156 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1);
157 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1);
158 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1);
159
160 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0);
161 vo1p1 = vfmaq_lane_f32(vo1p1, vi3x3456, vget_low_f32(vwCDEF), 0);
162 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0);
163 vo3p1 = vfmaq_lane_f32(vo3p1, vi5x3456, vget_low_f32(vwCDEF), 0);
164
165 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
166 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1);
167 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1);
168 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1);
169
170 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0);
171 vo1p1 = vfmaq_lane_f32(vo1p1, vi5x3456, vget_high_f32(vwKLMN), 0);
172 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0);
173 vo3p1 = vfmaq_lane_f32(vo3p1, vi7x3456, vget_high_f32(vwKLMN), 0);
174
175 const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
176 vi0x0123 = vi0x4567;
177 const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
178 vi1x0123 = vi1x4567;
179 const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
180 vi2x0123 = vi2x4567;
181 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
182 vi3x0123 = vi3x4567;
183 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
184 vi4x0123 = vi4x4567;
185 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);
186 vi5x0123 = vi5x4567;
187 const float32x4_t vi6x2345 = vextq_f32(vi6x0123, vi6x4567, 2);
188 vi6x0123 = vi6x4567;
189 const float32x4_t vi7x2345 = vextq_f32(vi7x0123, vi7x4567, 2);
190 vi7x0123 = vi7x4567;
191
192 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
193 vo1p0 = vfmaq_lane_f32(vo1p0, vi1x2345, vget_low_f32(vw0123), 1);
194 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x2345, vget_low_f32(vw0123), 1);
195 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x2345, vget_low_f32(vw0123), 1);
196
197 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0);
198 vo1p1 = vfmaq_lane_f32(vo1p1, vi2x2345, vget_high_f32(vw4567), 0);
199 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0);
200 vo3p1 = vfmaq_lane_f32(vo3p1, vi4x2345, vget_high_f32(vw4567), 0);
201
202 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
203 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x2345, vget_high_f32(vw89AB), 1);
204 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x2345, vget_high_f32(vw89AB), 1);
205 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);
206
207 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0);
208 vo1p1 = vfmaq_lane_f32(vo1p1, vi4x2345, vget_low_f32(vwGHIJ), 0);
209 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);
210 vo3p1 = vfmaq_lane_f32(vo3p1, vi6x2345, vget_low_f32(vwGHIJ), 0);
211
212 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);
213 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);
214 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x2345, vget_low_f32(vwKLMN), 1);
215 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x2345, vget_low_f32(vwKLMN), 1);
216
217 const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vi0x89AB, 1);
218 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
219 const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1);
220 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1);
221 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
222 const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);
223 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
224 const float32x4_t vi7x5678 = vextq_f32(vi7x4567, vi7x89AB, 1);
225
226 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0);
227 vo1p1 = vfmaq_lane_f32(vo1p1, vi1x5678, vget_low_f32(vw4567), 0);
228 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0);
229 vo3p1 = vfmaq_lane_f32(vo3p1, vi3x5678, vget_low_f32(vw4567), 0);
230
231 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
232 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x5678, vget_low_f32(vw89AB), 1);
233 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x5678, vget_low_f32(vw89AB), 1);
234 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_low_f32(vw89AB), 1);
235
236 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0);
237 vo1p1 = vfmaq_lane_f32(vo1p1, vi3x5678, vget_high_f32(vwCDEF), 0);
238 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0);
239 vo3p1 = vfmaq_lane_f32(vo3p1, vi5x5678, vget_high_f32(vwCDEF), 0);
240
241 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
242 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1);
243 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
244 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1);
245
246 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0);
247 vo1p1 = vfmaq_lane_f32(vo1p1, vi5x5678, vwOP, 0);
248 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0);
249 vo3p1 = vfmaq_lane_f32(vo3p1, vi7x5678, vwOP, 0);
250
251 const float32x4_t vi0x6789 = vextq_f32(vi0x4567, vi0x89AB, 2);
252 vi0x4567 = vi0x89AB;
253 const float32x4_t vi1x6789 = vextq_f32(vi1x4567, vi1x89AB, 2);
254 vi1x4567 = vi1x89AB;
255 const float32x4_t vi2x6789 = vextq_f32(vi2x4567, vi2x89AB, 2);
256 vi2x4567 = vi2x89AB;
257 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2);
258 vi3x4567 = vi3x89AB;
259 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
260 vi4x4567 = vi4x89AB;
261 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2);
262 vi5x4567 = vi5x89AB;
263 const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
264 vi6x4567 = vi6x89AB;
265 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2);
266 vi7x4567 = vi7x89AB;
267
268 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
269 vo1p0 = vfmaq_lane_f32(vo1p0, vi1x6789, vget_low_f32(vw4567), 1);
270 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x6789, vget_low_f32(vw4567), 1);
271 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1);
272
273 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x6789, vget_high_f32(vw89AB), 0);
274 vo1p1 = vfmaq_lane_f32(vo1p1, vi2x6789, vget_high_f32(vw89AB), 0);
275 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0);
276 vo3p1 = vfmaq_lane_f32(vo3p1, vi4x6789, vget_high_f32(vw89AB), 0);
277
278 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
279 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1);
280 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x6789, vget_high_f32(vwCDEF), 1);
281 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1);
282
283 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0);
284 vo1p1 = vfmaq_lane_f32(vo1p1, vi4x6789, vget_low_f32(vwKLMN), 0);
285 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0);
286 vo3p1 = vfmaq_lane_f32(vo3p1, vi6x6789, vget_low_f32(vwKLMN), 0);
287
288 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);
289 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1);
290 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x6789, vwOP, 1);
291 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1);
292
293 vo0p0 = vaddq_f32(vo0p0, vo0p1);
294 vo1p0 = vaddq_f32(vo1p0, vo1p1);
295 vo2p0 = vaddq_f32(vo2p0, vo2p1);
296 vo3p0 = vaddq_f32(vo3p0, vo3p1);
297
298 float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
299 float32x4_t vo1 = vmaxq_f32(vo1p0, vmin);
300 float32x4_t vo2 = vmaxq_f32(vo2p0, vmin);
301 float32x4_t vo3 = vmaxq_f32(vo3p0, vmin);
302
303 vo0 = vminq_f32(vo0, vmax);
304 vo1 = vminq_f32(vo1, vmax);
305 vo2 = vminq_f32(vo2, vmax);
306 vo3 = vminq_f32(vo3, vmax);
307
308 vst1q_f32(o3, vo3); o3 += 4;
309 vst1q_f32(o2, vo2); o2 += 4;
310 vst1q_f32(o1, vo1); o1 += 4;
311 vst1q_f32(o0, vo0); o0 += 4;
312 }
313 // Always process the last block of 5..8 pixels.
314 if XNN_LIKELY(w > 4 * sizeof(float)) {
315 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
316 float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
317 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
318 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
319
320 float32x4_t vi0x89AB = vld1q_f32(i0); i0 += 4;
321 float32x4_t vi1x89AB = vld1q_f32(i1); i1 += 4;
322 float32x4_t vi2x89AB = vld1q_f32(i2); i2 += 4;
323 float32x4_t vi3x89AB = vld1q_f32(i3); i3 += 4;
324 float32x4_t vi4x89AB = vld1q_f32(i4); i4 += 4;
325 float32x4_t vi5x89AB = vld1q_f32(i5); i5 += 4;
326 float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;
327 float32x4_t vi7x89AB = vld1q_f32(i7); i7 += 4;
328
329 vi0x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0x89AB)));
330 vi1x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x89AB)));
331 vi2x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x89AB)));
332 vi3x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi3x89AB)));
333 vi4x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi4x89AB)));
334 vi5x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi5x89AB)));
335 vi6x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x89AB)));
336 vi7x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi7x89AB)));
337
338 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1);
339 float32x4_t vo1p1 = vmulq_lane_f32(vi1x4567, vget_high_f32(vw0123), 1);
340 float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1);
341 float32x4_t vo3p1 = vmulq_lane_f32(vi3x4567, vget_high_f32(vw0123), 1);
342
343 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
344 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0);
345 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0);
346 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0);
347
348 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
349 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1);
350 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1);
351 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1);
352
353 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0);
354 vo1p1 = vfmaq_lane_f32(vo1p1, vi4x4567, vget_high_f32(vwGHIJ), 0);
355 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0);
356 vo3p1 = vfmaq_lane_f32(vo3p1, vi6x4567, vget_high_f32(vwGHIJ), 0);
357
358 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
359 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1);
360 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1);
361 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1);
362
363 const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
364 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
365 const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
366 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
367 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
368 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3);
369 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3);
370 const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);
371
372 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0);
373 vo1p1 = vfmaq_lane_f32(vo1p1, vi1x3456, vget_high_f32(vw0123), 0);
374 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0);
375 vo3p1 = vfmaq_lane_f32(vo3p1, vi3x3456, vget_high_f32(vw0123), 0);
376
377 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
378 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1);
379 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1);
380 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1);
381
382 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0);
383 vo1p1 = vfmaq_lane_f32(vo1p1, vi3x3456, vget_low_f32(vwCDEF), 0);
384 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0);
385 vo3p1 = vfmaq_lane_f32(vo3p1, vi5x3456, vget_low_f32(vwCDEF), 0);
386
387 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
388 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1);
389 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1);
390 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1);
391
392 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0);
393 vo1p1 = vfmaq_lane_f32(vo1p1, vi5x3456, vget_high_f32(vwKLMN), 0);
394 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0);
395 vo3p1 = vfmaq_lane_f32(vo3p1, vi7x3456, vget_high_f32(vwKLMN), 0);
396
397 const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
398 vi0x0123 = vi0x4567;
399 const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
400 vi1x0123 = vi1x4567;
401 const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
402 vi2x0123 = vi2x4567;
403 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
404 vi3x0123 = vi3x4567;
405 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
406 vi4x0123 = vi4x4567;
407 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);
408 vi5x0123 = vi5x4567;
409 const float32x4_t vi6x2345 = vextq_f32(vi6x0123, vi6x4567, 2);
410 vi6x0123 = vi6x4567;
411 const float32x4_t vi7x2345 = vextq_f32(vi7x0123, vi7x4567, 2);
412 vi7x0123 = vi7x4567;
413
414 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
415 vo1p0 = vfmaq_lane_f32(vo1p0, vi1x2345, vget_low_f32(vw0123), 1);
416 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x2345, vget_low_f32(vw0123), 1);
417 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x2345, vget_low_f32(vw0123), 1);
418
419 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0);
420 vo1p1 = vfmaq_lane_f32(vo1p1, vi2x2345, vget_high_f32(vw4567), 0);
421 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0);
422 vo3p1 = vfmaq_lane_f32(vo3p1, vi4x2345, vget_high_f32(vw4567), 0);
423
424 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
425 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x2345, vget_high_f32(vw89AB), 1);
426 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x2345, vget_high_f32(vw89AB), 1);
427 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);
428
429 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0);
430 vo1p1 = vfmaq_lane_f32(vo1p1, vi4x2345, vget_low_f32(vwGHIJ), 0);
431 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);
432 vo3p1 = vfmaq_lane_f32(vo3p1, vi6x2345, vget_low_f32(vwGHIJ), 0);
433
434 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);
435 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);
436 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x2345, vget_low_f32(vwKLMN), 1);
437 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x2345, vget_low_f32(vwKLMN), 1);
438
439 const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vi0x89AB, 1);
440 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vi1x89AB, 1);
441 const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vi2x89AB, 1);
442 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1);
443 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1);
444 const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);
445 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
446 const float32x4_t vi7x5678 = vextq_f32(vi7x4567, vi7x89AB, 1);
447
448 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0);
449 vo1p1 = vfmaq_lane_f32(vo1p1, vi1x5678, vget_low_f32(vw4567), 0);
450 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0);
451 vo3p1 = vfmaq_lane_f32(vo3p1, vi3x5678, vget_low_f32(vw4567), 0);
452
453 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
454 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x5678, vget_low_f32(vw89AB), 1);
455 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x5678, vget_low_f32(vw89AB), 1);
456 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_low_f32(vw89AB), 1);
457
458 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0);
459 vo1p1 = vfmaq_lane_f32(vo1p1, vi3x5678, vget_high_f32(vwCDEF), 0);
460 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0);
461 vo3p1 = vfmaq_lane_f32(vo3p1, vi5x5678, vget_high_f32(vwCDEF), 0);
462
463 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
464 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1);
465 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
466 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1);
467
468 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0);
469 vo1p1 = vfmaq_lane_f32(vo1p1, vi5x5678, vwOP, 0);
470 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0);
471 vo3p1 = vfmaq_lane_f32(vo3p1, vi7x5678, vwOP, 0);
472
473 const float32x4_t vi0x6789 = vextq_f32(vi0x4567, vi0x89AB, 2);
474 vi0x4567 = vi0x89AB;
475 const float32x4_t vi1x6789 = vextq_f32(vi1x4567, vi1x89AB, 2);
476 vi1x4567 = vi1x89AB;
477 const float32x4_t vi2x6789 = vextq_f32(vi2x4567, vi2x89AB, 2);
478 vi2x4567 = vi2x89AB;
479 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2);
480 vi3x4567 = vi3x89AB;
481 const float32x4_t vi4x6789 = vextq_f32(vi4x4567, vi4x89AB, 2);
482 vi4x4567 = vi4x89AB;
483 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2);
484 vi5x4567 = vi5x89AB;
485 const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
486 vi6x4567 = vi6x89AB;
487 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2);
488 vi7x4567 = vi7x89AB;
489
490 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
491 vo1p0 = vfmaq_lane_f32(vo1p0, vi1x6789, vget_low_f32(vw4567), 1);
492 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x6789, vget_low_f32(vw4567), 1);
493 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1);
494
495 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x6789, vget_high_f32(vw89AB), 0);
496 vo1p1 = vfmaq_lane_f32(vo1p1, vi2x6789, vget_high_f32(vw89AB), 0);
497 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0);
498 vo3p1 = vfmaq_lane_f32(vo3p1, vi4x6789, vget_high_f32(vw89AB), 0);
499
500 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
501 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1);
502 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x6789, vget_high_f32(vwCDEF), 1);
503 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1);
504
505 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0);
506 vo1p1 = vfmaq_lane_f32(vo1p1, vi4x6789, vget_low_f32(vwKLMN), 0);
507 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0);
508 vo3p1 = vfmaq_lane_f32(vo3p1, vi6x6789, vget_low_f32(vwKLMN), 0);
509
510 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);
511 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1);
512 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x6789, vwOP, 1);
513 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1);
514
515 vo0p0 = vaddq_f32(vo0p0, vo0p1);
516 vo1p0 = vaddq_f32(vo1p0, vo1p1);
517 vo2p0 = vaddq_f32(vo2p0, vo2p1);
518 vo3p0 = vaddq_f32(vo3p0, vo3p1);
519
520 float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
521 float32x4_t vo1 = vmaxq_f32(vo1p0, vmin);
522 float32x4_t vo2 = vmaxq_f32(vo2p0, vmin);
523 float32x4_t vo3 = vmaxq_f32(vo3p0, vmin);
524
525 vo0 = vminq_f32(vo0, vmax);
526 vo1 = vminq_f32(vo1, vmax);
527 vo2 = vminq_f32(vo2, vmax);
528 vo3 = vminq_f32(vo3, vmax);
529
530 vst1q_f32(o3, vo3); o3 += 4;
531 vst1q_f32(o2, vo2); o2 += 4;
532 vst1q_f32(o1, vo1); o1 += 4;
533 vst1q_f32(o0, vo0); o0 += 4;
534
535 w -= 4 * sizeof(float);
536 }
537 assert(w >= 1 * sizeof(float));
538 assert(w <= 4 * sizeof(float));
539 {
540 float32x4_t vo0p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
541 float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
542 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
543 float32x4_t vo3p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0);
544
545 vi0x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi0x4567)));
546 vi1x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi1x4567)));
547 vi2x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi2x4567)));
548 vi3x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi3x4567)));
549 vi4x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi4x4567)));
550 vi5x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi5x4567)));
551 vi6x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x4567)));
552 vi7x4567 = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi7x4567)));
553
554 float32x4_t vo0p1 = vmulq_lane_f32(vi0x4567, vget_high_f32(vw0123), 1);
555 float32x4_t vo1p1 = vmulq_lane_f32(vi1x4567, vget_high_f32(vw0123), 1);
556 float32x4_t vo2p1 = vmulq_lane_f32(vi2x4567, vget_high_f32(vw0123), 1);
557 float32x4_t vo3p1 = vmulq_lane_f32(vi3x4567, vget_high_f32(vw0123), 1);
558
559 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x4567, vget_low_f32(vw89AB), 0);
560 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0);
561 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0);
562 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x4567, vget_low_f32(vw89AB), 0);
563
564 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vget_low_f32(vwCDEF), 1);
565 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1);
566 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1);
567 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vget_low_f32(vwCDEF), 1);
568
569 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0);
570 vo1p1 = vfmaq_lane_f32(vo1p1, vi4x4567, vget_high_f32(vwGHIJ), 0);
571 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x4567, vget_high_f32(vwGHIJ), 0);
572 vo3p1 = vfmaq_lane_f32(vo3p1, vi6x4567, vget_high_f32(vwGHIJ), 0);
573
574 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x4567, vget_high_f32(vwKLMN), 1);
575 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1);
576 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1);
577 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x4567, vget_high_f32(vwKLMN), 1);
578
579 const float32x4_t vi0x3456 = vextq_f32(vi0x0123, vi0x4567, 3);
580 const float32x4_t vi1x3456 = vextq_f32(vi1x0123, vi1x4567, 3);
581 const float32x4_t vi2x3456 = vextq_f32(vi2x0123, vi2x4567, 3);
582 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3);
583 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3);
584 const float32x4_t vi5x3456 = vextq_f32(vi5x0123, vi5x4567, 3);
585 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3);
586 const float32x4_t vi7x3456 = vextq_f32(vi7x0123, vi7x4567, 3);
587
588 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x3456, vget_high_f32(vw0123), 0);
589 vo1p1 = vfmaq_lane_f32(vo1p1, vi1x3456, vget_high_f32(vw0123), 0);
590 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x3456, vget_high_f32(vw0123), 0);
591 vo3p1 = vfmaq_lane_f32(vo3p1, vi3x3456, vget_high_f32(vw0123), 0);
592
593 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x3456, vget_high_f32(vw4567), 1);
594 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1);
595 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1);
596 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1);
597
598 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x3456, vget_low_f32(vwCDEF), 0);
599 vo1p1 = vfmaq_lane_f32(vo1p1, vi3x3456, vget_low_f32(vwCDEF), 0);
600 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0);
601 vo3p1 = vfmaq_lane_f32(vo3p1, vi5x3456, vget_low_f32(vwCDEF), 0);
602
603 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x3456, vget_low_f32(vwGHIJ), 1);
604 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1);
605 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1);
606 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1);
607
608 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0);
609 vo1p1 = vfmaq_lane_f32(vo1p1, vi5x3456, vget_high_f32(vwKLMN), 0);
610 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0);
611 vo3p1 = vfmaq_lane_f32(vo3p1, vi7x3456, vget_high_f32(vwKLMN), 0);
612
613 const float32x4_t vi0x2345 = vextq_f32(vi0x0123, vi0x4567, 2);
614 const float32x4_t vi1x2345 = vextq_f32(vi1x0123, vi1x4567, 2);
615 const float32x4_t vi2x2345 = vextq_f32(vi2x0123, vi2x4567, 2);
616 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2);
617 const float32x4_t vi4x2345 = vextq_f32(vi4x0123, vi4x4567, 2);
618 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);
619 const float32x4_t vi6x2345 = vextq_f32(vi6x0123, vi6x4567, 2);
620 const float32x4_t vi7x2345 = vextq_f32(vi7x0123, vi7x4567, 2);
621
622 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x2345, vget_low_f32(vw0123), 1);
623 vo1p0 = vfmaq_lane_f32(vo1p0, vi1x2345, vget_low_f32(vw0123), 1);
624 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x2345, vget_low_f32(vw0123), 1);
625 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x2345, vget_low_f32(vw0123), 1);
626
627 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x2345, vget_high_f32(vw4567), 0);
628 vo1p1 = vfmaq_lane_f32(vo1p1, vi2x2345, vget_high_f32(vw4567), 0);
629 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x2345, vget_high_f32(vw4567), 0);
630 vo3p1 = vfmaq_lane_f32(vo3p1, vi4x2345, vget_high_f32(vw4567), 0);
631
632 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x2345, vget_high_f32(vw89AB), 1);
633 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x2345, vget_high_f32(vw89AB), 1);
634 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x2345, vget_high_f32(vw89AB), 1);
635 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);
636
637 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x2345, vget_low_f32(vwGHIJ), 0);
638 vo1p1 = vfmaq_lane_f32(vo1p1, vi4x2345, vget_low_f32(vwGHIJ), 0);
639 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);
640 vo3p1 = vfmaq_lane_f32(vo3p1, vi6x2345, vget_low_f32(vwGHIJ), 0);
641
642 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x2345, vget_low_f32(vwKLMN), 1);
643 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);
644 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x2345, vget_low_f32(vwKLMN), 1);
645 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x2345, vget_low_f32(vwKLMN), 1);
646
647 const float32x4_t vzero = vmovq_n_f32(0.0f);
648 const float32x4_t vi0x5678 = vextq_f32(vi0x4567, vzero, 1);
649 const float32x4_t vi1x5678 = vextq_f32(vi1x4567, vzero, 1);
650 const float32x4_t vi2x5678 = vextq_f32(vi2x4567, vzero, 1);
651 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vzero, 1);
652 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1);
653 const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vzero, 1);
654 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1);
655 const float32x4_t vi7x5678 = vextq_f32(vi7x4567, vzero, 1);
656
657 vo0p1 = vfmaq_lane_f32(vo0p1, vi0x5678, vget_low_f32(vw4567), 0);
658 vo1p1 = vfmaq_lane_f32(vo1p1, vi1x5678, vget_low_f32(vw4567), 0);
659 vo2p1 = vfmaq_lane_f32(vo2p1, vi2x5678, vget_low_f32(vw4567), 0);
660 vo3p1 = vfmaq_lane_f32(vo3p1, vi3x5678, vget_low_f32(vw4567), 0);
661
662 vo0p0 = vfmaq_lane_f32(vo0p0, vi1x5678, vget_low_f32(vw89AB), 1);
663 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x5678, vget_low_f32(vw89AB), 1);
664 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x5678, vget_low_f32(vw89AB), 1);
665 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_low_f32(vw89AB), 1);
666
667 vo0p1 = vfmaq_lane_f32(vo0p1, vi2x5678, vget_high_f32(vwCDEF), 0);
668 vo1p1 = vfmaq_lane_f32(vo1p1, vi3x5678, vget_high_f32(vwCDEF), 0);
669 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x5678, vget_high_f32(vwCDEF), 0);
670 vo3p1 = vfmaq_lane_f32(vo3p1, vi5x5678, vget_high_f32(vwCDEF), 0);
671
672 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x5678, vget_high_f32(vwGHIJ), 1);
673 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1);
674 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
675 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1);
676
677 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0);
678 vo1p1 = vfmaq_lane_f32(vo1p1, vi5x5678, vwOP, 0);
679 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0);
680 vo3p1 = vfmaq_lane_f32(vo3p1, vi7x5678, vwOP, 0);
681
682 const float32x4_t vi0x6789 = vextq_f32(vi0x5678, vzero, 1);
683 const float32x4_t vi1x6789 = vextq_f32(vi1x5678, vzero, 1);
684 const float32x4_t vi2x6789 = vextq_f32(vi2x5678, vzero, 1);
685 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1);
686 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1);
687 const float32x4_t vi5x6789 = vextq_f32(vi5x5678, vzero, 1);
688 const float32x4_t vi6x6789 = vextq_f32(vi6x5678, vzero, 1);
689 const float32x4_t vi7x6789 = vextq_f32(vi7x5678, vzero, 1);
690
691 vo0p0 = vfmaq_lane_f32(vo0p0, vi0x6789, vget_low_f32(vw4567), 1);
692 vo1p0 = vfmaq_lane_f32(vo1p0, vi1x6789, vget_low_f32(vw4567), 1);
693 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x6789, vget_low_f32(vw4567), 1);
694 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1);
695
696 vo0p1 = vfmaq_lane_f32(vo0p1, vi1x6789, vget_high_f32(vw89AB), 0);
697 vo1p1 = vfmaq_lane_f32(vo1p1, vi2x6789, vget_high_f32(vw89AB), 0);
698 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0);
699 vo3p1 = vfmaq_lane_f32(vo3p1, vi4x6789, vget_high_f32(vw89AB), 0);
700
701 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x6789, vget_high_f32(vwCDEF), 1);
702 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1);
703 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x6789, vget_high_f32(vwCDEF), 1);
704 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1);
705
706 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0);
707 vo1p1 = vfmaq_lane_f32(vo1p1, vi4x6789, vget_low_f32(vwKLMN), 0);
708 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0);
709 vo3p1 = vfmaq_lane_f32(vo3p1, vi6x6789, vget_low_f32(vwKLMN), 0);
710
711 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x6789, vwOP, 1);
712 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1);
713 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x6789, vwOP, 1);
714 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1);
715
716 vo0p0 = vaddq_f32(vo0p0, vo0p1);
717 vo1p0 = vaddq_f32(vo1p0, vo1p1);
718 vo2p0 = vaddq_f32(vo2p0, vo2p1);
719 vo3p0 = vaddq_f32(vo3p0, vo3p1);
720
721 float32x4_t vo0 = vmaxq_f32(vo0p0, vmin);
722 float32x4_t vo1 = vmaxq_f32(vo1p0, vmin);
723 float32x4_t vo2 = vmaxq_f32(vo2p0, vmin);
724 float32x4_t vo3 = vmaxq_f32(vo3p0, vmin);
725
726 vo0 = vminq_f32(vo0, vmax);
727 vo1 = vminq_f32(vo1, vmax);
728 vo2 = vminq_f32(vo2, vmax);
729 vo3 = vminq_f32(vo3, vmax);
730
731 if XNN_LIKELY(w & (4 * sizeof(float))) {
732 vst1q_f32(o3, vo3); o3 += 4;
733 vst1q_f32(o2, vo2); o2 += 4;
734 vst1q_f32(o1, vo1); o1 += 4;
735 vst1q_f32(o0, vo0); o0 += 4;
736 } else {
737 float32x2_t vo0_lo = vget_low_f32(vo0);
738 float32x2_t vo1_lo = vget_low_f32(vo1);
739 float32x2_t vo2_lo = vget_low_f32(vo2);
740 float32x2_t vo3_lo = vget_low_f32(vo3);
741 if (w & (2 * sizeof(float))) {
742 vst1_f32(o3, vo3_lo); o3 += 2;
743 vst1_f32(o2, vo2_lo); o2 += 2;
744 vst1_f32(o1, vo1_lo); o1 += 2;
745 vst1_f32(o0, vo0_lo); o0 += 2;
746
747 vo0_lo = vget_high_f32(vo0);
748 vo1_lo = vget_high_f32(vo1);
749 vo2_lo = vget_high_f32(vo2);
750 vo3_lo = vget_high_f32(vo3);
751 }
752 if (w & (1 * sizeof(float))) {
753 vst1_lane_f32(o3, vo3_lo, 0); o3 += 1;
754 vst1_lane_f32(o2, vo2_lo, 0); o2 += 1;
755 vst1_lane_f32(o1, vo1_lo, 0); o1 += 1;
756 vst1_lane_f32(o0, vo0_lo, 0); o0 += 1;
757 }
758 }
759 }
760
761 i0 = (const float*) ((uintptr_t) i4 - input_decrement);
762 i1 = (const float*) ((uintptr_t) i5 - input_decrement);
763 i2 = (const float*) ((uintptr_t) i1 + input_width);
764 i3 = (const float*) ((uintptr_t) i2 + input_width);
765 i4 = (const float*) ((uintptr_t) i3 + input_width);
766 i5 = (const float*) ((uintptr_t) i4 + input_width);
767 i6 = (const float*) ((uintptr_t) i5 + input_width);
768 i7 = (const float*) ((uintptr_t) i6 + input_width);
769
770 o0 = o3;
771 o1 = (float*) ((uintptr_t) o0 + input_width);
772 o2 = (float*) ((uintptr_t) o1 + input_width);
773 o3 = (float*) ((uintptr_t) o2 + input_width);
774
775 output_height = doz(output_height, 4);
776 } while (output_height != 0);
777 }
778