// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/5x5p2-wasmsimd-loadsplat.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <wasm_simd128.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>

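// Depthwise 2D convolution over a CHW-layout tensor with a 5x5 kernel and
// "same" padding of 2. Naming: "loadsplat" pre-broadcasts every kernel tap
// into its own vector; "4x4" produces 4 output rows x 4 output pixels per
// step; "acc2" splits each row's sum across 2 accumulators to shorten the
// dependency chain; the "x86" infix selects a clamping sequence that lowers
// well on x86.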
void xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top == 2);

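  // params->scalar.mask zeroes the input lanes that lie beyond the valid row
  // width in the remainder blocks; min/max define the output clamping range.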
  const v128_t vmask = wasm_v128_load(params->scalar.mask);
  const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
  const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);

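  // "loadsplat": load the bias and all 25 kernel taps once, splatting each
  // into every lane, so the pixel loop needs no weight loads. Weights are
  // laid out as [bias, k00, k01, ..., k44] (row-major 5x5).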
  const v128_t vw0123 = wasm_v128_load(weights);
  const v128_t vw4567 = wasm_v128_load(weights + 4);
  const v128_t vw89AB = wasm_v128_load(weights + 8);
  const v128_t vwCDEF = wasm_v128_load(weights + 12);
  const v128_t vwGHIJ = wasm_v128_load(weights + 16);
  const v128_t vwKLMN = wasm_v128_load(weights + 20);
  const v128_t vwOP = wasm_v64x2_load_splat(weights + 24);
  const v128_t vbias = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0);
  const v128_t vk00 = wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1);
  const v128_t vk01 = wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2);
  const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3);
  const v128_t vk03 = wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0);
  const v128_t vk04 = wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1);
  const v128_t vk10 = wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2);
  const v128_t vk11 = wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3);
  const v128_t vk12 = wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0);
  const v128_t vk13 = wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1);
  const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2);
  const v128_t vk20 = wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3);
  const v128_t vk21 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0);
  const v128_t vk22 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1);
  const v128_t vk23 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2);
  const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3);
  const v128_t vk30 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0);
  const v128_t vk31 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1);
  const v128_t vk32 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2);
  const v128_t vk33 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3);
  const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0);
  const v128_t vk40 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1);
  const v128_t vk41 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2);
  const v128_t vk42 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3);
  const v128_t vk43 = wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0);
  const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1);

  const v128_t vzero = wasm_f32x4_splat(0.0f);

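  // Step back to the start of a row: the row width rounded up to a whole
  // number of 4-float vectors, since the pointers advance in 4-float steps.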
  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));

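  // Computing 4 output rows of a 5-tall kernel consumes 8 input rows.
  // i0 and i1 alias the zero buffer to implement padding_top == 2.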
  const float* i0 = zero;
  const float* i1 = zero;
  const float* i2 = input;
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
  const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
  const float* i7 = (const float*) ((uintptr_t) i6 + input_width);

  float* o0 = output;
  float* o1 = (float*) ((uintptr_t) o0 + input_width);
  float* o2 = (float*) ((uintptr_t) o1 + input_width);
  float* o3 = (float*) ((uintptr_t) o2 + input_width);

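  // Walk the image in groups of up to 4 output rows. When fewer rows remain,
  // surplus input rows alias the zero buffer and surplus output pointers
  // alias the last valid row; since o0 is stored last, its result overwrites
  // any redundant stores made through the aliases.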
  size_t output_height = input_height;
  do {
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
      o1 = o0;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
      o2 = o1;
    }
    if XNN_UNPREDICTABLE(output_height < 4) {
      i5 = zero;
      o3 = o2;
    }
    if XNN_UNPREDICTABLE(output_height < 5) {
      i6 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 6) {
      i7 = zero;
    }

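    // The two leftmost taps of each window fall into left padding; start
    // each row with an all-zero "previous columns" vector.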
    v128_t vi0x0123 = vzero;
    v128_t vi1x0123 = vzero;
    v128_t vi2x0123 = vzero;
    v128_t vi3x0123 = vzero;
    v128_t vi4x0123 = vzero;
    v128_t vi5x0123 = vzero;
    v128_t vi6x0123 = vzero;
    v128_t vi7x0123 = vzero;

    v128_t vi0x4567 = wasm_v128_load(i0); i0 += 4;
    v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4;
    v128_t vi2x4567 = wasm_v128_load(i2); i2 += 4;
    v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4;
    v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;
    v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4;
    v128_t vi6x4567 = wasm_v128_load(i6); i6 += 4;
    v128_t vi7x4567 = wasm_v128_load(i7); i7 += 4;

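    // Main loop: emit 4 output columns per row while more than 8 input
    // columns remain; the final 5..8 columns go to the masked tail blocks.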
    size_t w = input_width;
    for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
      v128_t vo0p0 = vbias;
      v128_t vo1p0 = vbias;
      v128_t vo2p0 = vbias;
      v128_t vo3p0 = vbias;

      const v128_t vi0x89AB = wasm_v128_load(i0); i0 += 4;
      const v128_t vi1x89AB = wasm_v128_load(i1); i1 += 4;
      const v128_t vi2x89AB = wasm_v128_load(i2); i2 += 4;
      const v128_t vi3x89AB = wasm_v128_load(i3); i3 += 4;
      const v128_t vi4x89AB = wasm_v128_load(i4); i4 += 4;
      const v128_t vi5x89AB = wasm_v128_load(i5); i5 += 4;
      const v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;
      const v128_t vi7x89AB = wasm_v128_load(i7); i7 += 4;

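      // Each output vector sums 25 tap products taken over 5 shifted input
      // windows (x2345, x3456, x4567, x5678, x6789), alternating between the
      // two accumulators p0 and p1.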
      v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02);
      v128_t vo1p1 = wasm_f32x4_mul(vi1x4567, vk02);
      v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02);
      v128_t vo3p1 = wasm_f32x4_mul(vi3x4567, vk02);

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk12));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, vk12));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk22));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk22));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x4567, vk32));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi6x4567, vk32));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42));

      const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6);
      const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
      const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6);
      const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6);
      const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
      const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6);
      const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6);
      const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi1x3456, vk01));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi3x3456, vk01));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk11));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk11));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x3456, vk21));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi5x3456, vk21));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, vk31));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, vk41));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, vk41));

      const v128_t vi0x2345 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 2, 3, 4, 5);
      vi0x0123 = vi0x4567;
      const v128_t vi1x2345 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 2, 3, 4, 5);
      vi1x0123 = vi1x4567;
      const v128_t vi2x2345 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 2, 3, 4, 5);
      vi2x0123 = vi2x4567;
      const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5);
      vi3x0123 = vi3x4567;
      const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
      vi4x0123 = vi4x4567;
      const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);
      vi5x0123 = vi5x4567;
      const v128_t vi6x2345 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 2, 3, 4, 5);
      vi6x0123 = vi6x4567;
      const v128_t vi7x2345 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 2, 3, 4, 5);
      vi7x0123 = vi7x4567;

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x2345, vk00));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x2345, vk00));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x2345, vk00));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x2345, vk00));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x2345, vk10));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x2345, vk10));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, vk20));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, vk20));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, vk20));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x2345, vk30));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi6x2345, vk30));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x2345, vk40));

      const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 1, 2, 3, 4);
      const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
      const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 1, 2, 3, 4);
      const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4);
      const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
      const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);
      const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
      const v128_t vi7x5678 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 1, 2, 3, 4);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi1x5678, vk03));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi3x5678, vk03));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk13));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, vk13));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, vk13));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, vk13));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x5678, vk23));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi5x5678, vk23));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x5678, vk33));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x5678, vk33));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, vk33));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x5678, vk43));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x5678, vk43));

      const v128_t vi0x6789 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 2, 3, 4, 5);
      vi0x4567 = vi0x89AB;
      const v128_t vi1x6789 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 2, 3, 4, 5);
      vi1x4567 = vi1x89AB;
      const v128_t vi2x6789 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 2, 3, 4, 5);
      vi2x4567 = vi2x89AB;
      const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5);
      vi3x4567 = vi3x89AB;
      const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
      vi4x4567 = vi4x89AB;
      const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5);
      vi5x4567 = vi5x89AB;
      const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
      vi6x4567 = vi6x89AB;
      const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5);
      vi7x4567 = vi7x89AB;

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x6789, vk04));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x6789, vk04));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x6789, vk04));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x6789, vk04));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x6789, vk14));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi6x6789, vk34));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44));

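      // Merge the two partial accumulators of each row.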
      vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);
      vo1p0 = wasm_f32x4_add(vo1p0, vo1p1);
      vo2p0 = wasm_f32x4_add(vo2p0, vo2p1);
      vo3p0 = wasm_f32x4_add(vo3p0, vo3p1);

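      // Clamp to [min, max] via compare + bitselect; for this x86-tuned
      // variant that is presumably cheaper to lower than wasm_f32x4_min/max.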
      v128_t vo0 = wasm_v128_bitselect(vmin, vo0p0, wasm_f32x4_lt(vo0p0, vmin));
      v128_t vo1 = wasm_v128_bitselect(vmin, vo1p0, wasm_f32x4_lt(vo1p0, vmin));
      v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin));
      v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin));
      vo0 = wasm_v128_bitselect(vo0, vmax, wasm_f32x4_le(vo0, vmax));
      vo1 = wasm_v128_bitselect(vo1, vmax, wasm_f32x4_le(vo1, vmax));
      vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax));
      vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax));

      wasm_v128_store(o3, vo3); o3 += 4;
      wasm_v128_store(o2, vo2); o2 += 4;
      wasm_v128_store(o1, vo1); o1 += 4;
      wasm_v128_store(o0, vo0); o0 += 4;
    }
    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(float)) {
      v128_t vo0p0 = vbias;
      v128_t vo1p0 = vbias;
      v128_t vo2p0 = vbias;
      v128_t vo3p0 = vbias;

      v128_t vi0x89AB = wasm_v128_load(i0); i0 += 4;
      v128_t vi1x89AB = wasm_v128_load(i1); i1 += 4;
      v128_t vi2x89AB = wasm_v128_load(i2); i2 += 4;
      v128_t vi3x89AB = wasm_v128_load(i3); i3 += 4;
      v128_t vi4x89AB = wasm_v128_load(i4); i4 += 4;
      v128_t vi5x89AB = wasm_v128_load(i5); i5 += 4;
      v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;
      v128_t vi7x89AB = wasm_v128_load(i7); i7 += 4;

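      // Zero the lanes of the freshly loaded block that lie past the end of
      // the row, so they contribute nothing to the products below.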
      vi0x89AB = wasm_v128_and(vmask, vi0x89AB);
      vi1x89AB = wasm_v128_and(vmask, vi1x89AB);
      vi2x89AB = wasm_v128_and(vmask, vi2x89AB);
      vi3x89AB = wasm_v128_and(vmask, vi3x89AB);
      vi4x89AB = wasm_v128_and(vmask, vi4x89AB);
      vi5x89AB = wasm_v128_and(vmask, vi5x89AB);
      vi6x89AB = wasm_v128_and(vmask, vi6x89AB);
      vi7x89AB = wasm_v128_and(vmask, vi7x89AB);

      v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02);
      v128_t vo1p1 = wasm_f32x4_mul(vi1x4567, vk02);
      v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02);
      v128_t vo3p1 = wasm_f32x4_mul(vi3x4567, vk02);

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk12));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, vk12));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk22));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk22));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x4567, vk32));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi6x4567, vk32));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42));

      const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6);
      const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
      const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6);
      const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6);
      const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
      const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6);
      const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6);
      const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi1x3456, vk01));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi3x3456, vk01));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk11));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk11));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x3456, vk21));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi5x3456, vk21));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, vk31));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, vk41));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, vk41));

      const v128_t vi0x2345 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 2, 3, 4, 5);
      vi0x0123 = vi0x4567;
      const v128_t vi1x2345 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 2, 3, 4, 5);
      vi1x0123 = vi1x4567;
      const v128_t vi2x2345 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 2, 3, 4, 5);
      vi2x0123 = vi2x4567;
      const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5);
      vi3x0123 = vi3x4567;
      const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
      vi4x0123 = vi4x4567;
      const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);
      vi5x0123 = vi5x4567;
      const v128_t vi6x2345 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 2, 3, 4, 5);
      vi6x0123 = vi6x4567;
      const v128_t vi7x2345 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 2, 3, 4, 5);
      vi7x0123 = vi7x4567;

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x2345, vk00));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x2345, vk00));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x2345, vk00));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x2345, vk00));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x2345, vk10));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x2345, vk10));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, vk20));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, vk20));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, vk20));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x2345, vk30));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi6x2345, vk30));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x2345, vk40));

      const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 1, 2, 3, 4);
      const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
      const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 1, 2, 3, 4);
      const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4);
      const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
      const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);
      const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
      const v128_t vi7x5678 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 1, 2, 3, 4);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi1x5678, vk03));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi3x5678, vk03));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk13));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, vk13));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, vk13));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, vk13));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x5678, vk23));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi5x5678, vk23));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x5678, vk33));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x5678, vk33));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, vk33));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x5678, vk43));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x5678, vk43));

      const v128_t vi0x6789 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 2, 3, 4, 5);
      vi0x4567 = vi0x89AB;
      const v128_t vi1x6789 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 2, 3, 4, 5);
      vi1x4567 = vi1x89AB;
      const v128_t vi2x6789 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 2, 3, 4, 5);
      vi2x4567 = vi2x89AB;
      const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5);
      vi3x4567 = vi3x89AB;
      const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
      vi4x4567 = vi4x89AB;
      const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5);
      vi5x4567 = vi5x89AB;
      const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
      vi6x4567 = vi6x89AB;
      const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5);
      vi7x4567 = vi7x89AB;

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x6789, vk04));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x6789, vk04));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x6789, vk04));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x6789, vk04));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x6789, vk14));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi6x6789, vk34));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44));

      vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);
      vo1p0 = wasm_f32x4_add(vo1p0, vo1p1);
      vo2p0 = wasm_f32x4_add(vo2p0, vo2p1);
      vo3p0 = wasm_f32x4_add(vo3p0, vo3p1);

      v128_t vo0 = wasm_v128_bitselect(vmin, vo0p0, wasm_f32x4_lt(vo0p0, vmin));
      v128_t vo1 = wasm_v128_bitselect(vmin, vo1p0, wasm_f32x4_lt(vo1p0, vmin));
      v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin));
      v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin));
      vo0 = wasm_v128_bitselect(vo0, vmax, wasm_f32x4_le(vo0, vmax));
      vo1 = wasm_v128_bitselect(vo1, vmax, wasm_f32x4_le(vo1, vmax));
      vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax));
      vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax));

      wasm_v128_store(o3, vo3); o3 += 4;
      wasm_v128_store(o2, vo2); o2 += 4;
      wasm_v128_store(o1, vo1); o1 += 4;
      wasm_v128_store(o0, vo0); o0 += 4;

      w -= 4 * sizeof(float);
    }
    assert(w >= 1 * sizeof(float));
    assert(w <= 4 * sizeof(float));
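    // Final 1..4 columns: vi*x4567 holds the last, possibly partial, block;
    // mask off the lanes past the end of the row before using it.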
    {
      v128_t vo0p0 = vbias;
      v128_t vo1p0 = vbias;
      v128_t vo2p0 = vbias;
      v128_t vo3p0 = vbias;

      vi0x4567 = wasm_v128_and(vmask, vi0x4567);
      vi1x4567 = wasm_v128_and(vmask, vi1x4567);
      vi2x4567 = wasm_v128_and(vmask, vi2x4567);
      vi3x4567 = wasm_v128_and(vmask, vi3x4567);
      vi4x4567 = wasm_v128_and(vmask, vi4x4567);
      vi5x4567 = wasm_v128_and(vmask, vi5x4567);
      vi6x4567 = wasm_v128_and(vmask, vi6x4567);
      vi7x4567 = wasm_v128_and(vmask, vi7x4567);

      v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02);
      v128_t vo1p1 = wasm_f32x4_mul(vi1x4567, vk02);
      v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02);
      v128_t vo3p1 = wasm_f32x4_mul(vi3x4567, vk02);

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk12));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x4567, vk12));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk22));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, vk22));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x4567, vk32));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi6x4567, vk32));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x4567, vk42));

      const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6);
      const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
      const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6);
      const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6);
      const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
      const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6);
      const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6);
      const v128_t vi7x3456 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 3, 4, 5, 6);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi1x3456, vk01));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi3x3456, vk01));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk11));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk11));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x3456, vk21));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi5x3456, vk21));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, vk31));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, vk41));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x3456, vk41));

      const v128_t vi0x2345 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 2, 3, 4, 5);
      const v128_t vi1x2345 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 2, 3, 4, 5);
      const v128_t vi2x2345 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 2, 3, 4, 5);
      const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5);
      const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
      const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);
      const v128_t vi6x2345 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 2, 3, 4, 5);
      const v128_t vi7x2345 = wasm_v32x4_shuffle(vi7x0123, vi7x4567, 2, 3, 4, 5);

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x2345, vk00));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x2345, vk00));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x2345, vk00));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x2345, vk00));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x2345, vk10));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x2345, vk10));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, vk20));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, vk20));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, vk20));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x2345, vk30));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi6x2345, vk30));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x2345, vk40));

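      // No columns remain to the right; shift zeros in instead of loading.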
      const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vzero, 1, 2, 3, 4);
      const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);
      const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vzero, 1, 2, 3, 4);
      const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vzero, 1, 2, 3, 4);
      const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vzero, 1, 2, 3, 4);
      const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vzero, 1, 2, 3, 4);
      const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4);
      const v128_t vi7x5678 = wasm_v32x4_shuffle(vi7x4567, vzero, 1, 2, 3, 4);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi1x5678, vk03));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi3x5678, vk03));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk13));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, vk13));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, vk13));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, vk13));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x5678, vk23));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi5x5678, vk23));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x5678, vk33));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x5678, vk33));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, vk33));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x5678, vk43));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi7x5678, vk43));

      const v128_t vi0x6789 = wasm_v32x4_shuffle(vi0x5678, vzero, 1, 2, 3, 4);
      const v128_t vi1x6789 = wasm_v32x4_shuffle(vi1x5678, vzero, 1, 2, 3, 4);
      const v128_t vi2x6789 = wasm_v32x4_shuffle(vi2x5678, vzero, 1, 2, 3, 4);
      const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x5678, vzero, 1, 2, 3, 4);
      const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x5678, vzero, 1, 2, 3, 4);
      const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x5678, vzero, 1, 2, 3, 4);
      const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x5678, vzero, 1, 2, 3, 4);
      const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x5678, vzero, 1, 2, 3, 4);

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x6789, vk04));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x6789, vk04));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x6789, vk04));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x6789, vk04));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi4x6789, vk14));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34));
      vo3p1 = wasm_f32x4_add(vo3p1, wasm_f32x4_mul(vi6x6789, vk34));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44));
      vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44));

      vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);
      vo1p0 = wasm_f32x4_add(vo1p0, vo1p1);
      vo2p0 = wasm_f32x4_add(vo2p0, vo2p1);
      vo3p0 = wasm_f32x4_add(vo3p0, vo3p1);

      v128_t vo0 = wasm_v128_bitselect(vmin, vo0p0, wasm_f32x4_lt(vo0p0, vmin));
      v128_t vo1 = wasm_v128_bitselect(vmin, vo1p0, wasm_f32x4_lt(vo1p0, vmin));
      v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin));
      v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin));
      vo0 = wasm_v128_bitselect(vo0, vmax, wasm_f32x4_le(vo0, vmax));
      vo1 = wasm_v128_bitselect(vo1, vmax, wasm_f32x4_le(vo1, vmax));
      vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax));
      vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax));

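      // Store only as many of the 4 computed pixels as remain in the row:
      // 4 at once, otherwise 2 and/or 1 with a lane rotation in between.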
756 if XNN_LIKELY(w & (4 * sizeof(float))) {
757 wasm_v128_store(o3, vo3); o3 += 4;
758 wasm_v128_store(o2, vo2); o2 += 4;
759 wasm_v128_store(o1, vo1); o1 += 4;
760 wasm_v128_store(o0, vo0); o0 += 4;
761 } else {
762 if (w & (2 * sizeof(float))) {
763 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); o3 += 2;
764 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2;
765 *((double*) o1) = wasm_f64x2_extract_lane(vo1, 0); o1 += 2;
766 *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); o0 += 2;
767
768 vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1);
769 vo1 = wasm_v32x4_shuffle(vo1, vo1, 2, 3, 0, 1);
770 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1);
771 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1);
772 }
773 if (w & (1 * sizeof(float))) {
774 *o3 = wasm_f32x4_extract_lane(vo3, 0); o3 += 1;
775 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1;
776 *o1 = wasm_f32x4_extract_lane(vo1, 0); o1 += 1;
777 *o0 = wasm_f32x4_extract_lane(vo0, 0); o0 += 1;
778 }
779 }
780 }
781
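    // Slide the window down 4 rows: the rows scanned as i4/i5 become the new
    // i0/i1 (rewound to the start of the row), and the rest follow at
    // input_width strides.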
    i0 = (const float*) ((uintptr_t) i4 - input_decrement);
    i1 = (const float*) ((uintptr_t) i5 - input_decrement);
    i2 = (const float*) ((uintptr_t) i1 + input_width);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);
    i5 = (const float*) ((uintptr_t) i4 + input_width);
    i6 = (const float*) ((uintptr_t) i5 + input_width);
    i7 = (const float*) ((uintptr_t) i6 + input_width);

    o0 = o3;
    o1 = (float*) ((uintptr_t) o0 + input_width);
    o2 = (float*) ((uintptr_t) o1 + input_width);
    o3 = (float*) ((uintptr_t) o2 + input_width);

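    // doz() is a saturating "decrement or zero" subtraction.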
    output_height = doz(output_height, 4);
  } while (output_height != 0);
}