// Auto-generated file. Do not edit!
// Template: src/f32-dwconv2d-chw/5x5p2-wasmsimd-loadsplat.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <wasm_simd128.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


void xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top == 2);

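  // params supplies the right-edge mask (zeroes lanes past the row's valid width in the
  // final column block) and the min/max bounds used to clamp the outputs below.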
  const v128_t vmask = wasm_v128_load(params->scalar.mask);
  const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
  const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);

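  // Weight layout: weights[0] is the per-channel bias, weights[1..25] are the 5x5 filter
  // taps in row-major order. Each tap is broadcast ("loadsplat") into its own vector.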
  const v128_t vw0123 = wasm_v128_load(weights);
  const v128_t vw4567 = wasm_v128_load(weights + 4);
  const v128_t vw89AB = wasm_v128_load(weights + 8);
  const v128_t vwCDEF = wasm_v128_load(weights + 12);
  const v128_t vwGHIJ = wasm_v128_load(weights + 16);
  const v128_t vwKLMN = wasm_v128_load(weights + 20);
  const v128_t vwOP = wasm_v64x2_load_splat(weights + 24);
  const v128_t vbias = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0);
  const v128_t vk00 = wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1);
  const v128_t vk01 = wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2);
  const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3);
  const v128_t vk03 = wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0);
  const v128_t vk04 = wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1);
  const v128_t vk10 = wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2);
  const v128_t vk11 = wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3);
  const v128_t vk12 = wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0);
  const v128_t vk13 = wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1);
  const v128_t vk14 = wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2);
  const v128_t vk20 = wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3);
  const v128_t vk21 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0);
  const v128_t vk22 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1);
  const v128_t vk23 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2);
  const v128_t vk24 = wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3);
  const v128_t vk30 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0);
  const v128_t vk31 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1);
  const v128_t vk32 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2);
  const v128_t vk33 = wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3);
  const v128_t vk34 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0);
  const v128_t vk40 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1);
  const v128_t vk41 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2);
  const v128_t vk42 = wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3);
  const v128_t vk43 = wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0);
  const v128_t vk44 = wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1);

  const v128_t vzero = wasm_f32x4_splat(0.0f);

  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));

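  // padding_top == 2 is handled by aliasing the first two input rows to the all-zero row.
  // Rows i2..i6 span the 5-tap vertical window needed for the first 3 output rows.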
  const float* i0 = zero;
  const float* i1 = zero;
  const float* i2 = input;
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
  const float* i6 = (const float*) ((uintptr_t) i5 + input_width);

  float* o0 = output;
  float* o1 = (float*) ((uintptr_t) o0 + input_width);
  float* o2 = (float*) ((uintptr_t) o1 + input_width);

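  // Each pass of the outer loop produces up to 3 output rows, 4 pixels per vector
  // (the "3x4" in the kernel name).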
  size_t output_height = input_height;
  do {
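    // Near the bottom of the image, point the out-of-range input rows at the zero row and
    // alias the unused output pointers so the stores below stay in bounds.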
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
      o1 = o0;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
      o2 = o1;
    }
    if XNN_UNPREDICTABLE(output_height < 4) {
      i5 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 5) {
      i6 = zero;
    }

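    // The all-zero vectors stand in for the two columns of implicit left padding; the
    // shuffles in the loop body shift them together with the first loaded vector.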
    v128_t vi0x0123 = vzero;
    v128_t vi1x0123 = vzero;
    v128_t vi2x0123 = vzero;
    v128_t vi3x0123 = vzero;
    v128_t vi4x0123 = vzero;
    v128_t vi5x0123 = vzero;
    v128_t vi6x0123 = vzero;

    v128_t vi0x4567 = wasm_v128_load(i0); i0 += 4;
    v128_t vi1x4567 = wasm_v128_load(i1); i1 += 4;
    v128_t vi2x4567 = wasm_v128_load(i2); i2 += 4;
    v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4;
    v128_t vi4x4567 = wasm_v128_load(i4); i4 += 4;
    v128_t vi5x4567 = wasm_v128_load(i5); i5 += 4;
    v128_t vi6x4567 = wasm_v128_load(i6); i6 += 4;

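    // Main loop: while more than 8 pixels remain in the row, produce 4 output pixels for
    // each of the 3 rows per iteration. Two partial sums per row (vo*p0/vo*p1, the "acc2"
    // variant) are accumulated independently and combined before clamping.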
    size_t w = input_width;
    for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
      v128_t vo0p0 = vbias;
      v128_t vo1p0 = vbias;
      v128_t vo2p0 = vbias;

      const v128_t vi0x89AB = wasm_v128_load(i0); i0 += 4;
      const v128_t vi1x89AB = wasm_v128_load(i1); i1 += 4;
      const v128_t vi2x89AB = wasm_v128_load(i2); i2 += 4;
      const v128_t vi3x89AB = wasm_v128_load(i3); i3 += 4;
      const v128_t vi4x89AB = wasm_v128_load(i4); i4 += 4;
      const v128_t vi5x89AB = wasm_v128_load(i5); i5 += 4;
      const v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;

      v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02);
      v128_t vo1p1 = wasm_f32x4_mul(vi1x4567, vk02);
      v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02);

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk12));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk22));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x4567, vk32));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));

      const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6);
      const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
      const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6);
      const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6);
      const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
      const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6);
      const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi1x3456, vk01));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk11));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x3456, vk21));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, vk31));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, vk41));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41));

      const v128_t vi0x2345 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 2, 3, 4, 5);
      vi0x0123 = vi0x4567;
      const v128_t vi1x2345 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 2, 3, 4, 5);
      vi1x0123 = vi1x4567;
      const v128_t vi2x2345 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 2, 3, 4, 5);
      vi2x0123 = vi2x4567;
      const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5);
      vi3x0123 = vi3x4567;
      const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
      vi4x0123 = vi4x4567;
      const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);
      vi5x0123 = vi5x4567;
      const v128_t vi6x2345 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 2, 3, 4, 5);
      vi6x0123 = vi6x4567;

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x2345, vk00));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x2345, vk00));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x2345, vk00));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x2345, vk10));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, vk20));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, vk20));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, vk20));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x2345, vk30));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40));

      const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 1, 2, 3, 4);
      const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
      const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 1, 2, 3, 4);
      const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4);
      const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
      const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);
      const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi1x5678, vk03));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk13));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, vk13));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, vk13));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x5678, vk23));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x5678, vk33));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x5678, vk33));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, vk33));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x5678, vk43));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43));

      const v128_t vi0x6789 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 2, 3, 4, 5);
      vi0x4567 = vi0x89AB;
      const v128_t vi1x6789 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 2, 3, 4, 5);
      vi1x4567 = vi1x89AB;
      const v128_t vi2x6789 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 2, 3, 4, 5);
      vi2x4567 = vi2x89AB;
      const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5);
      vi3x4567 = vi3x89AB;
      const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
      vi4x4567 = vi4x89AB;
      const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5);
      vi5x4567 = vi5x89AB;
      const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
      vi6x4567 = vi6x89AB;

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x6789, vk04));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x6789, vk04));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x6789, vk04));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44));

      vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);
      vo1p0 = wasm_f32x4_add(vo1p0, vo1p1);
      vo2p0 = wasm_f32x4_add(vo2p0, vo2p1);

      v128_t vo0 = wasm_f32x4_max(vo0p0, vmin);
      v128_t vo1 = wasm_f32x4_max(vo1p0, vmin);
      v128_t vo2 = wasm_f32x4_max(vo2p0, vmin);
      vo0 = wasm_f32x4_min(vo0, vmax);
      vo1 = wasm_f32x4_min(vo1, vmax);
      vo2 = wasm_f32x4_min(vo2, vmax);

      wasm_v128_store(o2, vo2); o2 += 4;
      wasm_v128_store(o1, vo1); o1 += 4;
      wasm_v128_store(o0, vo0); o0 += 4;
    }
    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(float)) {
      v128_t vo0p0 = vbias;
      v128_t vo1p0 = vbias;
      v128_t vo2p0 = vbias;

      v128_t vi0x89AB = wasm_v128_load(i0); i0 += 4;
      v128_t vi1x89AB = wasm_v128_load(i1); i1 += 4;
      v128_t vi2x89AB = wasm_v128_load(i2); i2 += 4;
      v128_t vi3x89AB = wasm_v128_load(i3); i3 += 4;
      v128_t vi4x89AB = wasm_v128_load(i4); i4 += 4;
      v128_t vi5x89AB = wasm_v128_load(i5); i5 += 4;
      v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;

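      // 5..8 pixels remain: vi*x4567 is fully inside the row, while vi*x89AB holds the last
      // 1..4 valid pixels; mask out any lanes loaded past the end of the row.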
      vi0x89AB = wasm_v128_and(vmask, vi0x89AB);
      vi1x89AB = wasm_v128_and(vmask, vi1x89AB);
      vi2x89AB = wasm_v128_and(vmask, vi2x89AB);
      vi3x89AB = wasm_v128_and(vmask, vi3x89AB);
      vi4x89AB = wasm_v128_and(vmask, vi4x89AB);
      vi5x89AB = wasm_v128_and(vmask, vi5x89AB);
      vi6x89AB = wasm_v128_and(vmask, vi6x89AB);

      v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02);
      v128_t vo1p1 = wasm_f32x4_mul(vi1x4567, vk02);
      v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02);

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk12));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk22));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x4567, vk32));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));

      const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6);
      const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
      const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6);
      const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6);
      const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
      const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6);
      const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi1x3456, vk01));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk11));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x3456, vk21));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, vk31));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, vk41));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41));

      const v128_t vi0x2345 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 2, 3, 4, 5);
      vi0x0123 = vi0x4567;
      const v128_t vi1x2345 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 2, 3, 4, 5);
      vi1x0123 = vi1x4567;
      const v128_t vi2x2345 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 2, 3, 4, 5);
      vi2x0123 = vi2x4567;
      const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5);
      vi3x0123 = vi3x4567;
      const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
      vi4x0123 = vi4x4567;
      const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);
      vi5x0123 = vi5x4567;
      const v128_t vi6x2345 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 2, 3, 4, 5);
      vi6x0123 = vi6x4567;

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x2345, vk00));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x2345, vk00));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x2345, vk00));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x2345, vk10));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, vk20));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, vk20));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, vk20));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x2345, vk30));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40));

      const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 1, 2, 3, 4);
      const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 1, 2, 3, 4);
      const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 1, 2, 3, 4);
      const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4);
      const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4);
      const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);
      const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi1x5678, vk03));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk13));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, vk13));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, vk13));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x5678, vk23));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x5678, vk33));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x5678, vk33));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, vk33));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x5678, vk43));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43));

      const v128_t vi0x6789 = wasm_v32x4_shuffle(vi0x4567, vi0x89AB, 2, 3, 4, 5);
      vi0x4567 = vi0x89AB;
      const v128_t vi1x6789 = wasm_v32x4_shuffle(vi1x4567, vi1x89AB, 2, 3, 4, 5);
      vi1x4567 = vi1x89AB;
      const v128_t vi2x6789 = wasm_v32x4_shuffle(vi2x4567, vi2x89AB, 2, 3, 4, 5);
      vi2x4567 = vi2x89AB;
      const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5);
      vi3x4567 = vi3x89AB;
      const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 2, 3, 4, 5);
      vi4x4567 = vi4x89AB;
      const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5);
      vi5x4567 = vi5x89AB;
      const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
      vi6x4567 = vi6x89AB;

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x6789, vk04));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x6789, vk04));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x6789, vk04));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44));

      vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);
      vo1p0 = wasm_f32x4_add(vo1p0, vo1p1);
      vo2p0 = wasm_f32x4_add(vo2p0, vo2p1);

      v128_t vo0 = wasm_f32x4_max(vo0p0, vmin);
      v128_t vo1 = wasm_f32x4_max(vo1p0, vmin);
      v128_t vo2 = wasm_f32x4_max(vo2p0, vmin);
      vo0 = wasm_f32x4_min(vo0, vmax);
      vo1 = wasm_f32x4_min(vo1, vmax);
      vo2 = wasm_f32x4_min(vo2, vmax);

      wasm_v128_store(o2, vo2); o2 += 4;
      wasm_v128_store(o1, vo1); o1 += 4;
      wasm_v128_store(o0, vo0); o0 += 4;

      w -= 4 * sizeof(float);
    }
    assert(w >= 1 * sizeof(float));
    assert(w <= 4 * sizeof(float));
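    // Final block of 1..4 pixels: mask the remaining lanes of vi*x4567 and shift zeros in
    // from the right, since there are no further columns to load.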
    {
      v128_t vo0p0 = vbias;
      v128_t vo1p0 = vbias;
      v128_t vo2p0 = vbias;

      vi0x4567 = wasm_v128_and(vmask, vi0x4567);
      vi1x4567 = wasm_v128_and(vmask, vi1x4567);
      vi2x4567 = wasm_v128_and(vmask, vi2x4567);
      vi3x4567 = wasm_v128_and(vmask, vi3x4567);
      vi4x4567 = wasm_v128_and(vmask, vi4x4567);
      vi5x4567 = wasm_v128_and(vmask, vi5x4567);
      vi6x4567 = wasm_v128_and(vmask, vi6x4567);

      v128_t vo0p1 = wasm_f32x4_mul(vi0x4567, vk02);
      v128_t vo1p1 = wasm_f32x4_mul(vi1x4567, vk02);
      v128_t vo2p1 = wasm_f32x4_mul(vi2x4567, vk02);

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x4567, vk12));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk12));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, vk22));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk22));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x4567, vk32));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x4567, vk32));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x4567, vk42));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42));

      const v128_t vi0x3456 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 3, 4, 5, 6);
      const v128_t vi1x3456 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 3, 4, 5, 6);
      const v128_t vi2x3456 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 3, 4, 5, 6);
      const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6);
      const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6);
      const v128_t vi5x3456 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 3, 4, 5, 6);
      const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x3456, vk01));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi1x3456, vk01));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x3456, vk01));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x3456, vk11));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk11));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x3456, vk21));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x3456, vk21));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x3456, vk21));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x3456, vk31));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, vk31));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x3456, vk41));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x3456, vk41));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41));

      const v128_t vi0x2345 = wasm_v32x4_shuffle(vi0x0123, vi0x4567, 2, 3, 4, 5);
      const v128_t vi1x2345 = wasm_v32x4_shuffle(vi1x0123, vi1x4567, 2, 3, 4, 5);
      const v128_t vi2x2345 = wasm_v32x4_shuffle(vi2x0123, vi2x4567, 2, 3, 4, 5);
      const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5);
      const v128_t vi4x2345 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 2, 3, 4, 5);
      const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);
      const v128_t vi6x2345 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 2, 3, 4, 5);

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x2345, vk00));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x2345, vk00));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x2345, vk00));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x2345, vk10));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x2345, vk10));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x2345, vk10));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x2345, vk20));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x2345, vk20));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x2345, vk20));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x2345, vk30));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x2345, vk30));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x2345, vk40));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x2345, vk40));

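      // Shift zeros in from the right for the columns past the masked block.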
      const v128_t vi0x5678 = wasm_v32x4_shuffle(vi0x4567, vzero, 1, 2, 3, 4);
      const v128_t vi1x5678 = wasm_v32x4_shuffle(vi1x4567, vzero, 1, 2, 3, 4);
      const v128_t vi2x5678 = wasm_v32x4_shuffle(vi2x4567, vzero, 1, 2, 3, 4);
      const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vzero, 1, 2, 3, 4);
      const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vzero, 1, 2, 3, 4);
      const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vzero, 1, 2, 3, 4);
      const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4);

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi0x5678, vk03));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi1x5678, vk03));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi2x5678, vk03));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi1x5678, vk13));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x5678, vk13));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x5678, vk13));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi2x5678, vk23));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi3x5678, vk23));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi4x5678, vk23));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x5678, vk33));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x5678, vk33));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, vk33));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi4x5678, vk43));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x5678, vk43));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43));

      const v128_t vi0x6789 = wasm_v32x4_shuffle(vi0x5678, vzero, 1, 2, 3, 4);
      const v128_t vi1x6789 = wasm_v32x4_shuffle(vi1x5678, vzero, 1, 2, 3, 4);
      const v128_t vi2x6789 = wasm_v32x4_shuffle(vi2x5678, vzero, 1, 2, 3, 4);
      const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x5678, vzero, 1, 2, 3, 4);
      const v128_t vi4x6789 = wasm_v32x4_shuffle(vi4x5678, vzero, 1, 2, 3, 4);
      const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x5678, vzero, 1, 2, 3, 4);
      const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x5678, vzero, 1, 2, 3, 4);

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x6789, vk04));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x6789, vk04));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x6789, vk04));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi1x6789, vk14));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi2x6789, vk14));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x6789, vk24));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x6789, vk24));

      vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34));
      vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi4x6789, vk34));
      vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34));

      vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi4x6789, vk44));
      vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44));
      vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x6789, vk44));

      vo0p0 = wasm_f32x4_add(vo0p0, vo0p1);
      vo1p0 = wasm_f32x4_add(vo1p0, vo1p1);
      vo2p0 = wasm_f32x4_add(vo2p0, vo2p1);

      v128_t vo0 = wasm_f32x4_max(vo0p0, vmin);
      v128_t vo1 = wasm_f32x4_max(vo1p0, vmin);
      v128_t vo2 = wasm_f32x4_max(vo2p0, vmin);
      vo0 = wasm_f32x4_min(vo0, vmax);
      vo1 = wasm_f32x4_min(vo1, vmax);
      vo2 = wasm_f32x4_min(vo2, vmax);

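      // Store the remaining 1..4 outputs per row: a full vector, or a 2-wide and/or 1-wide
      // tail, depending on which bits of w are set.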
      if XNN_LIKELY(w & (4 * sizeof(float))) {
        wasm_v128_store(o2, vo2); o2 += 4;
        wasm_v128_store(o1, vo1); o1 += 4;
        wasm_v128_store(o0, vo0); o0 += 4;
      } else {
        if (w & (2 * sizeof(float))) {
          *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2;
          *((double*) o1) = wasm_f64x2_extract_lane(vo1, 0); o1 += 2;
          *((double*) o0) = wasm_f64x2_extract_lane(vo0, 0); o0 += 2;

          vo0 = wasm_v32x4_shuffle(vo0, vo0, 2, 3, 0, 1);
          vo1 = wasm_v32x4_shuffle(vo1, vo1, 2, 3, 0, 1);
          vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1);
        }
        if (w & (1 * sizeof(float))) {
          *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1;
          *o1 = wasm_f32x4_extract_lane(vo1, 0); o1 += 1;
          *o0 = wasm_f32x4_extract_lane(vo0, 0); o0 += 1;
        }
      }
    }

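    // Shift the input window down by 3 rows for the next pass: rewind i3/i4 to the start of
    // their rows (input_decrement is the total per-row advance) and rebuild i2..i6 from them.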
    i0 = (const float*) ((uintptr_t) i3 - input_decrement);
    i1 = (const float*) ((uintptr_t) i4 - input_decrement);
    i2 = (const float*) ((uintptr_t) i1 + input_width);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);
    i5 = (const float*) ((uintptr_t) i4 + input_width);
    i6 = (const float*) ((uintptr_t) i5 + input_width);

    o0 = o2;
    o1 = (float*) ((uintptr_t) o0 + input_width);
    o2 = (float*) ((uintptr_t) o1 + input_width);

    output_height = doz(output_height, 3);
  } while (output_height != 0);
}