// Auto-generated file. Do not edit!
//   Template: src/f32-dwconv2d-chw/5x5p2-sse.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>


void xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4(
    size_t input_height,
    size_t input_width,
    const float* input,
    const float* weights,
    const float* zero,
    float* output,
    uint32_t padding_top,
    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_height != 0);
  assert(input_width != 0);
  assert(input_width % sizeof(float) == 0);
  assert(padding_top == 2);

  const __m128 vmask = _mm_load_ps((const float*) params->sse.mask);
  const __m128 vmax = _mm_load_ps(params->sse.max);
  const __m128 vmin = _mm_load_ps(params->sse.min);

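  // The packed weights hold the bias followed by the 25 kernel taps in row-major
  // order (k00..k04, k10..k14, ..., k44); each scalar is broadcast to all four lanes.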
  const __m128 vbias = _mm_load1_ps(weights);
  const __m128 vk00 = _mm_load1_ps(weights + 1);
  const __m128 vk01 = _mm_load1_ps(weights + 2);
  const __m128 vk02 = _mm_load1_ps(weights + 3);
  const __m128 vk03 = _mm_load1_ps(weights + 4);
  const __m128 vk04 = _mm_load1_ps(weights + 5);
  const __m128 vk10 = _mm_load1_ps(weights + 6);
  const __m128 vk11 = _mm_load1_ps(weights + 7);
  const __m128 vk12 = _mm_load1_ps(weights + 8);
  const __m128 vk13 = _mm_load1_ps(weights + 9);
  const __m128 vk14 = _mm_load1_ps(weights + 10);
  const __m128 vk20 = _mm_load1_ps(weights + 11);
  const __m128 vk21 = _mm_load1_ps(weights + 12);
  const __m128 vk22 = _mm_load1_ps(weights + 13);
  const __m128 vk23 = _mm_load1_ps(weights + 14);
  const __m128 vk24 = _mm_load1_ps(weights + 15);
  const __m128 vk30 = _mm_load1_ps(weights + 16);
  const __m128 vk31 = _mm_load1_ps(weights + 17);
  const __m128 vk32 = _mm_load1_ps(weights + 18);
  const __m128 vk33 = _mm_load1_ps(weights + 19);
  const __m128 vk34 = _mm_load1_ps(weights + 20);
  const __m128 vk40 = _mm_load1_ps(weights + 21);
  const __m128 vk41 = _mm_load1_ps(weights + 22);
  const __m128 vk42 = _mm_load1_ps(weights + 23);
  const __m128 vk43 = _mm_load1_ps(weights + 24);
  const __m128 vk44 = _mm_load1_ps(weights + 25);

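  // Bytes each input row pointer advances over one row: input_width rounded up to a
  // whole 4-float tile. Subtracting it rewinds a pointer to the start of its row.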
  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));

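  // Nine input rows feed five output rows of the 5-tap vertical window; the first two
  // rows read from the zero buffer, implementing padding_top == 2.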
  const float* i0 = zero;
  const float* i1 = zero;
  const float* i2 = input;
  const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
  const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
  const float* i6 = (const float*) ((uintptr_t) i5 + input_width);
  const float* i7 = (const float*) ((uintptr_t) i6 + input_width);
  const float* i8 = (const float*) ((uintptr_t) i7 + input_width);

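  // Five consecutive output rows are produced per iteration of the outer loop.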
  float* o0 = output;
  float* o1 = (float*) ((uintptr_t) o0 + input_width);
  float* o2 = (float*) ((uintptr_t) o1 + input_width);
  float* o3 = (float*) ((uintptr_t) o2 + input_width);
  float* o4 = (float*) ((uintptr_t) o3 + input_width);

  size_t output_height = input_height;
  do {
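    // With fewer than 5 output rows left, redirect the unused input rows to the zero
    // buffer and alias the unused output rows; the valid row is stored last, so it wins.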
    if XNN_UNPREDICTABLE(output_height < 2) {
      i3 = zero;
      o1 = o0;
    }
    if XNN_UNPREDICTABLE(output_height < 3) {
      i4 = zero;
      o2 = o1;
    }
    if XNN_UNPREDICTABLE(output_height < 4) {
      i5 = zero;
      o3 = o2;
    }
    if XNN_UNPREDICTABLE(output_height < 5) {
      i6 = zero;
      o4 = o3;
    }
    if XNN_UNPREDICTABLE(output_height < 6) {
      i7 = zero;
    }
    if XNN_UNPREDICTABLE(output_height < 7) {
      i8 = zero;
    }

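    // vi*x3012 carry the input columns to the left of the current tile (only lanes 0
    // and 3 are consumed below); starting from zero implements the two-column left padding.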
    __m128 vi0x3012 = _mm_setzero_ps();
    __m128 vi1x3012 = _mm_setzero_ps();
    __m128 vi2x3012 = _mm_setzero_ps();
    __m128 vi3x3012 = _mm_setzero_ps();
    __m128 vi4x3012 = _mm_setzero_ps();
    __m128 vi5x3012 = _mm_setzero_ps();
    __m128 vi6x3012 = _mm_setzero_ps();
    __m128 vi7x3012 = _mm_setzero_ps();
    __m128 vi8x3012 = _mm_setzero_ps();

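    // Prime the pipeline with the first 4-pixel tile of every input row.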
    __m128 vi0x4567 = _mm_loadu_ps(i0);
    i0 += 4;
    __m128 vi1x4567 = _mm_loadu_ps(i1);
    i1 += 4;
    __m128 vi2x4567 = _mm_loadu_ps(i2);
    i2 += 4;
    __m128 vi3x4567 = _mm_loadu_ps(i3);
    i3 += 4;
    __m128 vi4x4567 = _mm_loadu_ps(i4);
    i4 += 4;
    __m128 vi5x4567 = _mm_loadu_ps(i5);
    i5 += 4;
    __m128 vi6x4567 = _mm_loadu_ps(i6);
    i6 += 4;
    __m128 vi7x4567 = _mm_loadu_ps(i7);
    i7 += 4;
    __m128 vi8x4567 = _mm_loadu_ps(i8);
    i8 += 4;

    size_t w = input_width;
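    // Main loop: each iteration computes a 4-pixel tile of all 5 output rows. It runs
    // while more than 8 pixels remain, so the unmasked load of the following tile
    // (x89AB) below never reads past the end of the row.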
    for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
      __m128 vo4p0 = _mm_add_ps(vbias, _mm_mul_ps(vi4x4567, vk02));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x4567, vk12));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x4567, vk22));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x4567, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x4567, vk42));

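      // Rotate each tile right by one lane (x4567 -> x7456); together with x3012 this
      // produces the left-shifted windows x3456 and x2345 below.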
      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8x7456 = _mm_shuffle_ps(vi8x4567, vi8x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x89AB = _mm_loadu_ps(i0);
      i0 += 4;
      const __m128 vi1x89AB = _mm_loadu_ps(i1);
      i1 += 4;
      const __m128 vi2x89AB = _mm_loadu_ps(i2);
      i2 += 4;
      const __m128 vi3x89AB = _mm_loadu_ps(i3);
      i3 += 4;
      const __m128 vi4x89AB = _mm_loadu_ps(i4);
      i4 += 4;
      const __m128 vi5x89AB = _mm_loadu_ps(i5);
      i5 += 4;
      const __m128 vi6x89AB = _mm_loadu_ps(i6);
      i6 += 4;
      const __m128 vi7x89AB = _mm_loadu_ps(i7);
      i7 += 4;
      const __m128 vi8x89AB = _mm_loadu_ps(i8);
      i8 += 4;

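      // x3456: splice the previous tile's last column (lane 0 of x3012) into the rotated
      // tile; these windows feed the k*1 taps.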
      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
      const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
      const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
      const __m128 vi8x3456 = _mm_move_ss(vi8x7456, vi8x3012);

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x3456, vk11));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x3456, vk31));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x3456, vk41));

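      // x2345: the window two columns to the left, assembled from x3012 and x7456. The
      // rotated tile then becomes the next iteration's x3012.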
      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi0x3012 = vi0x7456;
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi1x3012 = vi1x7456;
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi2x3012 = vi2x7456;
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi3x3012 = vi3x7456;
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi4x3012 = vi4x7456;
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi5x3012 = vi5x7456;
      const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi6x3012 = vi6x7456;
      const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi7x3012 = vi7x7456;
      const __m128 vi8x2345 = _mm_shuffle_ps(vi8x3012, vi8x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi8x3012 = vi8x7456;

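      // x8567: splice the first column of the next tile into lane 0, then advance the
      // tile registers (x4567 <- x89AB).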
      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
      vi0x4567 = vi0x89AB;
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
      vi1x4567 = vi1x89AB;
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
      vi2x4567 = vi2x89AB;
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
      vi3x4567 = vi3x89AB;
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
      vi4x4567 = vi4x89AB;
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
      vi5x4567 = vi5x89AB;
      const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
      vi6x4567 = vi6x89AB;
      const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
      vi7x4567 = vi7x89AB;
      const __m128 vi8x8567 = _mm_move_ss(vi8x4567, vi8x89AB);
      vi8x4567 = vi8x89AB;

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x2345, vk00));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x2345, vk20));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x2345, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x2345, vk40));

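      // x5678: the window one column to the right, with column 8 taken from the next
      // tile; these feed the k*3 taps.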
      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi8x5678 = _mm_shuffle_ps(vi8x8567, vi8x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x5678, vk13));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x5678, vk33));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x5678, vk43));

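      // x6789: the window two columns to the right, built from x5678 and the next tile;
      // these feed the k*4 taps.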
      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi8x6789 = _mm_shuffle_ps(vi8x5678, vi8x89AB, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x6789, vk04));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x6789, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x6789, vk24));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x6789, vk44));


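      // Clamp the accumulators to [min, max] and store one 4-pixel tile of each of the
      // 5 output rows.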
      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);
      __m128 vo4 = _mm_max_ps(vo4p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);
      vo4 = _mm_min_ps(vo4, vmax);

      _mm_storeu_ps(o4, vo4);
      o4 += 4;
      _mm_storeu_ps(o3, vo3);
      o3 += 4;
      _mm_storeu_ps(o2, vo2);
      o2 += 4;
      _mm_storeu_ps(o1, vo1);
      o1 += 4;
      _mm_storeu_ps(o0, vo0);
      o0 += 4;
    }
    // Always process the last block of 5..8 pixels.
    if XNN_LIKELY(w > 4 * sizeof(float)) {
      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
      __m128 vo4p0 = _mm_add_ps(vbias, _mm_mul_ps(vi4x4567, vk02));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x4567, vk12));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x4567, vk22));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x4567, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8x7456 = _mm_shuffle_ps(vi8x4567, vi8x4567, _MM_SHUFFLE(2, 1, 0, 3));

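      // Load the final (possibly partial) next tile and zero any lanes past the end of
      // the row so they do not contribute to the taps.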
      const __m128 vi0x89AB = _mm_and_ps(_mm_loadu_ps(i0), vmask);
      i0 += 4;
      const __m128 vi1x89AB = _mm_and_ps(_mm_loadu_ps(i1), vmask);
      i1 += 4;
      const __m128 vi2x89AB = _mm_and_ps(_mm_loadu_ps(i2), vmask);
      i2 += 4;
      const __m128 vi3x89AB = _mm_and_ps(_mm_loadu_ps(i3), vmask);
      i3 += 4;
      const __m128 vi4x89AB = _mm_and_ps(_mm_loadu_ps(i4), vmask);
      i4 += 4;
      const __m128 vi5x89AB = _mm_and_ps(_mm_loadu_ps(i5), vmask);
      i5 += 4;
      const __m128 vi6x89AB = _mm_and_ps(_mm_loadu_ps(i6), vmask);
      i6 += 4;
      const __m128 vi7x89AB = _mm_and_ps(_mm_loadu_ps(i7), vmask);
      i7 += 4;
      const __m128 vi8x89AB = _mm_and_ps(_mm_loadu_ps(i8), vmask);
      i8 += 4;

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
      const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
      const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
      const __m128 vi8x3456 = _mm_move_ss(vi8x7456, vi8x3012);

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x3456, vk11));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x3456, vk31));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi0x3012 = vi0x7456;
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi1x3012 = vi1x7456;
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi2x3012 = vi2x7456;
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi3x3012 = vi3x7456;
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi4x3012 = vi4x7456;
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi5x3012 = vi5x7456;
      const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi6x3012 = vi6x7456;
      const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi7x3012 = vi7x7456;
      const __m128 vi8x2345 = _mm_shuffle_ps(vi8x3012, vi8x7456, _MM_SHUFFLE(2, 1, 0, 3));
      vi8x3012 = vi8x7456;

      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vi0x89AB);
      vi0x4567 = vi0x89AB;
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vi1x89AB);
      vi1x4567 = vi1x89AB;
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vi2x89AB);
      vi2x4567 = vi2x89AB;
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vi3x89AB);
      vi3x4567 = vi3x89AB;
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vi4x89AB);
      vi4x4567 = vi4x89AB;
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);
      vi5x4567 = vi5x89AB;
      const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vi6x89AB);
      vi6x4567 = vi6x89AB;
      const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vi7x89AB);
      vi7x4567 = vi7x89AB;
      const __m128 vi8x8567 = _mm_move_ss(vi8x4567, vi8x89AB);
      vi8x4567 = vi8x89AB;

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x2345, vk00));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x2345, vk20));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x2345, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi8x5678 = _mm_shuffle_ps(vi8x8567, vi8x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x5678, vk13));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x5678, vk33));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vi0x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vi1x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vi2x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vi3x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi8x6789 = _mm_shuffle_ps(vi8x5678, vi8x89AB, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x6789, vk04));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x6789, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x6789, vk24));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x6789, vk44));


      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);
      __m128 vo4 = _mm_max_ps(vo4p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);
      vo4 = _mm_min_ps(vo4, vmax);

      _mm_storeu_ps(o4, vo4);
      o4 += 4;
      _mm_storeu_ps(o3, vo3);
      o3 += 4;
      _mm_storeu_ps(o2, vo2);
      o2 += 4;
      _mm_storeu_ps(o1, vo1);
      o1 += 4;
      _mm_storeu_ps(o0, vo0);
      o0 += 4;

      w -= 4 * sizeof(float);
    }
    assert(w >= 1 * sizeof(float));
    assert(w <= 4 * sizeof(float));
    {
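      // Last 1..4 pixels: the current tile itself may extend past the end of the row,
      // so zero its out-of-range lanes before using it.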
      vi0x4567 = _mm_and_ps(vi0x4567, vmask);
      vi1x4567 = _mm_and_ps(vi1x4567, vmask);
      vi2x4567 = _mm_and_ps(vi2x4567, vmask);
      vi3x4567 = _mm_and_ps(vi3x4567, vmask);
      vi4x4567 = _mm_and_ps(vi4x4567, vmask);
      vi5x4567 = _mm_and_ps(vi5x4567, vmask);
      vi6x4567 = _mm_and_ps(vi6x4567, vmask);
      vi7x4567 = _mm_and_ps(vi7x4567, vmask);
      vi8x4567 = _mm_and_ps(vi8x4567, vmask);

      __m128 vo0p0 = _mm_add_ps(vbias, _mm_mul_ps(vi0x4567, vk02));
      __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02));
      __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02));
      __m128 vo3p0 = _mm_add_ps(vbias, _mm_mul_ps(vi3x4567, vk02));
      __m128 vo4p0 = _mm_add_ps(vbias, _mm_mul_ps(vi4x4567, vk02));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x4567, vk12));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x4567, vk12));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x4567, vk12));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x4567, vk22));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x4567, vk22));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x4567, vk22));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x4567, vk32));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x4567, vk32));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x4567, vk32));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x4567, vk42));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x4567, vk42));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x4567, vk42));

      const __m128 vi0x7456 = _mm_shuffle_ps(vi0x4567, vi0x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x7456 = _mm_shuffle_ps(vi1x4567, vi1x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x7456 = _mm_shuffle_ps(vi2x4567, vi2x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x7456 = _mm_shuffle_ps(vi3x4567, vi3x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x7456 = _mm_shuffle_ps(vi4x4567, vi4x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x7456 = _mm_shuffle_ps(vi6x4567, vi6x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x7456 = _mm_shuffle_ps(vi7x4567, vi7x4567, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8x7456 = _mm_shuffle_ps(vi8x4567, vi8x4567, _MM_SHUFFLE(2, 1, 0, 3));

      const __m128 vi0x3456 = _mm_move_ss(vi0x7456, vi0x3012);
      const __m128 vi1x3456 = _mm_move_ss(vi1x7456, vi1x3012);
      const __m128 vi2x3456 = _mm_move_ss(vi2x7456, vi2x3012);
      const __m128 vi3x3456 = _mm_move_ss(vi3x7456, vi3x3012);
      const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012);
      const __m128 vi5x3456 = _mm_move_ss(vi5x7456, vi5x3012);
      const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012);
      const __m128 vi7x3456 = _mm_move_ss(vi7x7456, vi7x3012);
      const __m128 vi8x3456 = _mm_move_ss(vi8x7456, vi8x3012);

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk01));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk01));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk01));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x3456, vk11));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk11));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x3456, vk11));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk21));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk21));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x3456, vk21));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x3456, vk31));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x3456, vk31));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x3456, vk41));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x3456, vk41));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x3456, vk41));

      const __m128 vi0x2345 = _mm_shuffle_ps(vi0x3012, vi0x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi1x2345 = _mm_shuffle_ps(vi1x3012, vi1x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi2x2345 = _mm_shuffle_ps(vi2x3012, vi2x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi3x2345 = _mm_shuffle_ps(vi3x3012, vi3x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi4x2345 = _mm_shuffle_ps(vi4x3012, vi4x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi6x2345 = _mm_shuffle_ps(vi6x3012, vi6x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi7x2345 = _mm_shuffle_ps(vi7x3012, vi7x7456, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi8x2345 = _mm_shuffle_ps(vi8x3012, vi8x7456, _MM_SHUFFLE(2, 1, 0, 3));

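      // There is no further tile to splice in: substitute zeroes for the columns to the
      // right of the row (the right padding).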
      const __m128 vzero = _mm_setzero_ps();
      const __m128 vi0x8567 = _mm_move_ss(vi0x4567, vzero);
      const __m128 vi1x8567 = _mm_move_ss(vi1x4567, vzero);
      const __m128 vi2x8567 = _mm_move_ss(vi2x4567, vzero);
      const __m128 vi3x8567 = _mm_move_ss(vi3x4567, vzero);
      const __m128 vi4x8567 = _mm_move_ss(vi4x4567, vzero);
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vzero);
      const __m128 vi6x8567 = _mm_move_ss(vi6x4567, vzero);
      const __m128 vi7x8567 = _mm_move_ss(vi7x4567, vzero);
      const __m128 vi8x8567 = _mm_move_ss(vi8x4567, vzero);

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x2345, vk00));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x2345, vk00));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x2345, vk00));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x2345, vk00));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x2345, vk00));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x2345, vk10));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x2345, vk10));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x2345, vk10));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x2345, vk10));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x2345, vk10));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x2345, vk20));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x2345, vk20));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x2345, vk20));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x2345, vk20));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x2345, vk30));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x2345, vk30));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x2345, vk30));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x2345, vk30));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x2345, vk40));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x2345, vk40));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x2345, vk40));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x2345, vk40));

      const __m128 vi0x5678 = _mm_shuffle_ps(vi0x8567, vi0x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi1x5678 = _mm_shuffle_ps(vi1x8567, vi1x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi2x5678 = _mm_shuffle_ps(vi2x8567, vi2x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi3x5678 = _mm_shuffle_ps(vi3x8567, vi3x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi7x5678 = _mm_shuffle_ps(vi7x8567, vi7x8567, _MM_SHUFFLE(0, 3, 2, 1));
      const __m128 vi8x5678 = _mm_shuffle_ps(vi8x8567, vi8x8567, _MM_SHUFFLE(0, 3, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk03));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk03));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk03));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk03));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk03));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x5678, vk13));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x5678, vk13));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x5678, vk13));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk13));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x5678, vk13));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk23));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk23));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk23));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x5678, vk33));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x5678, vk33));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x5678, vk43));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x5678, vk43));

      const __m128 vi0x6789 = _mm_shuffle_ps(vi0x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi1x6789 = _mm_shuffle_ps(vi1x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi2x6789 = _mm_shuffle_ps(vi2x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi3x6789 = _mm_shuffle_ps(vi3x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
      const __m128 vi8x6789 = _mm_shuffle_ps(vi8x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));

      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x6789, vk04));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x6789, vk04));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x6789, vk04));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x6789, vk04));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x6789, vk04));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi1x6789, vk14));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x6789, vk14));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x6789, vk14));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x6789, vk14));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x6789, vk14));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x6789, vk24));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x6789, vk24));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x6789, vk24));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x6789, vk24));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi3x6789, vk34));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x6789, vk34));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x6789, vk34));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x6789, vk34));
      vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x6789, vk44));
      vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44));
      vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x6789, vk44));
      vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44));
      vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi8x6789, vk44));


      __m128 vo0 = _mm_max_ps(vo0p0, vmin);
      __m128 vo1 = _mm_max_ps(vo1p0, vmin);
      __m128 vo2 = _mm_max_ps(vo2p0, vmin);
      __m128 vo3 = _mm_max_ps(vo3p0, vmin);
      __m128 vo4 = _mm_max_ps(vo4p0, vmin);

      vo0 = _mm_min_ps(vo0, vmax);
      vo1 = _mm_min_ps(vo1, vmax);
      vo2 = _mm_min_ps(vo2, vmax);
      vo3 = _mm_min_ps(vo3, vmax);
      vo4 = _mm_min_ps(vo4, vmax);

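      // Store 4, 2, and/or 1 remaining pixels per row, narrowing the vector as needed.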
      if XNN_LIKELY(w & (4 * sizeof(float))) {
        _mm_storeu_ps(o4, vo4);
        o4 += 4;
        _mm_storeu_ps(o3, vo3);
        o3 += 4;
        _mm_storeu_ps(o2, vo2);
        o2 += 4;
        _mm_storeu_ps(o1, vo1);
        o1 += 4;
        _mm_storeu_ps(o0, vo0);
        o0 += 4;
      } else {
        if (w & (2 * sizeof(float))) {
          _mm_storel_pi((__m64*) o4, vo4);
          o4 += 2;
          _mm_storel_pi((__m64*) o3, vo3);
          o3 += 2;
          _mm_storel_pi((__m64*) o2, vo2);
          o2 += 2;
          _mm_storel_pi((__m64*) o1, vo1);
          o1 += 2;
          _mm_storel_pi((__m64*) o0, vo0);
          o0 += 2;

          vo0 = _mm_movehl_ps(vo0, vo0);
          vo1 = _mm_movehl_ps(vo1, vo1);
          vo2 = _mm_movehl_ps(vo2, vo2);
          vo3 = _mm_movehl_ps(vo3, vo3);
          vo4 = _mm_movehl_ps(vo4, vo4);
        }
        if (w & (1 * sizeof(float))) {
          _mm_store_ss(o4, vo4);
          o4 += 1;
          _mm_store_ss(o3, vo3);
          o3 += 1;
          _mm_store_ss(o2, vo2);
          o2 += 1;
          _mm_store_ss(o1, vo1);
          o1 += 1;
          _mm_store_ss(o0, vo0);
          o0 += 1;
        }
      }
    }

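    // Slide the input window down by 5 rows: the rows previously addressed by i5 and i6,
    // rewound to their row starts via input_decrement, become the new i0 and i1.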
    i0 = (const float*) ((uintptr_t) i5 - input_decrement);
    i1 = (const float*) ((uintptr_t) i6 - input_decrement);
    i2 = (const float*) ((uintptr_t) i1 + input_width);
    i3 = (const float*) ((uintptr_t) i2 + input_width);
    i4 = (const float*) ((uintptr_t) i3 + input_width);
    i5 = (const float*) ((uintptr_t) i4 + input_width);
    i6 = (const float*) ((uintptr_t) i5 + input_width);
    i7 = (const float*) ((uintptr_t) i6 + input_width);
    i8 = (const float*) ((uintptr_t) i7 + input_width);

    o0 = o4;
    o1 = (float*) ((uintptr_t) o0 + input_width);
    o2 = (float*) ((uintptr_t) o1 + input_width);
    o3 = (float*) ((uintptr_t) o2 + input_width);
    o4 = (float*) ((uintptr_t) o3 + input_width);

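    // doz() is a saturating ("difference or zero") subtraction, so the loop exits once
    // every output row has been written.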
    output_height = doz(output_height, 5);
  } while (output_height != 0);
}