// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/conv.h>
#include <xnnpack/math.h>

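// Direct 3x3 convolution with stride 2 and padding 1 that reads HWC
// (pixel-interleaved) input with 3 channels and writes CHW (planar) output.
// Each pass of the channel loop produces up to 4 output channel planes
// ("c3x4"), and the inner loop computes a tile of 2 output rows by 2 output
// columns per step ("2x2") using SSE intrinsics.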
void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2(
    size_t input_height,
    size_t input_width,
    size_t output_y_start,
    size_t output_y_end,
    const float* input,
    const float* zero,
    const float* weights,
    float* output,
    size_t input_padding_top,
    size_t output_channels,
    size_t output_height_stride,
    size_t output_channel_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_width != 0);
  assert(output_y_end > output_y_start);
  assert(input_padding_top <= 1);
  assert(output_channels != 0);

  const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
  const size_t input_width_increment = round_down_po2(input_width, 4) * 3 /* channels */ * sizeof(float);
  const size_t output_width = (input_width + 1) / 2;
  const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float);

  // Adjustment for padding processed below
  const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
  const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
  const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
  float* output1 = (float*) ((uintptr_t) output0 + output_height_stride);

  if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
    i0 = zero;
  }

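  // Load the output clamping bounds; params->sse.min/max each hold the
  // scalar value broadcast across 4 lanes, 16-byte aligned for _mm_load_ps.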
  const __m128 vmin = _mm_load_ps(params->sse.min);
  const __m128 vmax = _mm_load_ps(params->sse.max);

  for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
    const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
    const size_t input_y4 = input_y2 + 2;
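    // Input rows that fall below the bottom edge of the image are redirected
    // to the zero buffer, implementing the implicit bottom padding.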
    if XNN_UNPREDICTABLE(input_y2 >= input_height) {
      i2 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 > input_height) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 >= input_height) {
      i4 = zero;
    }
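    // If only one output row remains, both row pointers alias it; the
    // stores for "row 1" below happen first and are then overwritten by the
    // stores for row 0.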
    if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
      output1 = output0;
    }

    const float* w = weights;
    size_t c = output_channels;
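    // Per-plane output pointers: 2 output rows x 4 output channels.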
    float* o0c0 = output0;
    float* o1c0 = output1;
    float* o0c1 = (float*) ((uintptr_t) o0c0 + output_channel_stride);
    float* o1c1 = (float*) ((uintptr_t) o1c0 + output_channel_stride);
    float* o0c2 = (float*) ((uintptr_t) o0c1 + output_channel_stride);
    float* o1c2 = (float*) ((uintptr_t) o1c1 + output_channel_stride);
    float* o0c3 = (float*) ((uintptr_t) o0c2 + output_channel_stride);
    float* o1c3 = (float*) ((uintptr_t) o1c2 + output_channel_stride);
    do {
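      // If fewer than 4 output channels remain, alias the surplus channel
      // pointers to the last valid plane so that the fixed 4-channel store
      // sequence below never writes out of bounds.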
      if XNN_UNPREDICTABLE(c < 2) {
        o0c1 = o0c0;
        o1c1 = o1c0;
      }
      if XNN_UNPREDICTABLE(c <= 2) {
        o0c2 = o0c1;
        o1c2 = o1c1;
      }
      if XNN_UNPREDICTABLE(c < 4) {
        o0c3 = o0c2;
        o1c3 = o1c2;
      }

      // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
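      // Zero initialization provides the implicit left-padding column.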
      __m128 vi0x0 = _mm_setzero_ps();
      __m128 vi1x0 = _mm_setzero_ps();
      __m128 vi2x0 = _mm_setzero_ps();
      __m128 vi3x0 = _mm_setzero_ps();
      __m128 vi4x0 = _mm_setzero_ps();

      size_t iw = input_width;
      for (; iw >= 4; iw -= 4) {
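        // Initialize all four accumulators of the 2x2 tile with the bias of
        // the 4 output channels (the first 4 floats of the weight block).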
        __m128 vo0x0 = _mm_load_ps(w);
        __m128 vo1x0 = vo0x0;
        __m128 vo0x1 = vo0x0;
        __m128 vo1x1 = vo0x0;

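        // Weights are packed per group of 4 output channels: 4 biases
        // followed by the 27 filter taps, each a 4-wide vector across the
        // output channels, ordered by kernel column, then input channel,
        // then kernel row (112 floats per group, matching w += 112 below).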
        const __m128 vk00c0 = _mm_load_ps(w + 4);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        const __m128 vi0x1 = _mm_loadu_ps(i0); i0 += 4;
        const __m128 vi1x1 = _mm_loadu_ps(i1); i1 += 4;
        const __m128 vi2x1 = _mm_loadu_ps(i2); i2 += 4;
        const __m128 vi3x1 = _mm_loadu_ps(i3); i3 += 4;
        const __m128 vi4x1 = _mm_loadu_ps(i4); i4 += 4;

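        // Each step broadcasts one input scalar with a shuffle and multiplies
        // it by a 4-output-channel weight vector. Output row 0 accumulates
        // input rows 0-2; output row 1 accumulates input rows 2-4 (the
        // vertical stride of 2).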
        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk10c0 = _mm_load_ps(w + 8);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk20c0 = _mm_load_ps(w + 12);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk00c1 = _mm_load_ps(w + 16);

        // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
        const __m128 vi0x2 = _mm_loadu_ps(i0); i0 += 4;
        const __m128 vi1x2 = _mm_loadu_ps(i1); i1 += 4;
        const __m128 vi2x2 = _mm_loadu_ps(i2); i2 += 4;
        const __m128 vi3x2 = _mm_loadu_ps(i3); i3 += 4;
        const __m128 vi4x2 = _mm_loadu_ps(i4); i4 += 4;

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk10c1 = _mm_load_ps(w + 20);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk20c1 = _mm_load_ps(w + 24);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk00c2 = _mm_load_ps(w + 28);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk10c2 = _mm_load_ps(w + 32);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk20c2 = _mm_load_ps(w + 36);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk01c0 = _mm_load_ps(w + 40);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));

        const __m128 vk11c0 = _mm_load_ps(w + 44);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(2, 2, 2, 2))));

        const __m128 vk21c0 = _mm_load_ps(w + 48);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(2, 2, 2, 2))));

        const __m128 vk01c1 = _mm_load_ps(w + 52);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk11c1 = _mm_load_ps(w + 56);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk21c1 = _mm_load_ps(w + 60);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk01c2 = _mm_load_ps(w + 64);

        // viMx3 = ( iM4c2, iM4c1, iM4c0, iM3c2 )
        const __m128 vi0x3 = _mm_loadu_ps(i0); i0 += 4;
        const __m128 vi1x3 = _mm_loadu_ps(i1); i1 += 4;
        const __m128 vi2x3 = _mm_loadu_ps(i2); i2 += 4;
        const __m128 vi3x3 = _mm_loadu_ps(i3); i3 += 4;
        const __m128 vi4x3 = _mm_loadu_ps(i4); i4 += 4;

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk11c2 = _mm_load_ps(w + 68);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk21c2 = _mm_load_ps(w + 72);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk02c0 = _mm_load_ps(w + 76);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk12c0 = _mm_load_ps(w + 80);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk22c0 = _mm_load_ps(w + 84);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk02c1 = _mm_load_ps(w + 88);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(2, 2, 2, 2))));

        const __m128 vk12c1 = _mm_load_ps(w + 92);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(2, 2, 2, 2))));

        const __m128 vk22c1 = _mm_load_ps(w + 96);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(2, 2, 2, 2))));

        const __m128 vk02c2 = _mm_load_ps(w + 100);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk12c2 = _mm_load_ps(w + 104);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk22c2 = _mm_load_ps(w + 108);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(3, 3, 3, 3))));

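        // Carry the last load over: pixel 4 of this block is the left
        // neighbor of the next block's first output column.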
        vi0x0 = vi0x3;
        vi1x0 = vi1x3;
        vi2x0 = vi2x3;
        vi3x0 = vi3x3;
        vi4x0 = vi4x3;

        vo0x0 = _mm_max_ps(vo0x0, vmin);
        vo1x0 = _mm_max_ps(vo1x0, vmin);
        vo0x1 = _mm_max_ps(vo0x1, vmin);
        vo1x1 = _mm_max_ps(vo1x1, vmin);

        vo0x0 = _mm_min_ps(vo0x0, vmax);
        vo1x0 = _mm_min_ps(vo1x0, vmax);
        vo0x1 = _mm_min_ps(vo0x1, vmax);
        vo1x1 = _mm_min_ps(vo1x1, vmax);

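        // Transpose the per-pixel channel vectors into per-channel pixel
        // pairs for the planar (CHW) stores below.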
        const __m128 vo0c01 = _mm_unpacklo_ps(vo0x0, vo0x1);
        const __m128 vo0c23 = _mm_unpackhi_ps(vo0x0, vo0x1);
        const __m128 vo1c01 = _mm_unpacklo_ps(vo1x0, vo1x1);
        const __m128 vo1c23 = _mm_unpackhi_ps(vo1x0, vo1x1);

        // Always 2+ output width elements remaining
        _mm_storel_pi((__m64 *)o1c0, vo1c01); o1c0 += 2;
        _mm_storel_pi((__m64 *)o1c1, _mm_shuffle_ps(vo1c01, vo1c01, _MM_SHUFFLE(3, 2, 3, 2))); o1c1 += 2;
        _mm_storel_pi((__m64 *)o1c2, vo1c23); o1c2 += 2;
        _mm_storel_pi((__m64 *)o1c3, _mm_shuffle_ps(vo1c23, vo1c23, _MM_SHUFFLE(3, 2, 3, 2))); o1c3 += 2;

        _mm_storel_pi((__m64 *)o0c0, vo0c01); o0c0 += 2;
        _mm_storel_pi((__m64 *)o0c1, _mm_shuffle_ps(vo0c01, vo0c01, _MM_SHUFFLE(3, 2, 3, 2))); o0c1 += 2;
        _mm_storel_pi((__m64 *)o0c2, vo0c23); o0c2 += 2;
        _mm_storel_pi((__m64 *)o0c3, _mm_shuffle_ps(vo0c23, vo0c23, _MM_SHUFFLE(3, 2, 3, 2))); o0c3 += 2;
      }
      assert(iw < 4);
      if XNN_UNLIKELY(iw != 0) {
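        // Process the 1-3 remaining input pixels (1 or 2 output columns).
        // Contributions from columns past the end of the row are skipped or
        // zeroed, implementing the implicit right padding.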
        __m128 vo0x0 = _mm_load_ps(w);
        __m128 vo1x0 = vo0x0;
        __m128 vo0x1 = vo0x0;
        __m128 vo1x1 = vo0x0;

        const __m128 vk00c0 = _mm_load_ps(w + 4);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        __m128 vi0x1 = _mm_loadu_ps(i0);
        __m128 vi1x1 = _mm_loadu_ps(i1);
        __m128 vi2x1 = _mm_loadu_ps(i2);
        __m128 vi3x1 = _mm_loadu_ps(i3);
        __m128 vi4x1 = _mm_loadu_ps(i4);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
        }

        const __m128 vk10c0 = _mm_load_ps(w + 8);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(1, 1, 1, 1))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
        }

        const __m128 vk20c0 = _mm_load_ps(w + 12);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(1, 1, 1, 1))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
        }

        const __m128 vk00c1 = _mm_load_ps(w + 16);

        __m128 vi0x2 = _mm_setzero_ps();
        __m128 vi1x2 = _mm_setzero_ps();
        __m128 vi2x2 = _mm_setzero_ps();
        __m128 vi3x2 = _mm_setzero_ps();
        __m128 vi4x2 = _mm_setzero_ps();
        if (iw >= 2) {
          // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
          vi0x2 = _mm_loadu_ps(i0 + 4);
          vi1x2 = _mm_loadu_ps(i1 + 4);
          vi2x2 = _mm_loadu_ps(i2 + 4);
          vi3x2 = _mm_loadu_ps(i3 + 4);
          vi4x2 = _mm_loadu_ps(i4 + 4);
        }

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk10c1 = _mm_load_ps(w + 20);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk20c1 = _mm_load_ps(w + 24);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk00c2 = _mm_load_ps(w + 28);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk10c2 = _mm_load_ps(w + 32);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk20c2 = _mm_load_ps(w + 36);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk01c0 = _mm_load_ps(w + 40);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(2, 2, 2, 2))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
        }

        const __m128 vk11c0 = _mm_load_ps(w + 44);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(0, 0, 0, 0))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(2, 2, 2, 2))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(2, 2, 2, 2))));
        }

        const __m128 vk21c0 = _mm_load_ps(w + 48);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(0, 0, 0, 0))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(2, 2, 2, 2))));
        }

        const __m128 vk01c1 = _mm_load_ps(w + 52);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
        }

        const __m128 vk11c1 = _mm_load_ps(w + 56);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(1, 1, 1, 1))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(3, 3, 3, 3))));
        }

        const __m128 vk21c1 = _mm_load_ps(w + 60);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))));
        }

        const __m128 vk01c2 = _mm_load_ps(w + 64);

        __m128 vi0x3 = _mm_setzero_ps();
        __m128 vi1x3 = _mm_setzero_ps();
        __m128 vi2x3 = _mm_setzero_ps();
        __m128 vi3x3 = _mm_setzero_ps();
        __m128 vi4x3 = _mm_setzero_ps();
        if (iw > 2) {
          // viMx3 = ( 0.0, 0.0, 0.0, iM3c2 )
          vi0x3 = _mm_load_ss(i0 + 8);
          vi1x3 = _mm_load_ss(i1 + 8);
          vi2x3 = _mm_load_ss(i2 + 8);
          vi3x3 = _mm_load_ss(i3 + 8);
          vi4x3 = _mm_load_ss(i4 + 8);
        }

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk11c2 = _mm_load_ps(w + 68);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk21c2 = _mm_load_ps(w + 72);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))));

        if (iw >= 2) {
          const __m128 vk02c0 = _mm_load_ps(w + 76);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));

          const __m128 vk12c0 = _mm_load_ps(w + 80);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));

          const __m128 vk22c0 = _mm_load_ps(w + 84);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));

          const __m128 vk02c1 = _mm_load_ps(w + 88);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));

          const __m128 vk12c1 = _mm_load_ps(w + 92);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));

          const __m128 vk22c1 = _mm_load_ps(w + 96);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));

          const __m128 vk02c2 = _mm_load_ps(w + 100);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));

          const __m128 vk12c2 = _mm_load_ps(w + 104);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));

          const __m128 vk22c2 = _mm_load_ps(w + 108);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
        }

        vo0x0 = _mm_max_ps(vo0x0, vmin);
        vo1x0 = _mm_max_ps(vo1x0, vmin);
        vo0x1 = _mm_max_ps(vo0x1, vmin);
        vo1x1 = _mm_max_ps(vo1x1, vmin);

        vo0x0 = _mm_min_ps(vo0x0, vmax);
        vo1x0 = _mm_min_ps(vo1x0, vmax);
        vo0x1 = _mm_min_ps(vo0x1, vmax);
        vo1x1 = _mm_min_ps(vo1x1, vmax);

        if (iw == 3) {
          // Exactly 2 output width elements remaining
          const __m128 vo0c01 = _mm_unpacklo_ps(vo0x0, vo0x1);
          const __m128 vo0c23 = _mm_unpackhi_ps(vo0x0, vo0x1);
          const __m128 vo1c01 = _mm_unpacklo_ps(vo1x0, vo1x1);
          const __m128 vo1c23 = _mm_unpackhi_ps(vo1x0, vo1x1);

          _mm_storel_pi((__m64 *)o1c0, vo1c01); o1c0 += 2;
          _mm_storel_pi((__m64 *)o1c1, _mm_shuffle_ps(vo1c01, vo1c01, _MM_SHUFFLE(3, 2, 3, 2))); o1c1 += 2;
          _mm_storel_pi((__m64 *)o1c2, vo1c23); o1c2 += 2;
          _mm_storel_pi((__m64 *)o1c3, _mm_shuffle_ps(vo1c23, vo1c23, _MM_SHUFFLE(3, 2, 3, 2))); o1c3 += 2;

          _mm_storel_pi((__m64 *)o0c0, vo0c01); o0c0 += 2;
          _mm_storel_pi((__m64 *)o0c1, _mm_shuffle_ps(vo0c01, vo0c01, _MM_SHUFFLE(3, 2, 3, 2))); o0c1 += 2;
          _mm_storel_pi((__m64 *)o0c2, vo0c23); o0c2 += 2;
          _mm_storel_pi((__m64 *)o0c3, _mm_shuffle_ps(vo0c23, vo0c23, _MM_SHUFFLE(3, 2, 3, 2))); o0c3 += 2;
        } else {
          // Exactly 1 output width element remaining

          _mm_store_ss(o1c0, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(0, 0, 0, 0))); o1c0 += 1;
          _mm_store_ss(o1c1, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(1, 1, 1, 1))); o1c1 += 1;
          _mm_store_ss(o1c2, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(2, 2, 2, 2))); o1c2 += 1;
          _mm_store_ss(o1c3, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(3, 3, 3, 3))); o1c3 += 1;

          _mm_store_ss(o0c0, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(0, 0, 0, 0))); o0c0 += 1;
          _mm_store_ss(o0c1, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(1, 1, 1, 1))); o0c1 += 1;
          _mm_store_ss(o0c2, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(2, 2, 2, 2))); o0c2 += 1;
          _mm_store_ss(o0c3, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(3, 3, 3, 3))); o0c3 += 1;
        }
      }
      // Move output pointers back to the position of the first pixel in a row,
      // and forward to the next block of output channels.
      o0c0 = (float*) ((uintptr_t) o0c0 + output_channel_increment);
      o0c1 = (float*) ((uintptr_t) o0c1 + output_channel_increment);
      o0c2 = (float*) ((uintptr_t) o0c2 + output_channel_increment);
      o0c3 = (float*) ((uintptr_t) o0c3 + output_channel_increment);
      o1c0 = (float*) ((uintptr_t) o1c0 + output_channel_increment);
      o1c1 = (float*) ((uintptr_t) o1c1 + output_channel_increment);
      o1c2 = (float*) ((uintptr_t) o1c2 + output_channel_increment);
      o1c3 = (float*) ((uintptr_t) o1c3 + output_channel_increment);
      // Revert input pointers to the position of the first pixel in a row
      i0 = (const float*) ((uintptr_t) i0 - input_width_increment);
      i1 = (const float*) ((uintptr_t) i1 - input_width_increment);
      i2 = (const float*) ((uintptr_t) i2 - input_width_increment);
      i3 = (const float*) ((uintptr_t) i3 - input_width_increment);
      i4 = (const float*) ((uintptr_t) i4 - input_width_increment);
      // Move to the block of weights for the next 4 output channels
      w += 112;
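      // doz() is saturating subtraction: decrement the remaining channel
      // count by the tile size of 4, stopping at zero.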
      c = doz(c, 4);
    } while (c != 0);
    // Move output pointers forward to the next two rows
    output0 = (float*) ((uintptr_t) output1 + output_height_stride);
    output1 = (float*) ((uintptr_t) output0 + output_height_stride);
    // Move input pointers forward to the next four rows
    i0 = i4;
    i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
    i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
    i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
    i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  }
}