// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xmmintrin.h>

#include <xnnpack/conv.h>
#include <xnnpack/math.h>

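// 3x3 convolution with stride 2 and 1-pixel padding over 3-channel HWC input,
// writing CHW output. Each pass of the inner loop computes a 2x2 tile of output
// pixels (2 rows x 2 columns) for a group of 4 output channels using SSE.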
void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2(
    size_t input_height,
    size_t input_width,
    size_t output_y_start,
    size_t output_y_end,
    const float* input,
    const float* zero,
    const float* weights,
    float* output,
    size_t input_padding_top,
    size_t output_channels,
    size_t output_height_stride,
    size_t output_channel_stride,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(input_width != 0);
  assert(output_y_end > output_y_start);
  assert(input_padding_top <= 1);
  assert(output_channels != 0);

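  // Derived strides: each input row holds input_width pixels of 3 interleaved channels.
  // The main loop consumes input 4 pixels at a time, so input_width_increment covers the
  // round_down_po2(input_width, 4) pixels it advances per row. With stride 2 and padding 1,
  // each output row has (input_width + 1) / 2 pixels; output_channel_increment steps
  // forward 4 channel planes and back one row of output pixels.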
  const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
  const size_t input_width_increment = round_down_po2(input_width, 4) * 3 /* channels */ * sizeof(float);
  const size_t output_width = (input_width + 1) / 2;
  const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float);

  // Adjustment for padding processed below
  const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
  const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
  const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
  const float* i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
  const float* i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
  float* output1 = (float*) ((uintptr_t) output0 + output_height_stride);

  if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
    i0 = zero;
  }

  const __m128 vmin = _mm_load_ps(params->sse.min);
  const __m128 vmax = _mm_load_ps(params->sse.max);

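  // Each iteration of this loop produces two output rows (output_y and output_y + 1) from
  // five input rows (i0..i4); out-of-bounds input rows are redirected to the zero buffer
  // and the second output row is aliased to the first when it falls past output_y_end.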
  for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 2) {
    const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
    const size_t input_y4 = input_y2 + 2;
    if XNN_UNPREDICTABLE(input_y2 >= input_height) {
      i2 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 > input_height) {
      i3 = zero;
    }
    if XNN_UNPREDICTABLE(input_y4 >= input_height) {
      i4 = zero;
    }
    if XNN_UNPREDICTABLE(output_y + 2 > output_y_end) {
      output1 = output0;
    }

    const float* w = weights;
    size_t c = output_channels;
    float* o0c0 = output0;
    float* o1c0 = output1;
    float* o0c1 = (float*) ((uintptr_t) o0c0 + output_channel_stride);
    float* o1c1 = (float*) ((uintptr_t) o1c0 + output_channel_stride);
    float* o0c2 = (float*) ((uintptr_t) o0c1 + output_channel_stride);
    float* o1c2 = (float*) ((uintptr_t) o1c1 + output_channel_stride);
    float* o0c3 = (float*) ((uintptr_t) o0c2 + output_channel_stride);
    float* o1c3 = (float*) ((uintptr_t) o1c2 + output_channel_stride);
    do {
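      // When fewer than 4 output channels remain, alias the unused channel pointers to
      // lower ones so the unconditional stores below never write out of bounds.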
      if XNN_UNPREDICTABLE(c < 2) {
        o0c1 = o0c0;
        o1c1 = o1c0;
      }
      if XNN_UNPREDICTABLE(c <= 2) {
        o0c2 = o0c1;
        o1c2 = o1c1;
      }
      if XNN_UNPREDICTABLE(c < 4) {
        o0c3 = o0c2;
        o1c3 = o1c2;
      }

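      // viMx0 carries the rightmost input pixel of the previous group of 4 (lanes 1..3 hold
      // its 3 channels); it starts at zero to represent the implicit left-padding column.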
      // viMx0 = ( iM0c2, iM0c1, iM0c0, --- )
      __m128 vi0x0 = _mm_setzero_ps();
      __m128 vi1x0 = _mm_setzero_ps();
      __m128 vi2x0 = _mm_setzero_ps();
      __m128 vi3x0 = _mm_setzero_ps();
      __m128 vi4x0 = _mm_setzero_ps();

      size_t iw = input_width;
      for (; iw >= 4; iw -= 4) {
        __m128 vo0x0 = _mm_load_ps(w);
        __m128 vo1x0 = vo0x0;
        __m128 vo0x1 = vo0x0;
        __m128 vo1x1 = vo0x0;

        const __m128 vk00c0 = _mm_load_ps(w + 4);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        const __m128 vi0x1 = _mm_loadu_ps(i0); i0 += 4;
        const __m128 vi1x1 = _mm_loadu_ps(i1); i1 += 4;
        const __m128 vi2x1 = _mm_loadu_ps(i2); i2 += 4;
        const __m128 vi3x1 = _mm_loadu_ps(i3); i3 += 4;
        const __m128 vi4x1 = _mm_loadu_ps(i4); i4 += 4;

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk10c0 = _mm_load_ps(w + 8);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk20c0 = _mm_load_ps(w + 12);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk00c1 = _mm_load_ps(w + 16);

        // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
        const __m128 vi0x2 = _mm_loadu_ps(i0); i0 += 4;
        const __m128 vi1x2 = _mm_loadu_ps(i1); i1 += 4;
        const __m128 vi2x2 = _mm_loadu_ps(i2); i2 += 4;
        const __m128 vi3x2 = _mm_loadu_ps(i3); i3 += 4;
        const __m128 vi4x2 = _mm_loadu_ps(i4); i4 += 4;

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk10c1 = _mm_load_ps(w + 20);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk20c1 = _mm_load_ps(w + 24);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk00c2 = _mm_load_ps(w + 28);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk10c2 = _mm_load_ps(w + 32);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk20c2 = _mm_load_ps(w + 36);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk01c0 = _mm_load_ps(w + 40);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));

        const __m128 vk11c0 = _mm_load_ps(w + 44);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(2, 2, 2, 2))));

        const __m128 vk21c0 = _mm_load_ps(w + 48);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(2, 2, 2, 2))));

        const __m128 vk01c1 = _mm_load_ps(w + 52);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk11c1 = _mm_load_ps(w + 56);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk21c1 = _mm_load_ps(w + 60);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk01c2 = _mm_load_ps(w + 64);

        // viMx3 = ( iM4c2, iM4c1, iM4c0, iM3c2 )
        const __m128 vi0x3 = _mm_loadu_ps(i0); i0 += 4;
        const __m128 vi1x3 = _mm_loadu_ps(i1); i1 += 4;
        const __m128 vi2x3 = _mm_loadu_ps(i2); i2 += 4;
        const __m128 vi3x3 = _mm_loadu_ps(i3); i3 += 4;
        const __m128 vi4x3 = _mm_loadu_ps(i4); i4 += 4;

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk11c2 = _mm_load_ps(w + 68);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk21c2 = _mm_load_ps(w + 72);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk02c0 = _mm_load_ps(w + 76);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk12c0 = _mm_load_ps(w + 80);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk22c0 = _mm_load_ps(w + 84);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk02c1 = _mm_load_ps(w + 88);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(2, 2, 2, 2))));

        const __m128 vk12c1 = _mm_load_ps(w + 92);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(2, 2, 2, 2))));

        const __m128 vk22c1 = _mm_load_ps(w + 96);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(2, 2, 2, 2))));

        const __m128 vk02c2 = _mm_load_ps(w + 100);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk12c2 = _mm_load_ps(w + 104);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(3, 3, 3, 3))));

        const __m128 vk22c2 = _mm_load_ps(w + 108);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(3, 3, 3, 3))));

        vi0x0 = vi0x3;
        vi1x0 = vi1x3;
        vi2x0 = vi2x3;
        vi3x0 = vi3x3;
        vi4x0 = vi4x3;

        vo0x0 = _mm_max_ps(vo0x0, vmin);
        vo1x0 = _mm_max_ps(vo1x0, vmin);
        vo0x1 = _mm_max_ps(vo0x1, vmin);
        vo1x1 = _mm_max_ps(vo1x1, vmin);

        vo0x0 = _mm_min_ps(vo0x0, vmax);
        vo1x0 = _mm_min_ps(vo1x0, vmax);
        vo0x1 = _mm_min_ps(vo0x1, vmax);
        vo1x1 = _mm_min_ps(vo1x1, vmax);

        const __m128 vo0c01 = _mm_unpacklo_ps(vo0x0, vo0x1);
        const __m128 vo0c23 = _mm_unpackhi_ps(vo0x0, vo0x1);
        const __m128 vo1c01 = _mm_unpacklo_ps(vo1x0, vo1x1);
        const __m128 vo1c23 = _mm_unpackhi_ps(vo1x0, vo1x1);

        // Always 2+ output width elements remaining
        _mm_storel_pi((__m64 *)o1c0, vo1c01); o1c0 += 2;
        _mm_storel_pi((__m64 *)o1c1, _mm_shuffle_ps(vo1c01, vo1c01, _MM_SHUFFLE(3, 2, 3, 2))); o1c1 += 2;
        _mm_storel_pi((__m64 *)o1c2, vo1c23); o1c2 += 2;
        _mm_storel_pi((__m64 *)o1c3, _mm_shuffle_ps(vo1c23, vo1c23, _MM_SHUFFLE(3, 2, 3, 2))); o1c3 += 2;

        _mm_storel_pi((__m64 *)o0c0, vo0c01); o0c0 += 2;
        _mm_storel_pi((__m64 *)o0c1, _mm_shuffle_ps(vo0c01, vo0c01, _MM_SHUFFLE(3, 2, 3, 2))); o0c1 += 2;
        _mm_storel_pi((__m64 *)o0c2, vo0c23); o0c2 += 2;
        _mm_storel_pi((__m64 *)o0c3, _mm_shuffle_ps(vo0c23, vo0c23, _MM_SHUFFLE(3, 2, 3, 2))); o0c3 += 2;
      }
      assert(iw < 4);
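      // Remainder: 1 to 3 input pixels are left in the row, producing two output pixels
      // when iw == 3 and a single output pixel when iw is 1 or 2.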
      if XNN_UNLIKELY(iw != 0) {
        __m128 vo0x0 = _mm_load_ps(w);
        __m128 vo1x0 = vo0x0;
        __m128 vo0x1 = vo0x0;
        __m128 vo1x1 = vo0x0;

        const __m128 vk00c0 = _mm_load_ps(w + 4);

        // viMx1 = ( iM2c0, iM1c2, iM1c1, iM1c0 )
        __m128 vi0x1 = _mm_loadu_ps(i0);
        __m128 vi1x1 = _mm_loadu_ps(i1);
        __m128 vi2x1 = _mm_loadu_ps(i2);
        __m128 vi3x1 = _mm_loadu_ps(i3);
        __m128 vi4x1 = _mm_loadu_ps(i4);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
        }

        const __m128 vk10c0 = _mm_load_ps(w + 8);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(1, 1, 1, 1))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));
        }

        const __m128 vk20c0 = _mm_load_ps(w + 12);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(1, 1, 1, 1))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));
        }

        const __m128 vk00c1 = _mm_load_ps(w + 16);

        __m128 vi0x2 = _mm_setzero_ps();
        __m128 vi1x2 = _mm_setzero_ps();
        __m128 vi2x2 = _mm_setzero_ps();
        __m128 vi3x2 = _mm_setzero_ps();
        __m128 vi4x2 = _mm_setzero_ps();
        if (iw >= 2) {
          // viMx2 = ( iM3c1, iM3c0, iM2c2, iM2c1 )
          vi0x2 = _mm_loadu_ps(i0 + 4);
          vi1x2 = _mm_loadu_ps(i1 + 4);
          vi2x2 = _mm_loadu_ps(i2 + 4);
          vi3x2 = _mm_loadu_ps(i3 + 4);
          vi4x2 = _mm_loadu_ps(i4 + 4);
        }

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk10c1 = _mm_load_ps(w + 20);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk20c1 = _mm_load_ps(w + 24);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk00c2 = _mm_load_ps(w + 28);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x0, vi0x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk10c2 = _mm_load_ps(w + 32);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk20c2 = _mm_load_ps(w + 36);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x0, vi2x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x0, vi4x0, _MM_SHUFFLE(3, 3, 3, 3))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));

        const __m128 vk01c0 = _mm_load_ps(w + 40);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(2, 2, 2, 2))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
        }

        const __m128 vk11c0 = _mm_load_ps(w + 44);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(0, 0, 0, 0))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(2, 2, 2, 2))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c0, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(2, 2, 2, 2))));
        }

        const __m128 vk21c0 = _mm_load_ps(w + 48);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(0, 0, 0, 0))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(2, 2, 2, 2))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(2, 2, 2, 2))));
        }

        const __m128 vk01c1 = _mm_load_ps(w + 52);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
        }

        const __m128 vk11c1 = _mm_load_ps(w + 56);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(1, 1, 1, 1))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(3, 3, 3, 3))));
        }

        const __m128 vk21c1 = _mm_load_ps(w + 60);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))));
        if (iw > 2) {
          vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))));
        }

        const __m128 vk01c2 = _mm_load_ps(w + 64);

        __m128 vi0x3 = _mm_setzero_ps();
        __m128 vi1x3 = _mm_setzero_ps();
        __m128 vi2x3 = _mm_setzero_ps();
        __m128 vi3x3 = _mm_setzero_ps();
        __m128 vi4x3 = _mm_setzero_ps();
        if (iw > 2) {
          // viMx3 = ( 0.0, 0.0, 0.0, iM3c2 )
          vi0x3 = _mm_load_ss(i0 + 8);
          vi1x3 = _mm_load_ss(i1 + 8);
          vi2x3 = _mm_load_ss(i2 + 8);
          vi3x3 = _mm_load_ss(i3 + 8);
          vi4x3 = _mm_load_ss(i4 + 8);
        }

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi0x3, vi0x3, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk11c2 = _mm_load_ps(w + 68);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi1x3, vi1x3, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk11c2, _mm_shuffle_ps(vi3x3, vi3x3, _MM_SHUFFLE(0, 0, 0, 0))));

        const __m128 vk21c2 = _mm_load_ps(w + 72);

        vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(2, 2, 2, 2))));
        vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x3, vi2x3, _MM_SHUFFLE(0, 0, 0, 0))));
        vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))));

        if (iw >= 2) {
          const __m128 vk02c0 = _mm_load_ps(w + 76);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));

          const __m128 vk12c0 = _mm_load_ps(w + 80);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c0, _mm_shuffle_ps(vi3x1, vi3x1, _MM_SHUFFLE(3, 3, 3, 3))));

          const __m128 vk22c0 = _mm_load_ps(w + 84);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(3, 3, 3, 3))));

          const __m128 vk02c1 = _mm_load_ps(w + 88);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));

          const __m128 vk12c1 = _mm_load_ps(w + 92);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))));

          const __m128 vk22c1 = _mm_load_ps(w + 96);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(0, 0, 0, 0))));

          const __m128 vk02c2 = _mm_load_ps(w + 100);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));

          const __m128 vk12c2 = _mm_load_ps(w + 104);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk12c2, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(1, 1, 1, 1))));

          const __m128 vk22c2 = _mm_load_ps(w + 108);

          vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))));
          vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(1, 1, 1, 1))));
        }

        vo0x0 = _mm_max_ps(vo0x0, vmin);
        vo1x0 = _mm_max_ps(vo1x0, vmin);
        vo0x1 = _mm_max_ps(vo0x1, vmin);
        vo1x1 = _mm_max_ps(vo1x1, vmin);

        vo0x0 = _mm_min_ps(vo0x0, vmax);
        vo1x0 = _mm_min_ps(vo1x0, vmax);
        vo0x1 = _mm_min_ps(vo0x1, vmax);
        vo1x1 = _mm_min_ps(vo1x1, vmax);

        if (iw == 3) {
          // Exactly 2 output width elements remaining
          const __m128 vo0c01 = _mm_unpacklo_ps(vo0x0, vo0x1);
          const __m128 vo0c23 = _mm_unpackhi_ps(vo0x0, vo0x1);
          const __m128 vo1c01 = _mm_unpacklo_ps(vo1x0, vo1x1);
          const __m128 vo1c23 = _mm_unpackhi_ps(vo1x0, vo1x1);

          _mm_storel_pi((__m64 *)o1c0, vo1c01); o1c0 += 2;
          _mm_storel_pi((__m64 *)o1c1, _mm_shuffle_ps(vo1c01, vo1c01, _MM_SHUFFLE(3, 2, 3, 2))); o1c1 += 2;
          _mm_storel_pi((__m64 *)o1c2, vo1c23); o1c2 += 2;
          _mm_storel_pi((__m64 *)o1c3, _mm_shuffle_ps(vo1c23, vo1c23, _MM_SHUFFLE(3, 2, 3, 2))); o1c3 += 2;

          _mm_storel_pi((__m64 *)o0c0, vo0c01); o0c0 += 2;
          _mm_storel_pi((__m64 *)o0c1, _mm_shuffle_ps(vo0c01, vo0c01, _MM_SHUFFLE(3, 2, 3, 2))); o0c1 += 2;
          _mm_storel_pi((__m64 *)o0c2, vo0c23); o0c2 += 2;
          _mm_storel_pi((__m64 *)o0c3, _mm_shuffle_ps(vo0c23, vo0c23, _MM_SHUFFLE(3, 2, 3, 2))); o0c3 += 2;
        } else {
          // Exactly 1 output width element remaining

          _mm_store_ss(o1c0, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(0, 0, 0, 0))); o1c0 += 1;
          _mm_store_ss(o1c1, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(1, 1, 1, 1))); o1c1 += 1;
          _mm_store_ss(o1c2, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(2, 2, 2, 2))); o1c2 += 1;
          _mm_store_ss(o1c3, _mm_shuffle_ps(vo1x0, vo1x0, _MM_SHUFFLE(3, 3, 3, 3))); o1c3 += 1;

          _mm_store_ss(o0c0, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(0, 0, 0, 0))); o0c0 += 1;
          _mm_store_ss(o0c1, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(1, 1, 1, 1))); o0c1 += 1;
          _mm_store_ss(o0c2, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(2, 2, 2, 2))); o0c2 += 1;
          _mm_store_ss(o0c3, _mm_shuffle_ps(vo0x0, vo0x0, _MM_SHUFFLE(3, 3, 3, 3))); o0c3 += 1;
        }
      }
      // Move output pointers back to the position of the first pixel in a row,
      // and forward to the next block of output channels.
      o0c0 = (float*) ((uintptr_t) o0c0 + output_channel_increment);
      o0c1 = (float*) ((uintptr_t) o0c1 + output_channel_increment);
      o0c2 = (float*) ((uintptr_t) o0c2 + output_channel_increment);
      o0c3 = (float*) ((uintptr_t) o0c3 + output_channel_increment);
      o1c0 = (float*) ((uintptr_t) o1c0 + output_channel_increment);
      o1c1 = (float*) ((uintptr_t) o1c1 + output_channel_increment);
      o1c2 = (float*) ((uintptr_t) o1c2 + output_channel_increment);
      o1c3 = (float*) ((uintptr_t) o1c3 + output_channel_increment);
      // Revert input pointers to the position of the first pixel in a row
      i0 = (const float*) ((uintptr_t) i0 - input_width_increment);
      i1 = (const float*) ((uintptr_t) i1 - input_width_increment);
      i2 = (const float*) ((uintptr_t) i2 - input_width_increment);
      i3 = (const float*) ((uintptr_t) i3 - input_width_increment);
      i4 = (const float*) ((uintptr_t) i4 - input_width_increment);
      // Move to the block of weights for the next 4 output channels
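      // (4 bias values + 3x3 taps x 3 input channels x 4 output channels = 4 + 108 = 112 floats)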
      w += 112;
      c = doz(c, 4);
    } while (c != 0);
    // Move output pointers forward to the next two rows
    output0 = (float*) ((uintptr_t) output1 + output_height_stride);
    output1 = (float*) ((uintptr_t) output0 + output_height_stride);
    // Move input pointers forward to the next four rows
    i0 = i4;
    i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
    i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
    i3 = (const float*) ((uintptr_t) i2 + input_height_stride);
    i4 = (const float*) ((uintptr_t) i3 + input_height_stride);
  }
}