// Auto-generated file. Do not edit!
//   Template: src/qs8-gavgpool/multipass-sse.c.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <smmintrin.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/math.h>

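// Multipass global average pooling: rows > 7, so the reduction cannot finish
// in one pass. The first pass seeds the 32-bit buffer with bias plus the sum
// of the first 7 rows, each middle pass folds in 7 more rows, and the final
// pass adds the last (up to 7) rows, requantizes, and stores int8 outputs.
// Channels are processed in tiles of 24 (c24) with two interleaved 16-bit
// accumulator chains (acc2); summing 7 int8 values cannot overflow int16
// (7 * 127 < 32767).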
void xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c24_acc2(
    size_t rows,
    size_t channels,
    const int8_t* input,
    size_t input_stride,
    const int8_t* zero,
    int32_t* buffer,
    int8_t* output,
    const union xnn_qs8_avgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
{
  assert(rows > 7);
  assert(channels != 0);

  const int8_t* i0 = input;
  const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
  const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
  const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
  const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
  const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
  const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
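  // The channel loops below advance each row pointer by a total of
  // round_up_po2(channels, 8) bytes (the remainder is consumed in groups of
  // 8), so stepping by 7 strides minus that amount lands every pointer on the
  // start of the next block of 7 rows.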
  const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8);

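  // First pass: sum rows 0-6 and initialize the buffer with bias + sum.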
  const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
  int32_t* b = buffer;
  size_t c = channels;
  for (; c >= 24; c -= 24) {
    const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
    const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
    const __m128i vxi0xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16)));
    i0 += 24;
    const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
    const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
    const __m128i vxi1xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16)));
    i1 += 24;
    const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
    const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
    const __m128i vxi2xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16)));
    i2 += 24;
    const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
    const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
    const __m128i vxi3xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16)));
    i3 += 24;
    const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
    const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
    const __m128i vxi4xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16)));
    i4 += 24;
    const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
    const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
    const __m128i vxi5xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16)));
    i5 += 24;
    const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
    const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
    const __m128i vxi6xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16)));
    i6 += 24;

    __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
    __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
    __m128i vacc0xGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
    __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
    __m128i vacc1x89ABCDEF = _mm_add_epi16(vxi2x89ABCDEF, vxi3x89ABCDEF);
    __m128i vacc1xGHIJKLMN = _mm_add_epi16(vxi2xGHIJKLMN, vxi3xGHIJKLMN);

    vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi4x01234567);
    vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi4x89ABCDEF);
    vacc0xGHIJKLMN = _mm_add_epi16(vacc0xGHIJKLMN, vxi4xGHIJKLMN);
    vacc1x01234567 = _mm_add_epi16(vacc1x01234567, vxi5x01234567);
    vacc1x89ABCDEF = _mm_add_epi16(vacc1x89ABCDEF, vxi5x89ABCDEF);
    vacc1xGHIJKLMN = _mm_add_epi16(vacc1xGHIJKLMN, vxi5xGHIJKLMN);
    vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi6x01234567);
    vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi6x89ABCDEF);
    vacc0xGHIJKLMN = _mm_add_epi16(vacc0xGHIJKLMN, vxi6xGHIJKLMN);

    // Add up all accumulators to vacc0x0123456789ABCDEFGHIJKLMN
    vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
    vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vacc1x89ABCDEF);
    vacc0xGHIJKLMN = _mm_add_epi16(vacc0xGHIJKLMN, vacc1xGHIJKLMN);

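    // Widen the 16-bit sums to 32 bits: _mm_cvtepi16_epi32 sign-extends the
    // low halves, and the high halves are sign-extended by unpacking against
    // _mm_cmpgt_epi16(0, vacc), which is all-ones exactly in negative lanes.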
    const __m128i vacc0123 = _mm_add_epi32(vbias, _mm_cvtepi16_epi32(vacc0x01234567));
    const __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)));
    const __m128i vacc89AB = _mm_add_epi32(vbias, _mm_cvtepi16_epi32(vacc0x89ABCDEF));
    const __m128i vaccCDEF = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0x89ABCDEF, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x89ABCDEF)));
    const __m128i vaccGHIJ = _mm_add_epi32(vbias, _mm_cvtepi16_epi32(vacc0xGHIJKLMN));
    const __m128i vaccKLMN = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0xGHIJKLMN, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0xGHIJKLMN)));

    _mm_store_si128((__m128i*) b, vacc0123);
    _mm_store_si128((__m128i*) (b + 4), vacc4567);
    _mm_store_si128((__m128i*) (b + 8), vacc89AB);
    _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
    _mm_store_si128((__m128i*) (b + 16), vaccGHIJ);
    _mm_store_si128((__m128i*) (b + 20), vaccKLMN);
    b += 24;
  }
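  // Channel remainder: process the tail in groups of 8. The loads and buffer
  // stores may run past `channels`; this assumes the caller sizes `buffer`
  // for round_up_po2(channels, 8) elements, and the extra lanes are dropped
  // by the partial stores in the final pass.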
  if XNN_UNLIKELY(c != 0) {
    do {
      const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      i0 += 8;
      const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      i1 += 8;
      const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      i2 += 8;
      const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
      i3 += 8;
      const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
      i4 += 8;
      const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
      i5 += 8;
      const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
      i6 += 8;

      __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);

      vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi4x01234567);
      vacc1x01234567 = _mm_add_epi16(vacc1x01234567, vxi5x01234567);
      vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi6x01234567);

      // Add up all accumulators to vacc0x01234567
      vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);

      const __m128i vacc0123 = _mm_add_epi32(vbias, _mm_cvtepi16_epi32(vacc0x01234567));
      const __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)));

      _mm_store_si128((__m128i*) b, vacc0123);
      _mm_store_si128((__m128i*) (b + 4), vacc4567);
      b += 8;

      c = doz(c, 8);
    } while (c != 0);
  }

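  // Middle passes: accumulate each following block of 7 rows into the buffer.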
  for (rows -= 7; rows > 7; rows -= 7) {
    i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
    i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
    i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
    i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
    i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
    i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
    i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);

    int32_t* b = buffer;
    size_t c = channels;
    for (; c >= 24; c -= 24) {
      const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
      const __m128i vxi0xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16)));
      i0 += 24;
      const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
      const __m128i vxi1xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16)));
      i1 += 24;
      const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
      const __m128i vxi2xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16)));
      i2 += 24;
      const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
      const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
      const __m128i vxi3xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16)));
      i3 += 24;
      const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
      const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
      const __m128i vxi4xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16)));
      i4 += 24;
      const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
      const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
      const __m128i vxi5xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16)));
      i5 += 24;
      const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
      const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
      const __m128i vxi6xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16)));
      i6 += 24;

      __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
      __m128i vacc0xGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
      __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
      __m128i vacc1x89ABCDEF = _mm_add_epi16(vxi2x89ABCDEF, vxi3x89ABCDEF);
      __m128i vacc1xGHIJKLMN = _mm_add_epi16(vxi2xGHIJKLMN, vxi3xGHIJKLMN);

      vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi4x01234567);
      vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi4x89ABCDEF);
      vacc0xGHIJKLMN = _mm_add_epi16(vacc0xGHIJKLMN, vxi4xGHIJKLMN);
      vacc1x01234567 = _mm_add_epi16(vacc1x01234567, vxi5x01234567);
      vacc1x89ABCDEF = _mm_add_epi16(vacc1x89ABCDEF, vxi5x89ABCDEF);
      vacc1xGHIJKLMN = _mm_add_epi16(vacc1xGHIJKLMN, vxi5xGHIJKLMN);
      vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi6x01234567);
      vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi6x89ABCDEF);
      vacc0xGHIJKLMN = _mm_add_epi16(vacc0xGHIJKLMN, vxi6xGHIJKLMN);

      // Add up all accumulators to vacc0x0123456789ABCDEFGHIJKLMN
      vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
      vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vacc1x89ABCDEF);
      vacc0xGHIJKLMN = _mm_add_epi16(vacc0xGHIJKLMN, vacc1xGHIJKLMN);

      const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const __m128i*) (b + 0)));
      const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), _mm_load_si128((const __m128i*) (b + 4)));
      const __m128i vacc89AB = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x89ABCDEF), _mm_load_si128((const __m128i*) (b + 8)));
      const __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x89ABCDEF, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x89ABCDEF)), _mm_load_si128((const __m128i*) (b + 12)));
      const __m128i vaccGHIJ = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0xGHIJKLMN), _mm_load_si128((const __m128i*) (b + 16)));
      const __m128i vaccKLMN = _mm_add_epi32(_mm_unpackhi_epi16(vacc0xGHIJKLMN, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0xGHIJKLMN)), _mm_load_si128((const __m128i*) (b + 20)));

      _mm_store_si128((__m128i*) b, vacc0123);
      _mm_store_si128((__m128i*) (b + 4), vacc4567);
      _mm_store_si128((__m128i*) (b + 8), vacc89AB);
      _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
      _mm_store_si128((__m128i*) (b + 16), vaccGHIJ);
      _mm_store_si128((__m128i*) (b + 20), vaccKLMN);
      b += 24;
    }
    if XNN_UNLIKELY(c != 0) {
      do {
        const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
        i0 += 8;
        const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
        i1 += 8;
        const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
        i2 += 8;
        const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
        i3 += 8;
        const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
        i4 += 8;
        const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
        i5 += 8;
        const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
        i6 += 8;

        __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
        __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);

        vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi4x01234567);
        vacc1x01234567 = _mm_add_epi16(vacc1x01234567, vxi5x01234567);
        vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi6x01234567);

        // Add up all accumulators to vacc0x01234567
        vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);

        const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const __m128i*) b));
        const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), _mm_load_si128((const __m128i*) (b + 4)));

        _mm_store_si128((__m128i*) b, vacc0123);
        _mm_store_si128((__m128i*) (b + 4), vacc4567);
        b += 8;

        c = doz(c, 8);
      } while (c != 0);
    }
  }

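  // Final pass: between 1 and 7 rows remain. Point the pointers of missing
  // rows at the zero vector so they add nothing to the sums.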
  i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
  i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
  if XNN_UNPREDICTABLE(rows < 2) {
    i1 = zero;
  }
  i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 2) {
    i2 = zero;
  }
  i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
  if XNN_UNPREDICTABLE(rows < 4) {
    i3 = zero;
  }
  i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 4) {
    i4 = zero;
  }
  i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
  if XNN_UNPREDICTABLE(rows < 6) {
    i5 = zero;
  }
  i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
  if XNN_UNPREDICTABLE(rows <= 6) {
    i6 = zero;
  }

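  // Requantization constants: the multiplier, rounding addend, and right-shift
  // amount encode the quantization scale, which folds in the division by the
  // number of pooled rows.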
  const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
  const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
  const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
  while (channels >= 24) {
    const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
    const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
    const __m128i vxi0xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 16)));
    i0 += 24;
    const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
    const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
    const __m128i vxi1xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 16)));
    i1 += 24;
    const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
    const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
    const __m128i vxi2xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 16)));
    i2 += 24;
    const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
    const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
    const __m128i vxi3xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 16)));
    i3 += 24;
    const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
    const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
    const __m128i vxi4xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 16)));
    i4 += 24;
    const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
    const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
    const __m128i vxi5xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 16)));
    i5 += 24;
    const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
    const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
    const __m128i vxi6xGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 16)));
    i6 += 24;

    __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
    __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
    __m128i vacc0xGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN);
    __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
    __m128i vacc1x89ABCDEF = _mm_add_epi16(vxi2x89ABCDEF, vxi3x89ABCDEF);
    __m128i vacc1xGHIJKLMN = _mm_add_epi16(vxi2xGHIJKLMN, vxi3xGHIJKLMN);

    vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi4x01234567);
    vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi4x89ABCDEF);
    vacc0xGHIJKLMN = _mm_add_epi16(vacc0xGHIJKLMN, vxi4xGHIJKLMN);
    vacc1x01234567 = _mm_add_epi16(vacc1x01234567, vxi5x01234567);
    vacc1x89ABCDEF = _mm_add_epi16(vacc1x89ABCDEF, vxi5x89ABCDEF);
    vacc1xGHIJKLMN = _mm_add_epi16(vacc1xGHIJKLMN, vxi5xGHIJKLMN);
    vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi6x01234567);
    vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vxi6x89ABCDEF);
    vacc0xGHIJKLMN = _mm_add_epi16(vacc0xGHIJKLMN, vxi6xGHIJKLMN);

    // Add up all accumulators to vacc0x0123456789ABCDEFGHIJKLMN
    vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);
    vacc0x89ABCDEF = _mm_add_epi16(vacc0x89ABCDEF, vacc1x89ABCDEF);
    vacc0xGHIJKLMN = _mm_add_epi16(vacc0xGHIJKLMN, vacc1xGHIJKLMN);

    const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const __m128i*) (buffer + 0)));
    const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), _mm_load_si128((const __m128i*) (buffer + 4)));
    const __m128i vacc89AB = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x89ABCDEF), _mm_load_si128((const __m128i*) (buffer + 8)));
    const __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x89ABCDEF, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x89ABCDEF)), _mm_load_si128((const __m128i*) (buffer + 12)));
    const __m128i vaccGHIJ = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0xGHIJKLMN), _mm_load_si128((const __m128i*) (buffer + 16)));
    const __m128i vaccKLMN = _mm_add_epi32(_mm_unpackhi_epi16(vacc0xGHIJKLMN, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0xGHIJKLMN)), _mm_load_si128((const __m128i*) (buffer + 20)));
    buffer += 24;

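    // Requantize via fixed-point multiplication on magnitudes: |acc| times
    // the multiplier (unsigned _mm_mul_epu32 on even/odd 32-bit lanes
    // separately), plus the rounding constant, shifted right, with the sign
    // restored afterwards by _mm_sign_epi32. Working on magnitudes makes the
    // rounding behave identically for positive and negative sums;
    // _mm_blend_epi16 re-interleaves the even/odd lane results.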
    const __m128i vabsacc0123 = _mm_abs_epi32(vacc0123);
    const __m128i vabsacc4567 = _mm_abs_epi32(vacc4567);
    const __m128i vabsacc89AB = _mm_abs_epi32(vacc89AB);
    const __m128i vabsaccCDEF = _mm_abs_epi32(vaccCDEF);
    const __m128i vabsaccGHIJ = _mm_abs_epi32(vaccGHIJ);
    const __m128i vabsaccKLMN = _mm_abs_epi32(vaccKLMN);

    const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1));
    const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1));
    const __m128i vabsacc9B = _mm_shuffle_epi32(vabsacc89AB, _MM_SHUFFLE(3, 3, 1, 1));
    const __m128i vabsaccDF = _mm_shuffle_epi32(vabsaccCDEF, _MM_SHUFFLE(3, 3, 1, 1));
    const __m128i vabsaccHJ = _mm_shuffle_epi32(vabsaccGHIJ, _MM_SHUFFLE(3, 3, 1, 1));
    const __m128i vabsaccLN = _mm_shuffle_epi32(vabsaccKLMN, _MM_SHUFFLE(3, 3, 1, 1));

    const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier);
    const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier);
    const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier);
    const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier);
    const __m128i vabsprod8A = _mm_mul_epu32(vabsacc89AB, vmultiplier);
    const __m128i vabsprod9B = _mm_mul_epu32(vabsacc9B, vmultiplier);
    const __m128i vabsprodCE = _mm_mul_epu32(vabsaccCDEF, vmultiplier);
    const __m128i vabsprodDF = _mm_mul_epu32(vabsaccDF, vmultiplier);
    const __m128i vabsprodGI = _mm_mul_epu32(vabsaccGHIJ, vmultiplier);
    const __m128i vabsprodHJ = _mm_mul_epu32(vabsaccHJ, vmultiplier);
    const __m128i vabsprodKM = _mm_mul_epu32(vabsaccKLMN, vmultiplier);
    const __m128i vabsprodLN = _mm_mul_epu32(vabsaccLN, vmultiplier);

    const __m128i vabsout02 = _mm_srl_epi64(_mm_add_epi64(vabsprod02, vrounding), vshift);
    const __m128i vabsout13 = _mm_srl_epi64(_mm_add_epi64(vabsprod13, vrounding), vshift);
    const __m128i vabsout46 = _mm_srl_epi64(_mm_add_epi64(vabsprod46, vrounding), vshift);
    const __m128i vabsout57 = _mm_srl_epi64(_mm_add_epi64(vabsprod57, vrounding), vshift);
    const __m128i vabsout8A = _mm_srl_epi64(_mm_add_epi64(vabsprod8A, vrounding), vshift);
    const __m128i vabsout9B = _mm_srl_epi64(_mm_add_epi64(vabsprod9B, vrounding), vshift);
    const __m128i vabsoutCE = _mm_srl_epi64(_mm_add_epi64(vabsprodCE, vrounding), vshift);
    const __m128i vabsoutDF = _mm_srl_epi64(_mm_add_epi64(vabsprodDF, vrounding), vshift);
    const __m128i vabsoutGI = _mm_srl_epi64(_mm_add_epi64(vabsprodGI, vrounding), vshift);
    const __m128i vabsoutHJ = _mm_srl_epi64(_mm_add_epi64(vabsprodHJ, vrounding), vshift);
    const __m128i vabsoutKM = _mm_srl_epi64(_mm_add_epi64(vabsprodKM, vrounding), vshift);
    const __m128i vabsoutLN = _mm_srl_epi64(_mm_add_epi64(vabsprodLN, vrounding), vshift);

    const __m128i vabsout0123 = _mm_blend_epi16(vabsout02, _mm_shuffle_epi32(vabsout13, _MM_SHUFFLE(2, 2, 0, 0)), 0xCC);
    const __m128i vabsout4567 = _mm_blend_epi16(vabsout46, _mm_shuffle_epi32(vabsout57, _MM_SHUFFLE(2, 2, 0, 0)), 0xCC);
    const __m128i vabsout89AB = _mm_blend_epi16(vabsout8A, _mm_shuffle_epi32(vabsout9B, _MM_SHUFFLE(2, 2, 0, 0)), 0xCC);
    const __m128i vabsoutCDEF = _mm_blend_epi16(vabsoutCE, _mm_shuffle_epi32(vabsoutDF, _MM_SHUFFLE(2, 2, 0, 0)), 0xCC);
    const __m128i vabsoutGHIJ = _mm_blend_epi16(vabsoutGI, _mm_shuffle_epi32(vabsoutHJ, _MM_SHUFFLE(2, 2, 0, 0)), 0xCC);
    const __m128i vabsoutKLMN = _mm_blend_epi16(vabsoutKM, _mm_shuffle_epi32(vabsoutLN, _MM_SHUFFLE(2, 2, 0, 0)), 0xCC);

    const __m128i vout0123 = _mm_sign_epi32(vabsout0123, vacc0123);
    const __m128i vout4567 = _mm_sign_epi32(vabsout4567, vacc4567);
    const __m128i vout89AB = _mm_sign_epi32(vabsout89AB, vacc89AB);
    const __m128i voutCDEF = _mm_sign_epi32(vabsoutCDEF, vaccCDEF);
    const __m128i voutGHIJ = _mm_sign_epi32(vabsoutGHIJ, vaccGHIJ);
    const __m128i voutKLMN = _mm_sign_epi32(vabsoutKLMN, vaccKLMN);

    const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
    __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vout0123, vout4567), voutput_zero_point);
    __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vout89AB, voutCDEF), voutput_zero_point);
    __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(voutGHIJ, voutKLMN), voutput_zero_point);

    const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
    const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
    vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);
    vout89ABCDEF = _mm_min_epi16(_mm_max_epi16(vout89ABCDEF, voutput_min), voutput_max);
    voutGHIJKLMN = _mm_min_epi16(_mm_max_epi16(voutGHIJKLMN, voutput_min), voutput_max);

    __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
    __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);

    _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
    _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
    output += 24;

    channels -= 24;
  }
  if XNN_UNLIKELY(channels != 0) {
    do {
      const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
      i0 += 8;
      const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
      i1 += 8;
      const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
      i2 += 8;
      const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
      i3 += 8;
      const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
      i4 += 8;
      const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
      i5 += 8;
      const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
      i6 += 8;

      __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
      __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);

      vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi4x01234567);
      vacc1x01234567 = _mm_add_epi16(vacc1x01234567, vxi5x01234567);
      vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vxi6x01234567);

      // Add up all accumulators to vacc0x01234567
      vacc0x01234567 = _mm_add_epi16(vacc0x01234567, vacc1x01234567);

      const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const __m128i*) buffer));
      const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567)), _mm_load_si128((const __m128i*) (buffer + 4)));
      buffer += 8;

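      // Requantize the tail group with the same magnitude-based fixed-point
      // sequence as the main loop.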
      const __m128i vabsacc0123 = _mm_abs_epi32(vacc0123);
      const __m128i vabsacc4567 = _mm_abs_epi32(vacc4567);

      const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1));
      const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1));

      const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier);
      const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier);
      const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier);
      const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier);

      const __m128i vabsout02 = _mm_srl_epi64(_mm_add_epi64(vabsprod02, vrounding), vshift);
      const __m128i vabsout13 = _mm_srl_epi64(_mm_add_epi64(vabsprod13, vrounding), vshift);
      const __m128i vabsout46 = _mm_srl_epi64(_mm_add_epi64(vabsprod46, vrounding), vshift);
      const __m128i vabsout57 = _mm_srl_epi64(_mm_add_epi64(vabsprod57, vrounding), vshift);

      const __m128i vabsout0123 = _mm_blend_epi16(vabsout02, _mm_shuffle_epi32(vabsout13, _MM_SHUFFLE(2, 2, 0, 0)), 0xCC);
      const __m128i vabsout4567 = _mm_blend_epi16(vabsout46, _mm_shuffle_epi32(vabsout57, _MM_SHUFFLE(2, 2, 0, 0)), 0xCC);

      const __m128i vout0123 = _mm_sign_epi32(vabsout0123, vacc0123);
      const __m128i vout4567 = _mm_sign_epi32(vabsout4567, vacc4567);

      const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vout0123, vout4567), voutput_zero_point);

      const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
      const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
      vout01234567 = _mm_min_epi16(_mm_max_epi16(vout01234567, voutput_min), voutput_max);

      __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);

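      // Store 8 channels when a full group remains; otherwise write the tail
      // with 4-, 2-, and 1-byte stores, shifting consumed bytes out of the
      // vector between stores.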
      if XNN_LIKELY(channels >= 8) {
        _mm_storel_epi64((__m128i*) output, vout0123456701234567);
        output += 8;
        channels -= 8;
      } else {
        if (channels & 4) {
          *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
          vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
          output += 4;
        }
        if (channels & 2) {
          *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
          vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
          output += 2;
        }
        if (channels & 1) {
          *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
          output += 1;
        }
        channels = 0;
      }
    } while (channels != 0);
  }
}