Lines matching refs: __m128i, all inside xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c16_acc2()
40  const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
44  const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
45  const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
47  const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
48  const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
50  const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
51  const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
53  const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
54  const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
56  const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
57  const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
59  const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
60  const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
62  const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
63  const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
67  __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
68  __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
69  __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
70  __m128i vacc1x89ABCDEF = _mm_add_epi16(vxi2x89ABCDEF, vxi3x89ABCDEF);
83  const __m128i vacc0123 = _mm_add_epi32(vbias, _mm_cvtepi16_epi32(vacc0x01234567));
84  …const __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_…
85  const __m128i vacc89AB = _mm_add_epi32(vbias, _mm_cvtepi16_epi32(vacc0x89ABCDEF));
86  …const __m128i vaccCDEF = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0x89ABCDEF, _mm_cmpgt_epi16(_…
88  _mm_store_si128((__m128i*) b, vacc0123);
89  _mm_store_si128((__m128i*) (b + 4), vacc4567);
90  _mm_store_si128((__m128i*) (b + 8), vacc89AB);
91  _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
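Note: lines 40-91 are the first 7-row pass of the kernel: each row of 16 signed 8-bit channels is loaded 8 lanes at a time, sign-extended to 16 bits with _mm_cvtepi8_epi16, summed into two 16-bit accumulator trees (the _acc2 in the kernel name; the later additions reuse the accumulators and therefore are not matched here), widened to 32 bits with the bias added, and stored to the scratch buffer b. A minimal scalar sketch of the same computation, with an illustrative helper name and signature that are not from XNNPACK:

#include <stddef.h>
#include <stdint.h>

/* Scalar model of the first pass: sum 7 sign-extended int8 rows per channel,
   add the bias, and store the 32-bit partial sum into the scratch buffer b. */
static void first_pass_scalar(const int8_t* rows[7], int32_t bias,
                              int32_t* b, size_t channels) {
  for (size_t c = 0; c < channels; c++) {
    int32_t acc = bias;
    for (size_t r = 0; r < 7; r++) {
      acc += (int32_t) rows[r][c];  /* int8 -> int32 sign extension */
    }
    b[c] = acc;
  }
}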
107  const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
108  const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
110  const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
111  const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
113  const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
114  const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
116  const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
117  const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
119  const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
120  const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
122  const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
123  const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
125  const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
126  const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
130  __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
131  __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
132  __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
133  __m128i vacc1x89ABCDEF = _mm_add_epi16(vxi2x89ABCDEF, vxi3x89ABCDEF);
146  …const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const _…
147  …__m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si…
148  …const __m128i vacc89AB = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x89ABCDEF), _mm_load_si128((const _…
149  …__m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x89ABCDEF, _mm_cmpgt_epi16(_mm_setzero_si…
151  _mm_store_si128((__m128i*) b, vacc0123);
152  _mm_store_si128((__m128i*) (b + 4), vacc4567);
153  _mm_store_si128((__m128i*) (b + 8), vacc89AB);
154  _mm_store_si128((__m128i*) (b + 12), vaccCDEF);
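Note: lines 107-154 are the intermediate 7-row passes: the same accumulation, except the widened sums are combined with the partial sums held so far (the _mm_load_si128 operands on the truncated lines 146-149 are cut off in this listing) and written back to b. A scalar sketch under the same illustrative naming as above:

#include <stddef.h>
#include <stdint.h>

/* Scalar model of an intermediate pass: accumulate 7 more rows on top of
   the 32-bit partial sums already held in the scratch buffer b. */
static void middle_pass_scalar(const int8_t* rows[7], int32_t* b, size_t channels) {
  for (size_t c = 0; c < channels; c++) {
    int32_t acc = b[c];
    for (size_t r = 0; r < 7; r++) {
      acc += (int32_t) rows[r][c];
    }
    b[c] = acc;
  }
}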
185  const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
186  const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
187  const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift);
189  const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
190  const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i0 + 8)));
192  const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
193  const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i1 + 8)));
195  const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
196  const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i2 + 8)));
198  const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
199  const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i3 + 8)));
201  const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
202  const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i4 + 8)));
204  const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
205  const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i5 + 8)));
207  const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
208  const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i6 + 8)));
212  __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
213  __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF);
214  __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
215  __m128i vacc1x89ABCDEF = _mm_add_epi16(vxi2x89ABCDEF, vxi3x89ABCDEF);
228  …const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const _…
229  …__m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si…
230  …const __m128i vacc89AB = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x89ABCDEF), _mm_load_si128((const _…
231  …__m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x89ABCDEF, _mm_cmpgt_epi16(_mm_setzero_si…
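Note: the truncated lines 84, 86, 147, 149, 229 and 231 all use the same widening idiom. _mm_cvtepi16_epi32 sign-extends only the low four 16-bit lanes, so the high four lanes are widened by interleaving with a sign mask produced by _mm_cmpgt_epi16(_mm_setzero_si128(), v). A self-contained sketch of that idiom, with an illustrative helper name:

#include <smmintrin.h>  /* SSE4.1 */

/* Sign-extend eight int16 lanes into two __m128i of int32:
   lanes 0-3 via _mm_cvtepi16_epi32, lanes 4-7 by interleaving each lane
   with a mask that is 0xFFFF exactly where the lane is negative. */
static void widen_epi16_to_epi32(__m128i v, __m128i* lo, __m128i* hi) {
  const __m128i vsign = _mm_cmpgt_epi16(_mm_setzero_si128(), v);
  *lo = _mm_cvtepi16_epi32(v);
  *hi = _mm_unpackhi_epi16(v, vsign);
}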
234  const __m128i vabsacc0123 = _mm_abs_epi32(vacc0123);
235  const __m128i vabsacc4567 = _mm_abs_epi32(vacc4567);
236  const __m128i vabsacc89AB = _mm_abs_epi32(vacc89AB);
237  const __m128i vabsaccCDEF = _mm_abs_epi32(vaccCDEF);
239  const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1));
240  const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1));
241  const __m128i vabsacc9B = _mm_shuffle_epi32(vabsacc89AB, _MM_SHUFFLE(3, 3, 1, 1));
242  const __m128i vabsaccDF = _mm_shuffle_epi32(vabsaccCDEF, _MM_SHUFFLE(3, 3, 1, 1));
244  const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier);
245  const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier);
246  const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier);
247  const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier);
248  const __m128i vabsprod8A = _mm_mul_epu32(vabsacc89AB, vmultiplier);
249  const __m128i vabsprod9B = _mm_mul_epu32(vabsacc9B, vmultiplier);
250  const __m128i vabsprodCE = _mm_mul_epu32(vabsaccCDEF, vmultiplier);
251  const __m128i vabsprodDF = _mm_mul_epu32(vabsaccDF, vmultiplier);
253  const __m128i vabsout02 = _mm_srl_epi64(_mm_add_epi64(vabsprod02, vrounding), vshift);
254  const __m128i vabsout13 = _mm_srl_epi64(_mm_add_epi64(vabsprod13, vrounding), vshift);
255  const __m128i vabsout46 = _mm_srl_epi64(_mm_add_epi64(vabsprod46, vrounding), vshift);
256  const __m128i vabsout57 = _mm_srl_epi64(_mm_add_epi64(vabsprod57, vrounding), vshift);
257  const __m128i vabsout8A = _mm_srl_epi64(_mm_add_epi64(vabsprod8A, vrounding), vshift);
258  const __m128i vabsout9B = _mm_srl_epi64(_mm_add_epi64(vabsprod9B, vrounding), vshift);
259  const __m128i vabsoutCE = _mm_srl_epi64(_mm_add_epi64(vabsprodCE, vrounding), vshift);
260  const __m128i vabsoutDF = _mm_srl_epi64(_mm_add_epi64(vabsprodDF, vrounding), vshift);
262  …const __m128i vabsout0123 = _mm_blend_epi16(vabsout02, _mm_shuffle_epi32(vabsout13, _MM_SHUFFLE(2,…
263  …const __m128i vabsout4567 = _mm_blend_epi16(vabsout46, _mm_shuffle_epi32(vabsout57, _MM_SHUFFLE(2,…
264  …const __m128i vabsout89AB = _mm_blend_epi16(vabsout8A, _mm_shuffle_epi32(vabsout9B, _MM_SHUFFLE(2,…
265  …const __m128i vabsoutCDEF = _mm_blend_epi16(vabsoutCE, _mm_shuffle_epi32(vabsoutDF, _MM_SHUFFLE(2,…
267  const __m128i vout0123 = _mm_sign_epi32(vabsout0123, vacc0123);
268  const __m128i vout4567 = _mm_sign_epi32(vabsout4567, vacc4567);
269  const __m128i vout89AB = _mm_sign_epi32(vabsout89AB, vacc89AB);
270  const __m128i voutCDEF = _mm_sign_epi32(vabsoutCDEF, vaccCDEF);
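Note: lines 234-270 are the fixed-point requantization in sign-magnitude form. _mm_mul_epu32 multiplies only the even unsigned 32-bit lanes, so the kernel takes |acc| with _mm_abs_epi32, shuffles the odd lanes into even positions, multiplies the two halves separately, adds the rounding constant, shifts right by the kernel's shift, re-interleaves the halves with _mm_blend_epi16, and restores the sign with _mm_sign_epi32. Per lane this is equivalent to the following scalar sketch (the function name and parameter names are illustrative):

#include <stdint.h>

/* Scalar model of one lane of the requantization:
   out = sign(acc) * ((|acc| * multiplier + rounding) >> shift) */
static int32_t requantize_scalar(int32_t acc, uint32_t multiplier,
                                 uint64_t rounding, uint32_t shift) {
  const uint64_t abs_acc = acc >= 0 ? (uint64_t) acc : (uint64_t) -(int64_t) acc;
  const uint64_t abs_out = (abs_acc * multiplier + rounding) >> shift;
  return acc >= 0 ? (int32_t) abs_out : -(int32_t) abs_out;
}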
272  …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
273  __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vout0123, vout4567), voutput_zero_point);
274  __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vout89AB, voutCDEF), voutput_zero_point);
276  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
277  const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
281  __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
283  _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
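Note: lines 272-283 finish the main 16-channel iteration: the 32-bit results are packed to 16 bits with signed saturation, the output zero point is added with saturating _mm_adds_epi16, the values are clamped to the loaded output_min/output_max range (the clamp itself falls on lines that declare no new __m128i and so are not matched), then packed to 16 signed bytes and stored unaligned. A scalar sketch of the same output stage, with illustrative names and saturation only approximated in comments:

#include <stdint.h>

/* Scalar model of the output stage for one lane: add the zero point,
   clamp to the quantized range, and narrow to int8. */
static int8_t output_stage_scalar(int32_t requantized, int16_t zero_point,
                                  int16_t out_min, int16_t out_max) {
  int32_t v = requantized + zero_point;  /* the kernel uses saturating 16-bit adds here */
  if (v < out_min) v = out_min;
  if (v > out_max) v = out_max;
  return (int8_t) v;                     /* _mm_packs_epi16 also saturates to [-128, 127] */
}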
290  const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i0));
292  const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i1));
294  const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i2));
296  const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i3));
298  const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i4));
300  const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i5));
302  const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i6));
306  __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567);
307  __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567);
316  …const __m128i vacc0123 = _mm_add_epi32(_mm_cvtepi16_epi32(vacc0x01234567), _mm_load_si128((const _…
317  …__m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, _mm_cmpgt_epi16(_mm_setzero_si…
320  const __m128i vabsacc0123 = _mm_abs_epi32(vacc0123);
321  const __m128i vabsacc4567 = _mm_abs_epi32(vacc4567);
323  const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1));
324  const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1));
326  const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier);
327  const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier);
328  const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier);
329  const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier);
331  const __m128i vabsout02 = _mm_srl_epi64(_mm_add_epi64(vabsprod02, vrounding), vshift);
332  const __m128i vabsout13 = _mm_srl_epi64(_mm_add_epi64(vabsprod13, vrounding), vshift);
333  const __m128i vabsout46 = _mm_srl_epi64(_mm_add_epi64(vabsprod46, vrounding), vshift);
334  const __m128i vabsout57 = _mm_srl_epi64(_mm_add_epi64(vabsprod57, vrounding), vshift);
336  …const __m128i vabsout0123 = _mm_blend_epi16(vabsout02, _mm_shuffle_epi32(vabsout13, _MM_SHUFFLE(2,…
337  …const __m128i vabsout4567 = _mm_blend_epi16(vabsout46, _mm_shuffle_epi32(vabsout57, _MM_SHUFFLE(2,…
339  const __m128i vout0123 = _mm_sign_epi32(vabsout0123, vacc0123);
340  const __m128i vout4567 = _mm_sign_epi32(vabsout4567, vacc4567);
342  …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
343  __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vout0123, vout4567), voutput_zero_point);
345  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
346  const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
349  __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
352  _mm_storel_epi64((__m128i*) output, vout0123456701234567);
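Note: lines 290-352 are the channel-remainder path: the same load, accumulate, requantize, and output sequence runs on 8 channels at a time and ends with an 8-byte _mm_storel_epi64. Taken together, each output element of the kernel corresponds to this illustrative end-to-end scalar model (all names and the signature are illustrative, not the ukernel's actual interface):

#include <stddef.h>
#include <stdint.h>

/* End-to-end scalar model of one output channel: sum all rows, add the bias,
   requantize, add the output zero point, clamp, and narrow to int8. */
static int8_t gavgpool_channel_scalar(const int8_t* column, size_t rows, size_t stride,
                                      int32_t bias, uint32_t multiplier, uint64_t rounding,
                                      uint32_t shift, int16_t zero_point,
                                      int16_t out_min, int16_t out_max) {
  int32_t acc = bias;
  for (size_t r = 0; r < rows; r++) {
    acc += (int32_t) column[r * stride];
  }
  const uint64_t abs_acc = acc >= 0 ? (uint64_t) acc : (uint64_t) -(int64_t) acc;
  const uint64_t abs_out = (abs_acc * multiplier + rounding) >> shift;
  int32_t out = (acc >= 0 ? (int32_t) abs_out : -(int32_t) abs_out) + zero_point;
  if (out < out_min) out = out_min;
  if (out > out_max) out = out_max;
  return (int8_t) out;
}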