Lines Matching refs:__m128i

40   const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);  in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
44 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
45 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
47 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
48 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
50 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
51 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
53 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
54 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
56 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
57 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
59 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
60 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
62 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
63 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
66 …const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
67 …const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
68 …const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
69 …const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
70 …const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
71 …const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
72 …const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
73 …const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
74 …const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
75 …const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
76 …const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
77 …const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
78 …const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
79 …const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
81 __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
82 __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
83 __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
84 __m128i vacc1x89ABCDEF = _mm_add_epi16(vxi2x89ABCDEF, vxi3x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
97 const __m128i vsgnacc0x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
98 …const __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vacc0x01234567, vsgnacc0x01234567… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
99 …const __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0x01234567, vsgnacc0x01234567… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
100 const __m128i vsgnacc0x89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
101 …const __m128i vacc89AB = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
102 …const __m128i vaccCDEF = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
104 _mm_store_si128((__m128i*) b, vacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
105 _mm_store_si128((__m128i*) (b + 4), vacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
106 _mm_store_si128((__m128i*) (b + 8), vacc89AB); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
107 _mm_store_si128((__m128i*) (b + 12), vaccCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
123 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
124 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
126 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
127 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
129 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
130 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
132 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
133 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
135 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
136 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
138 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
139 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
141 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
142 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
145 …const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
146 …const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
147 …const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
148 …const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
149 …const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
150 …const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
151 …const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
152 …const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
153 …const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
154 …const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
155 …const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
156 …const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
157 …const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
158 …const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
160 __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
161 __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
162 __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
163 __m128i vacc1x89ABCDEF = _mm_add_epi16(vxi2x89ABCDEF, vxi3x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
176 const __m128i vsgnacc0x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
177 …const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc0x01234567, vsgnacc0x01234567), _mm_… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
178 …const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, vsgnacc0x01234567), _mm_… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
179 const __m128i vsgnacc0x89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
180 …const __m128i vacc89AB = _mm_add_epi32(_mm_unpacklo_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF), _mm_… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
181 …const __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF), _mm_… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
183 _mm_store_si128((__m128i*) b, vacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
184 _mm_store_si128((__m128i*) (b + 4), vacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
185 _mm_store_si128((__m128i*) (b + 8), vacc89AB); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
186 _mm_store_si128((__m128i*) (b + 12), vaccCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
217 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
218 const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
219 const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
221 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
222 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
224 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
225 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
227 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
228 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
230 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
231 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
233 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
234 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
236 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
237 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
239 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
240 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
243 …const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
244 …const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
245 …const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
246 …const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
247 …const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
248 …const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
249 …const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
250 …const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
251 …const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
252 …const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
253 …const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
254 …const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
255 …const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
256 …const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
258 __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
259 __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
260 __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
261 __m128i vacc1x89ABCDEF = _mm_add_epi16(vxi2x89ABCDEF, vxi3x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
274 const __m128i vsgnacc0x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
275 …const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc0x01234567, vsgnacc0x01234567), _mm_… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
276 …const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, vsgnacc0x01234567), _mm_… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
277 const __m128i vsgnacc0x89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
278 …const __m128i vacc89AB = _mm_add_epi32(_mm_unpacklo_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF), _mm_… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
279 …const __m128i vaccCDEF = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF), _mm_… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
282 const __m128i vsgnacc0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
283 const __m128i vsgnacc4567 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
284 const __m128i vsgnacc89AB = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc89AB); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
285 const __m128i vsgnaccCDEF = _mm_cmpgt_epi32(_mm_setzero_si128(), vaccCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
287 const __m128i vabsacc0123 = _mm_sub_epi32(_mm_xor_si128(vacc0123, vsgnacc0123), vsgnacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
288 const __m128i vabsacc4567 = _mm_sub_epi32(_mm_xor_si128(vacc4567, vsgnacc4567), vsgnacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
289 const __m128i vabsacc89AB = _mm_sub_epi32(_mm_xor_si128(vacc89AB, vsgnacc89AB), vsgnacc89AB); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
290 const __m128i vabsaccCDEF = _mm_sub_epi32(_mm_xor_si128(vaccCDEF, vsgnaccCDEF), vsgnaccCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
292 const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
293 const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
294 const __m128i vabsacc9B = _mm_shuffle_epi32(vabsacc89AB, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
295 const __m128i vabsaccDF = _mm_shuffle_epi32(vabsaccCDEF, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
297 const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
298 const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
299 const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
300 const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
301 const __m128i vabsprod8A = _mm_mul_epu32(vabsacc89AB, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
302 const __m128i vabsprod9B = _mm_mul_epu32(vabsacc9B, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
303 const __m128i vabsprodCE = _mm_mul_epu32(vabsaccCDEF, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
304 const __m128i vabsprodDF = _mm_mul_epu32(vabsaccDF, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
306 const __m128i vabsout02 = _mm_srl_epi64(_mm_add_epi64(vabsprod02, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
307 const __m128i vabsout13 = _mm_srl_epi64(_mm_add_epi64(vabsprod13, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
308 const __m128i vabsout46 = _mm_srl_epi64(_mm_add_epi64(vabsprod46, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
309 const __m128i vabsout57 = _mm_srl_epi64(_mm_add_epi64(vabsprod57, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
310 const __m128i vabsout8A = _mm_srl_epi64(_mm_add_epi64(vabsprod8A, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
311 const __m128i vabsout9B = _mm_srl_epi64(_mm_add_epi64(vabsprod9B, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
312 const __m128i vabsoutCE = _mm_srl_epi64(_mm_add_epi64(vabsprodCE, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
313 const __m128i vabsoutDF = _mm_srl_epi64(_mm_add_epi64(vabsprodDF, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
315 const __m128i vabsout0213 = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
317 const __m128i vabsout4657 = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
319 const __m128i vabsout8A9B = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
321 const __m128i vabsoutCEDF = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
324 const __m128i vabsout0123 = _mm_shuffle_epi32(vabsout0213, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
325 const __m128i vabsout4567 = _mm_shuffle_epi32(vabsout4657, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
326 const __m128i vabsout89AB = _mm_shuffle_epi32(vabsout8A9B, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
327 const __m128i vabsoutCDEF = _mm_shuffle_epi32(vabsoutCEDF, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
329 const __m128i vout0123 = _mm_sub_epi32(_mm_xor_si128(vabsout0123, vsgnacc0123), vsgnacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
330 const __m128i vout4567 = _mm_sub_epi32(_mm_xor_si128(vabsout4567, vsgnacc4567), vsgnacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
331 const __m128i vout89AB = _mm_sub_epi32(_mm_xor_si128(vabsout89AB, vsgnacc89AB), vsgnacc89AB); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
332 const __m128i voutCDEF = _mm_sub_epi32(_mm_xor_si128(vabsoutCDEF, vsgnaccCDEF), vsgnaccCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
334 …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
335 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vout0123, vout4567), voutput_zero_point); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
336 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vout89AB, voutCDEF), voutput_zero_point); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
338 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
339 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
343 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
345 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
352 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
354 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
356 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
358 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
360 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
362 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
364 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
367 …const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
368 …const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
369 …const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
370 …const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
371 …const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
372 …const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
373 …const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
375 __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
376 __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
385 const __m128i vsgnacc0x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
386 …const __m128i vacc0123 = _mm_add_epi32(_mm_unpacklo_epi16(vacc0x01234567, vsgnacc0x01234567), _mm_… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
387 …const __m128i vacc4567 = _mm_add_epi32(_mm_unpackhi_epi16(vacc0x01234567, vsgnacc0x01234567), _mm_… in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
390 const __m128i vsgnacc0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
391 const __m128i vsgnacc4567 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
393 const __m128i vabsacc0123 = _mm_sub_epi32(_mm_xor_si128(vacc0123, vsgnacc0123), vsgnacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
394 const __m128i vabsacc4567 = _mm_sub_epi32(_mm_xor_si128(vacc4567, vsgnacc4567), vsgnacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
396 const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
397 const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
399 const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
400 const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
401 const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
402 const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
404 const __m128i vabsout02 = _mm_srl_epi64(_mm_add_epi64(vabsprod02, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
405 const __m128i vabsout13 = _mm_srl_epi64(_mm_add_epi64(vabsprod13, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
406 const __m128i vabsout46 = _mm_srl_epi64(_mm_add_epi64(vabsprod46, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
407 const __m128i vabsout57 = _mm_srl_epi64(_mm_add_epi64(vabsprod57, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
409 const __m128i vabsout0213 = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
411 const __m128i vabsout4657 = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
414 const __m128i vabsout0123 = _mm_shuffle_epi32(vabsout0213, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
415 const __m128i vabsout4567 = _mm_shuffle_epi32(vabsout4657, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
417 const __m128i vout0123 = _mm_sub_epi32(_mm_xor_si128(vabsout0123, vsgnacc0123), vsgnacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
418 const __m128i vout4567 = _mm_sub_epi32(_mm_xor_si128(vabsout4567, vsgnacc4567), vsgnacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
420 …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
421 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vout0123, vout4567), voutput_zero_point); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
423 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
424 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
427 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()
430 _mm_storel_epi64((__m128i*) output, vout0123456701234567); in xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c16_acc2()