Lines Matching refs:__m128i
56 const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
57 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
58 const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
59 const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->sse2.shift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
61 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
62 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
63 const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
65 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
66 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
67 const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
69 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
70 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
71 const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
73 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
74 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
75 const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
77 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
78 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
79 const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
81 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
82 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
83 const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
85 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
86 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
87 const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
90 …const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
91 …const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
92 …const __m128i vxi0xGHIJKLMN = _mm_unpacklo_epi8(vi0xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
93 …const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
94 …const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
95 …const __m128i vxi1xGHIJKLMN = _mm_unpacklo_epi8(vi1xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
96 …const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
97 …const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
98 …const __m128i vxi2xGHIJKLMN = _mm_unpacklo_epi8(vi2xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
99 …const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
100 …const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
101 …const __m128i vxi3xGHIJKLMN = _mm_unpacklo_epi8(vi3xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
102 …const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
103 …const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
104 …const __m128i vxi4xGHIJKLMN = _mm_unpacklo_epi8(vi4xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
105 …const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
106 …const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
107 …const __m128i vxi5xGHIJKLMN = _mm_unpacklo_epi8(vi5xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
108 …const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
109 …const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
110 …const __m128i vxi6xGHIJKLMN = _mm_unpacklo_epi8(vi6xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
112 __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
113 __m128i vacc0x89ABCDEF = _mm_add_epi16(vxi0x89ABCDEF, vxi1x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
114 __m128i vacc0xGHIJKLMN = _mm_add_epi16(vxi0xGHIJKLMN, vxi1xGHIJKLMN); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
115 __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
116 __m128i vacc1x89ABCDEF = _mm_add_epi16(vxi2x89ABCDEF, vxi3x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
117 __m128i vacc1xGHIJKLMN = _mm_add_epi16(vxi2xGHIJKLMN, vxi3xGHIJKLMN); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
134 const __m128i vsgnacc0x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
135 …const __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vacc0x01234567, vsgnacc0x01234567… in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
136 …const __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0x01234567, vsgnacc0x01234567… in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
137 const __m128i vsgnacc0x89ABCDEF = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
138 …const __m128i vacc89AB = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF… in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
139 …const __m128i vaccCDEF = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0x89ABCDEF, vsgnacc0x89ABCDEF… in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
140 const __m128i vsgnacc0xGHIJKLMN = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0xGHIJKLMN); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
141 …const __m128i vaccGHIJ = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vacc0xGHIJKLMN, vsgnacc0xGHIJKLMN… in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
142 …const __m128i vaccKLMN = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0xGHIJKLMN, vsgnacc0xGHIJKLMN… in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
144 const __m128i vsgnacc0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
145 const __m128i vsgnacc4567 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
146 const __m128i vsgnacc89AB = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc89AB); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
147 const __m128i vsgnaccCDEF = _mm_cmpgt_epi32(_mm_setzero_si128(), vaccCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
148 const __m128i vsgnaccGHIJ = _mm_cmpgt_epi32(_mm_setzero_si128(), vaccGHIJ); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
149 const __m128i vsgnaccKLMN = _mm_cmpgt_epi32(_mm_setzero_si128(), vaccKLMN); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
151 const __m128i vabsacc0123 = _mm_sub_epi32(_mm_xor_si128(vacc0123, vsgnacc0123), vsgnacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
152 const __m128i vabsacc4567 = _mm_sub_epi32(_mm_xor_si128(vacc4567, vsgnacc4567), vsgnacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
153 const __m128i vabsacc89AB = _mm_sub_epi32(_mm_xor_si128(vacc89AB, vsgnacc89AB), vsgnacc89AB); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
154 const __m128i vabsaccCDEF = _mm_sub_epi32(_mm_xor_si128(vaccCDEF, vsgnaccCDEF), vsgnaccCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
155 const __m128i vabsaccGHIJ = _mm_sub_epi32(_mm_xor_si128(vaccGHIJ, vsgnaccGHIJ), vsgnaccGHIJ); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
156 const __m128i vabsaccKLMN = _mm_sub_epi32(_mm_xor_si128(vaccKLMN, vsgnaccKLMN), vsgnaccKLMN); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
158 const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
159 const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
160 const __m128i vabsacc9B = _mm_shuffle_epi32(vabsacc89AB, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
161 const __m128i vabsaccDF = _mm_shuffle_epi32(vabsaccCDEF, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
162 const __m128i vabsaccHJ = _mm_shuffle_epi32(vabsaccGHIJ, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
163 const __m128i vabsaccLN = _mm_shuffle_epi32(vabsaccKLMN, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
165 const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
166 const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
167 const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
168 const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
169 const __m128i vabsprod8A = _mm_mul_epu32(vabsacc89AB, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
170 const __m128i vabsprod9B = _mm_mul_epu32(vabsacc9B, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
171 const __m128i vabsprodCE = _mm_mul_epu32(vabsaccCDEF, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
172 const __m128i vabsprodDF = _mm_mul_epu32(vabsaccDF, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
173 const __m128i vabsprodGI = _mm_mul_epu32(vabsaccGHIJ, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
174 const __m128i vabsprodHJ = _mm_mul_epu32(vabsaccHJ, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
175 const __m128i vabsprodKM = _mm_mul_epu32(vabsaccKLMN, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
176 const __m128i vabsprodLN = _mm_mul_epu32(vabsaccLN, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
178 const __m128i vabsout02 = _mm_srl_epi64(_mm_add_epi64(vabsprod02, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
179 const __m128i vabsout13 = _mm_srl_epi64(_mm_add_epi64(vabsprod13, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
180 const __m128i vabsout46 = _mm_srl_epi64(_mm_add_epi64(vabsprod46, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
181 const __m128i vabsout57 = _mm_srl_epi64(_mm_add_epi64(vabsprod57, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
182 const __m128i vabsout8A = _mm_srl_epi64(_mm_add_epi64(vabsprod8A, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
183 const __m128i vabsout9B = _mm_srl_epi64(_mm_add_epi64(vabsprod9B, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
184 const __m128i vabsoutCE = _mm_srl_epi64(_mm_add_epi64(vabsprodCE, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
185 const __m128i vabsoutDF = _mm_srl_epi64(_mm_add_epi64(vabsprodDF, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
186 const __m128i vabsoutGI = _mm_srl_epi64(_mm_add_epi64(vabsprodGI, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
187 const __m128i vabsoutHJ = _mm_srl_epi64(_mm_add_epi64(vabsprodHJ, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
188 const __m128i vabsoutKM = _mm_srl_epi64(_mm_add_epi64(vabsprodKM, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
189 const __m128i vabsoutLN = _mm_srl_epi64(_mm_add_epi64(vabsprodLN, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
191 const __m128i vabsout0213 = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
193 const __m128i vabsout4657 = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
195 const __m128i vabsout8A9B = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
197 const __m128i vabsoutCEDF = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
199 const __m128i vabsoutGIHJ = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
201 const __m128i vabsoutKMLN = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
204 const __m128i vabsout0123 = _mm_shuffle_epi32(vabsout0213, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
205 const __m128i vabsout4567 = _mm_shuffle_epi32(vabsout4657, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
206 const __m128i vabsout89AB = _mm_shuffle_epi32(vabsout8A9B, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
207 const __m128i vabsoutCDEF = _mm_shuffle_epi32(vabsoutCEDF, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
208 const __m128i vabsoutGHIJ = _mm_shuffle_epi32(vabsoutGIHJ, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
209 const __m128i vabsoutKLMN = _mm_shuffle_epi32(vabsoutKMLN, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
211 const __m128i vout0123 = _mm_sub_epi32(_mm_xor_si128(vabsout0123, vsgnacc0123), vsgnacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
212 const __m128i vout4567 = _mm_sub_epi32(_mm_xor_si128(vabsout4567, vsgnacc4567), vsgnacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
213 const __m128i vout89AB = _mm_sub_epi32(_mm_xor_si128(vabsout89AB, vsgnacc89AB), vsgnacc89AB); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
214 const __m128i voutCDEF = _mm_sub_epi32(_mm_xor_si128(vabsoutCDEF, vsgnaccCDEF), vsgnaccCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
215 const __m128i voutGHIJ = _mm_sub_epi32(_mm_xor_si128(vabsoutGHIJ, vsgnaccGHIJ), vsgnaccGHIJ); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
216 const __m128i voutKLMN = _mm_sub_epi32(_mm_xor_si128(vabsoutKLMN, vsgnaccKLMN), vsgnaccKLMN); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
218 …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
219 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vout0123, vout4567), voutput_zero_point); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
220 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vout89AB, voutCDEF), voutput_zero_point); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
221 __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(voutGHIJ, voutKLMN), voutput_zero_point); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
223 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
224 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
229 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
230 __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
232 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
233 _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
240 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
242 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
244 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
246 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
248 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
250 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
252 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
255 …const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
256 …const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
257 …const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
258 …const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
259 …const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
260 …const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
261 …const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
263 __m128i vacc0x01234567 = _mm_add_epi16(vxi0x01234567, vxi1x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
264 __m128i vacc1x01234567 = _mm_add_epi16(vxi2x01234567, vxi3x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
273 const __m128i vsgnacc0x01234567 = _mm_cmpgt_epi16(_mm_setzero_si128(), vacc0x01234567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
274 …const __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vacc0x01234567, vsgnacc0x01234567… in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
275 …const __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vacc0x01234567, vsgnacc0x01234567… in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
277 const __m128i vsgnacc0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
278 const __m128i vsgnacc4567 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
280 const __m128i vabsacc0123 = _mm_sub_epi32(_mm_xor_si128(vacc0123, vsgnacc0123), vsgnacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
281 const __m128i vabsacc4567 = _mm_sub_epi32(_mm_xor_si128(vacc4567, vsgnacc4567), vsgnacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
283 const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
284 const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
286 const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
287 const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
288 const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
289 const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
291 const __m128i vabsout02 = _mm_srl_epi64(_mm_add_epi64(vabsprod02, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
292 const __m128i vabsout13 = _mm_srl_epi64(_mm_add_epi64(vabsprod13, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
293 const __m128i vabsout46 = _mm_srl_epi64(_mm_add_epi64(vabsprod46, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
294 const __m128i vabsout57 = _mm_srl_epi64(_mm_add_epi64(vabsprod57, vrounding), vshift); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
296 const __m128i vabsout0213 = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
298 const __m128i vabsout4657 = _mm_castps_si128( in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
301 const __m128i vabsout0123 = _mm_shuffle_epi32(vabsout0213, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
302 const __m128i vabsout4567 = _mm_shuffle_epi32(vabsout4657, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
304 const __m128i vout0123 = _mm_sub_epi32(_mm_xor_si128(vabsout0123, vsgnacc0123), vsgnacc0123); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
305 const __m128i vout4567 = _mm_sub_epi32(_mm_xor_si128(vabsout4567, vsgnacc4567), vsgnacc4567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
307 …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
308 … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vout0123, vout4567), voutput_zero_point); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
310 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
311 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
314 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()
317 _mm_storel_epi64((__m128i*) output, vout0123456701234567); in xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c24_acc2()