Lines Matching refs:__m128i

83       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);  in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
84 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 4 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
85 __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
86 __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 12 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
89 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
90 …const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
91 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
92 …const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
95 …const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
96 …const __m128i vxk0x01234567 = _mm_unpacklo_epi8(vk0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
97 …const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
98 …const __m128i vxk0x89ABCDEF = _mm_unpacklo_epi8(vk0x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
100 const __m128i vp0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
101 const __m128i vp0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
102 const __m128i vp0x89ABCDEFlo = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
103 const __m128i vp0x89ABCDEFhi = _mm_mulhi_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
110 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
111 …const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
112 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
113 …const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
116 …const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
117 …const __m128i vxk1x01234567 = _mm_unpacklo_epi8(vk1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
118 …const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
119 …const __m128i vxk1x89ABCDEF = _mm_unpacklo_epi8(vk1x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
121 const __m128i vp1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
122 const __m128i vp1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
123 const __m128i vp1x89ABCDEFlo = _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
124 const __m128i vp1x89ABCDEFhi = _mm_mulhi_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
131 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
132 …const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
133 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
134 …const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
137 …const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
138 …const __m128i vxk2x01234567 = _mm_unpacklo_epi8(vk2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
139 …const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
140 …const __m128i vxk2x89ABCDEF = _mm_unpacklo_epi8(vk2x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
142 const __m128i vp2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
143 const __m128i vp2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
144 const __m128i vp2x89ABCDEFlo = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
145 const __m128i vp2x89ABCDEFhi = _mm_mulhi_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
152 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
153 …const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
154 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
155 …const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
158 …const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
159 …const __m128i vxk3x01234567 = _mm_unpacklo_epi8(vk3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
160 …const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
161 …const __m128i vxk3x89ABCDEF = _mm_unpacklo_epi8(vk3x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
163 const __m128i vp3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
164 const __m128i vp3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
165 const __m128i vp3x89ABCDEFlo = _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
166 const __m128i vp3x89ABCDEFhi = _mm_mulhi_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
173 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
174 …const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
175 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
176 …const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
179 …const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
180 …const __m128i vxk4x01234567 = _mm_unpacklo_epi8(vk4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
181 …const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
182 …const __m128i vxk4x89ABCDEF = _mm_unpacklo_epi8(vk4x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
184 const __m128i vp4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
185 const __m128i vp4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
186 const __m128i vp4x89ABCDEFlo = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
187 const __m128i vp4x89ABCDEFhi = _mm_mulhi_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
194 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
195 …const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
196 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
197 …const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
200 …const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
201 …const __m128i vxk5x01234567 = _mm_unpacklo_epi8(vk5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
202 …const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
203 …const __m128i vxk5x89ABCDEF = _mm_unpacklo_epi8(vk5x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
205 const __m128i vp5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
206 const __m128i vp5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
207 const __m128i vp5x89ABCDEFlo = _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
208 const __m128i vp5x89ABCDEFhi = _mm_mulhi_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
215 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
216 …const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
217 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
218 …const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
221 …const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
222 …const __m128i vxk6x01234567 = _mm_unpacklo_epi8(vk6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
223 …const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
224 …const __m128i vxk6x89ABCDEF = _mm_unpacklo_epi8(vk6x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
226 const __m128i vp6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
227 const __m128i vp6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
228 const __m128i vp6x89ABCDEFlo = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
229 const __m128i vp6x89ABCDEFhi = _mm_mulhi_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
236 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
237 …const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
238 const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
239 …const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
242 …const __m128i vxi7x01234567 = _mm_unpacklo_epi8(vi7x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
243 …const __m128i vxk7x01234567 = _mm_unpacklo_epi8(vk7x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
244 …const __m128i vxi7x89ABCDEF = _mm_unpacklo_epi8(vi7x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
245 …const __m128i vxk7x89ABCDEF = _mm_unpacklo_epi8(vk7x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
247 const __m128i vp7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
248 const __m128i vp7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
249 const __m128i vp7x89ABCDEFlo = _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
250 const __m128i vp7x89ABCDEFhi = _mm_mulhi_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
257 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
258 …const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
259 const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
260 …const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
263 …const __m128i vxi8x01234567 = _mm_unpacklo_epi8(vi8x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
264 …const __m128i vxk8x01234567 = _mm_unpacklo_epi8(vk8x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
265 …const __m128i vxi8x89ABCDEF = _mm_unpacklo_epi8(vi8x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
266 …const __m128i vxk8x89ABCDEF = _mm_unpacklo_epi8(vk8x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
268 const __m128i vp8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
269 const __m128i vp8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
270 const __m128i vp8x89ABCDEFlo = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
271 const __m128i vp8x89ABCDEFhi = _mm_mulhi_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
280 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
281 const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
283 const __m128i vnmask0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
284 const __m128i vnmask4567 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
285 const __m128i vnmask89AB = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc89AB); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
286 const __m128i vnmaskCDEF = _mm_cmpgt_epi32(_mm_setzero_si128(), vaccCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
288 const __m128i vabsacc0123 = _mm_abs_epi32(vacc0123); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
289 const __m128i vabsacc4567 = _mm_abs_epi32(vacc4567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
290 const __m128i vabsacc89AB = _mm_abs_epi32(vacc89AB); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
291 const __m128i vabsaccCDEF = _mm_abs_epi32(vaccCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
293 const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
294 const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
295 const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
296 const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
297 const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
298 const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
299 const __m128i vabsacc9B = _mm_shuffle_epi32(vabsacc89AB, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
300 const __m128i vabsprod8A = _mm_mul_epu32(vabsacc89AB, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
301 const __m128i vabsprod9B = _mm_mul_epu32(vabsacc9B, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
302 const __m128i vabsaccDF = _mm_shuffle_epi32(vabsaccCDEF, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
303 const __m128i vabsprodCE = _mm_mul_epu32(vabsaccCDEF, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
304 const __m128i vabsprodDF = _mm_mul_epu32(vabsaccDF, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
306 const __m128i vnmask02 = _mm_shuffle_epi32(vnmask0123, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
307 const __m128i vnmask13 = _mm_shuffle_epi32(vnmask0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
308 const __m128i vnmask46 = _mm_shuffle_epi32(vnmask4567, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
309 const __m128i vnmask57 = _mm_shuffle_epi32(vnmask4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
310 const __m128i vnmask8A = _mm_shuffle_epi32(vnmask89AB, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
311 const __m128i vnmask9B = _mm_shuffle_epi32(vnmask89AB, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
312 const __m128i vnmaskCE = _mm_shuffle_epi32(vnmaskCDEF, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
313 const __m128i vnmaskDF = _mm_shuffle_epi32(vnmaskCDEF, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
315 const __m128i vprod02 = _mm_sub_epi64(_mm_xor_si128(vabsprod02, vnmask02), vnmask02); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
316 const __m128i vprod13 = _mm_sub_epi64(_mm_xor_si128(vabsprod13, vnmask13), vnmask13); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
317 const __m128i vprod46 = _mm_sub_epi64(_mm_xor_si128(vabsprod46, vnmask46), vnmask46); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
318 const __m128i vprod57 = _mm_sub_epi64(_mm_xor_si128(vabsprod57, vnmask57), vnmask57); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
319 const __m128i vprod8A = _mm_sub_epi64(_mm_xor_si128(vabsprod8A, vnmask8A), vnmask8A); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
320 const __m128i vprod9B = _mm_sub_epi64(_mm_xor_si128(vabsprod9B, vnmask9B), vnmask9B); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
321 const __m128i vprodCE = _mm_sub_epi64(_mm_xor_si128(vabsprodCE, vnmaskCE), vnmaskCE); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
322 const __m128i vprodDF = _mm_sub_epi64(_mm_xor_si128(vabsprodDF, vnmaskDF), vnmaskDF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
324 const __m128i vq31prod02 = _mm_srli_epi64(_mm_add_epi64(vprod02, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
325 const __m128i vq31prod13 = _mm_srli_epi64(_mm_add_epi64(vprod13, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
326 const __m128i vq31prod46 = _mm_srli_epi64(_mm_add_epi64(vprod46, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
327 const __m128i vq31prod57 = _mm_srli_epi64(_mm_add_epi64(vprod57, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
328 const __m128i vq31prod8A = _mm_srli_epi64(_mm_add_epi64(vprod8A, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
329 const __m128i vq31prod9B = _mm_srli_epi64(_mm_add_epi64(vprod9B, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
330 const __m128i vq31prodCE = _mm_srli_epi64(_mm_add_epi64(vprodCE, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
331 const __m128i vq31prodDF = _mm_srli_epi64(_mm_add_epi64(vprodDF, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
333 const __m128i vq31prod0213 = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
335 const __m128i vq31prod4657 = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
337 const __m128i vq31prod8A9B = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
339 const __m128i vq31prodCEDF = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
342 const __m128i vq31prod0123 = _mm_shuffle_epi32(vq31prod0213, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
343 const __m128i vq31prod4567 = _mm_shuffle_epi32(vq31prod4657, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
344 const __m128i vq31prod89AB = _mm_shuffle_epi32(vq31prod8A9B, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
345 const __m128i vq31prodCDEF = _mm_shuffle_epi32(vq31prodCEDF, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
347 const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
348 const __m128i vrem0123 = in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
350 const __m128i vrem4567 = in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
352 const __m128i vrem89AB = in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
354 const __m128i vremCDEF = in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
357 …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
358 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
368 …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
369__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
370__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
372 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
373 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
377 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
379 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
385 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
386 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 4 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
389 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
390 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
393 …const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
394 …const __m128i vxk0x01234567 = _mm_unpacklo_epi8(vk0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
396 const __m128i vp0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
397 const __m128i vp0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
402 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
403 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
406 …const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
407 …const __m128i vxk1x01234567 = _mm_unpacklo_epi8(vk1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
409 const __m128i vp1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
410 const __m128i vp1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
415 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
416 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
419 …const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
420 …const __m128i vxk2x01234567 = _mm_unpacklo_epi8(vk2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
422 const __m128i vp2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
423 const __m128i vp2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
428 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
429 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
432 …const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
433 …const __m128i vxk3x01234567 = _mm_unpacklo_epi8(vk3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
435 const __m128i vp3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
436 const __m128i vp3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
441 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
442 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
445 …const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
446 …const __m128i vxk4x01234567 = _mm_unpacklo_epi8(vk4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
448 const __m128i vp4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
449 const __m128i vp4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
454 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
455 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
458 …const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
459 …const __m128i vxk5x01234567 = _mm_unpacklo_epi8(vk5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
461 const __m128i vp5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
462 const __m128i vp5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
467 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
468 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
471 …const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
472 …const __m128i vxk6x01234567 = _mm_unpacklo_epi8(vk6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
474 const __m128i vp6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
475 const __m128i vp6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
480 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
481 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
484 …const __m128i vxi7x01234567 = _mm_unpacklo_epi8(vi7x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
485 …const __m128i vxk7x01234567 = _mm_unpacklo_epi8(vk7x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
487 const __m128i vp7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
488 const __m128i vp7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
493 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
494 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
497 …const __m128i vxi8x01234567 = _mm_unpacklo_epi8(vi8x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
498 …const __m128i vxk8x01234567 = _mm_unpacklo_epi8(vk8x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
500 const __m128i vp8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
501 const __m128i vp8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
509 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
510 const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
512 const __m128i vnmask0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
513 const __m128i vnmask4567 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
515 const __m128i vabsacc0123 = _mm_abs_epi32(vacc0123); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
516 const __m128i vabsacc4567 = _mm_abs_epi32(vacc4567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
518 const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
519 const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
521 const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
522 const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
523 const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
524 const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
526 const __m128i vnmask02 = _mm_shuffle_epi32(vnmask0123, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
527 const __m128i vnmask13 = _mm_shuffle_epi32(vnmask0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
528 const __m128i vnmask46 = _mm_shuffle_epi32(vnmask4567, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
529 const __m128i vnmask57 = _mm_shuffle_epi32(vnmask4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
531 const __m128i vprod02 = _mm_sub_epi64(_mm_xor_si128(vabsprod02, vnmask02), vnmask02); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
532 const __m128i vprod13 = _mm_sub_epi64(_mm_xor_si128(vabsprod13, vnmask13), vnmask13); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
533 const __m128i vprod46 = _mm_sub_epi64(_mm_xor_si128(vabsprod46, vnmask46), vnmask46); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
534 const __m128i vprod57 = _mm_sub_epi64(_mm_xor_si128(vabsprod57, vnmask57), vnmask57); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
536 const __m128i vq31prod02 = _mm_srli_epi64(_mm_add_epi64(vprod02, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
537 const __m128i vq31prod13 = _mm_srli_epi64(_mm_add_epi64(vprod13, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
538 const __m128i vq31prod46 = _mm_srli_epi64(_mm_add_epi64(vprod46, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
539 const __m128i vq31prod57 = _mm_srli_epi64(_mm_add_epi64(vprod57, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
541 const __m128i vq31prod0213 = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
543 const __m128i vq31prod4657 = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
546 const __m128i vq31prod0123 = _mm_shuffle_epi32(vq31prod0213, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
547 const __m128i vq31prod4567 = _mm_shuffle_epi32(vq31prod4657, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
549 … const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
550 const __m128i vrem0123 = in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
552 const __m128i vrem4567 = in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
555 …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh… in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
556 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
562 …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
563__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
565 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
566 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
569 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()
572 _mm_storel_epi64((__m128i*) output, vout0123456701234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__ssse3_mul16()