Lines Matching refs:__m128i

83       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);  in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
84 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 4 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
85 __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
86 __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 12 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
89 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
90 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
91 …const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
92 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
93 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
94 const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(vi0x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
95 …const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
96 const __m128i vxk0x89ABCDEF = _mm_cvtepi8_epi16(vk0x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
100 const __m128i vp0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
101 const __m128i vp0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
102 const __m128i vp0x89ABCDEFlo = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
103 const __m128i vp0x89ABCDEFhi = _mm_mulhi_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
110 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
111 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
112 …const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
113 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
114 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
115 const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(vi1x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
116 …const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
117 const __m128i vxk1x89ABCDEF = _mm_cvtepi8_epi16(vk1x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
121 const __m128i vp1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
122 const __m128i vp1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
123 const __m128i vp1x89ABCDEFlo = _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
124 const __m128i vp1x89ABCDEFhi = _mm_mulhi_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
131 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
132 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
133 …const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
134 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
135 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
136 const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(vi2x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
137 …const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
138 const __m128i vxk2x89ABCDEF = _mm_cvtepi8_epi16(vk2x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
142 const __m128i vp2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
143 const __m128i vp2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
144 const __m128i vp2x89ABCDEFlo = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
145 const __m128i vp2x89ABCDEFhi = _mm_mulhi_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
152 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
153 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
154 …const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
155 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
156 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
157 const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(vi3x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
158 …const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
159 const __m128i vxk3x89ABCDEF = _mm_cvtepi8_epi16(vk3x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
163 const __m128i vp3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
164 const __m128i vp3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
165 const __m128i vp3x89ABCDEFlo = _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
166 const __m128i vp3x89ABCDEFhi = _mm_mulhi_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
173 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
174 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
175 …const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
176 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
177 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
178 const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(vi4x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
179 …const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
180 const __m128i vxk4x89ABCDEF = _mm_cvtepi8_epi16(vk4x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
184 const __m128i vp4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
185 const __m128i vp4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
186 const __m128i vp4x89ABCDEFlo = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
187 const __m128i vp4x89ABCDEFhi = _mm_mulhi_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
194 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
195 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
196 …const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
197 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
198 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
199 const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(vi5x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
200 …const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
201 const __m128i vxk5x89ABCDEF = _mm_cvtepi8_epi16(vk5x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
205 const __m128i vp5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
206 const __m128i vp5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
207 const __m128i vp5x89ABCDEFlo = _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
208 const __m128i vp5x89ABCDEFhi = _mm_mulhi_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
215 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
216 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
217 …const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
218 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
219 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
220 const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(vi6x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
221 …const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
222 const __m128i vxk6x89ABCDEF = _mm_cvtepi8_epi16(vk6x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
226 const __m128i vp6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
227 const __m128i vp6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
228 const __m128i vp6x89ABCDEFlo = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
229 const __m128i vp6x89ABCDEFhi = _mm_mulhi_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
236 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
237 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
238 …const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
239 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
240 const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
241 const __m128i vxi7x89ABCDEF = _mm_cvtepi8_epi16(vi7x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
242 …const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
243 const __m128i vxk7x89ABCDEF = _mm_cvtepi8_epi16(vk7x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
247 const __m128i vp7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
248 const __m128i vp7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
249 const __m128i vp7x89ABCDEFlo = _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
250 const __m128i vp7x89ABCDEFhi = _mm_mulhi_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
257 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
258 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
259 …const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
260 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
261 const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
262 const __m128i vxi8x89ABCDEF = _mm_cvtepi8_epi16(vi8x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
263 …const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
264 const __m128i vxk8x89ABCDEF = _mm_cvtepi8_epi16(vk8x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
268 const __m128i vp8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
269 const __m128i vp8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
270 const __m128i vp8x89ABCDEFlo = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
271 const __m128i vp8x89ABCDEFhi = _mm_mulhi_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
280 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
281 const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
283 const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
284 const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
285 const __m128i vprod13 = _mm_add_epi64(_mm_mul_epi32(vacc13, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
286 const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
287 const __m128i vprod46 = _mm_add_epi64(_mm_mul_epi32(vacc4567, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
288 const __m128i vprod57 = _mm_add_epi64(_mm_mul_epi32(vacc57, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
289 const __m128i vacc9B = _mm_shuffle_epi32(vacc89AB, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
290 const __m128i vprod8A = _mm_add_epi64(_mm_mul_epi32(vacc89AB, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
291 const __m128i vprod9B = _mm_add_epi64(_mm_mul_epi32(vacc9B, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
292 const __m128i vaccDF = _mm_shuffle_epi32(vaccCDEF, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
293 const __m128i vprodCE = _mm_add_epi64(_mm_mul_epi32(vaccCDEF, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
294 const __m128i vprodDF = _mm_add_epi64(_mm_mul_epi32(vaccDF, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
296 const __m128i vq31prod02 = _mm_srli_epi64(vprod02, 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
297 const __m128i vq31prod13 = _mm_add_epi64(vprod13, vprod13); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
298 const __m128i vq31prod46 = _mm_srli_epi64(vprod46, 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
299 const __m128i vq31prod57 = _mm_add_epi64(vprod57, vprod57); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
300 const __m128i vq31prod8A = _mm_srli_epi64(vprod8A, 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
301 const __m128i vq31prod9B = _mm_add_epi64(vprod9B, vprod9B); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
302 const __m128i vq31prodCE = _mm_srli_epi64(vprodCE, 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
303 const __m128i vq31prodDF = _mm_add_epi64(vprodDF, vprodDF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
305 const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
306 const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
307 const __m128i vq31prod89AB = _mm_blend_epi16(vq31prod8A, vq31prod9B, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
308 const __m128i vq31prodCDEF = _mm_blend_epi16(vq31prodCE, vq31prodDF, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
310 const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
311 const __m128i vrem0123 = in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
313 const __m128i vrem4567 = in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
315 const __m128i vrem89AB = in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
317 const __m128i vremCDEF = in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
320 …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
321 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
331 …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
332__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
333__m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
335 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
336 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
340 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
342 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
348 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
349 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 4 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
352 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
353 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
354 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
355 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
359 const __m128i vp0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
360 const __m128i vp0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
365 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
366 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
367 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
368 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
372 const __m128i vp1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
373 const __m128i vp1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
378 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
379 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
380 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
381 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
385 const __m128i vp2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
386 const __m128i vp2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
391 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
392 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
393 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
394 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
398 const __m128i vp3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
399 const __m128i vp3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
404 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
405 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
406 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
407 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
411 const __m128i vp4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
412 const __m128i vp4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
417 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
418 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
419 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
420 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
424 const __m128i vp5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
425 const __m128i vp5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
430 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
431 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
432 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
433 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
437 const __m128i vp6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
438 const __m128i vp6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
443 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
444 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
445 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
446 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
450 const __m128i vp7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
451 const __m128i vp7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
456 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
457 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
458 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
459 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
463 const __m128i vp8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
464 const __m128i vp8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
472 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
473 const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
475 const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
476 const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
478 const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
479 const __m128i vprod46 = _mm_add_epi64(_mm_mul_epi32(vacc4567, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
481 const __m128i vprod13 = _mm_add_epi64(_mm_mul_epi32(vacc13, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
482 const __m128i vprod57 = _mm_add_epi64(_mm_mul_epi32(vacc57, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
484 const __m128i vq31prod02 = _mm_srli_epi64(vprod02, 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
485 const __m128i vq31prod13 = _mm_add_epi64(vprod13, vprod13); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
486 const __m128i vq31prod46 = _mm_srli_epi64(vprod46, 31); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
487 const __m128i vq31prod57 = _mm_add_epi64(vprod57, vprod57); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
489 const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
490 const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
492 … const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
493 const __m128i vrem0123 = in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
495 const __m128i vrem4567 = in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
498 …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh… in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
499 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
505 …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
506__m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
508 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
509 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
512 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()
515 _mm_storel_epi64((__m128i*) output, vout0123456701234567); in xnn_qs8_dwconv_minmax_ukernel_up16x9__sse41_mul16()