Lines Matching refs:__m128i
83 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
84 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 4 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
85 __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
86 __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 12 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
87 __m128i vaccGHIJ = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
88 __m128i vaccKLMN = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 20 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
91 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
92 …const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
93 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
94 …const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
95 const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
96 …const __m128i vk0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
99 …const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
100 …const __m128i vxk0x01234567 = _mm_unpacklo_epi8(vk0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
101 …const __m128i vxi0x89ABCDEF = _mm_unpacklo_epi8(vi0x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
102 …const __m128i vxk0x89ABCDEF = _mm_unpacklo_epi8(vk0x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
103 …const __m128i vxi0xGHIJKLMN = _mm_unpacklo_epi8(vi0xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
104 …const __m128i vxk0xGHIJKLMN = _mm_unpacklo_epi8(vk0xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
106 const __m128i vp0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
107 const __m128i vp0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
108 const __m128i vp0x89ABCDEFlo = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
109 const __m128i vp0x89ABCDEFhi = _mm_mulhi_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
110 const __m128i vp0xGHIJKLMNlo = _mm_mullo_epi16(vxi0xGHIJKLMN, vxk0xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
111 const __m128i vp0xGHIJKLMNhi = _mm_mulhi_epi16(vxi0xGHIJKLMN, vxk0xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
120 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
121 …const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
122 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
123 …const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
124 const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
125 …const __m128i vk1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
128 …const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
129 …const __m128i vxk1x01234567 = _mm_unpacklo_epi8(vk1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
130 …const __m128i vxi1x89ABCDEF = _mm_unpacklo_epi8(vi1x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
131 …const __m128i vxk1x89ABCDEF = _mm_unpacklo_epi8(vk1x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
132 …const __m128i vxi1xGHIJKLMN = _mm_unpacklo_epi8(vi1xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
133 …const __m128i vxk1xGHIJKLMN = _mm_unpacklo_epi8(vk1xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
135 const __m128i vp1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
136 const __m128i vp1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
137 const __m128i vp1x89ABCDEFlo = _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
138 const __m128i vp1x89ABCDEFhi = _mm_mulhi_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
139 const __m128i vp1xGHIJKLMNlo = _mm_mullo_epi16(vxi1xGHIJKLMN, vxk1xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
140 const __m128i vp1xGHIJKLMNhi = _mm_mulhi_epi16(vxi1xGHIJKLMN, vxk1xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
149 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
150 …const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
151 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
152 …const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
153 const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
154 …const __m128i vk2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
157 …const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
158 …const __m128i vxk2x01234567 = _mm_unpacklo_epi8(vk2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
159 …const __m128i vxi2x89ABCDEF = _mm_unpacklo_epi8(vi2x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
160 …const __m128i vxk2x89ABCDEF = _mm_unpacklo_epi8(vk2x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
161 …const __m128i vxi2xGHIJKLMN = _mm_unpacklo_epi8(vi2xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
162 …const __m128i vxk2xGHIJKLMN = _mm_unpacklo_epi8(vk2xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
164 const __m128i vp2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
165 const __m128i vp2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
166 const __m128i vp2x89ABCDEFlo = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
167 const __m128i vp2x89ABCDEFhi = _mm_mulhi_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
168 const __m128i vp2xGHIJKLMNlo = _mm_mullo_epi16(vxi2xGHIJKLMN, vxk2xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
169 const __m128i vp2xGHIJKLMNhi = _mm_mulhi_epi16(vxi2xGHIJKLMN, vxk2xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
178 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
179 …const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
180 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
181 …const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
182 const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
183 …const __m128i vk3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
186 …const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
187 …const __m128i vxk3x01234567 = _mm_unpacklo_epi8(vk3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
188 …const __m128i vxi3x89ABCDEF = _mm_unpacklo_epi8(vi3x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
189 …const __m128i vxk3x89ABCDEF = _mm_unpacklo_epi8(vk3x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
190 …const __m128i vxi3xGHIJKLMN = _mm_unpacklo_epi8(vi3xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
191 …const __m128i vxk3xGHIJKLMN = _mm_unpacklo_epi8(vk3xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
193 const __m128i vp3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
194 const __m128i vp3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
195 const __m128i vp3x89ABCDEFlo = _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
196 const __m128i vp3x89ABCDEFhi = _mm_mulhi_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
197 const __m128i vp3xGHIJKLMNlo = _mm_mullo_epi16(vxi3xGHIJKLMN, vxk3xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
198 const __m128i vp3xGHIJKLMNhi = _mm_mulhi_epi16(vxi3xGHIJKLMN, vxk3xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
207 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
208 …const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
209 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
210 …const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
211 const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
212 …const __m128i vk4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
215 …const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
216 …const __m128i vxk4x01234567 = _mm_unpacklo_epi8(vk4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
217 …const __m128i vxi4x89ABCDEF = _mm_unpacklo_epi8(vi4x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
218 …const __m128i vxk4x89ABCDEF = _mm_unpacklo_epi8(vk4x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
219 …const __m128i vxi4xGHIJKLMN = _mm_unpacklo_epi8(vi4xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
220 …const __m128i vxk4xGHIJKLMN = _mm_unpacklo_epi8(vk4xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
222 const __m128i vp4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
223 const __m128i vp4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
224 const __m128i vp4x89ABCDEFlo = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
225 const __m128i vp4x89ABCDEFhi = _mm_mulhi_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
226 const __m128i vp4xGHIJKLMNlo = _mm_mullo_epi16(vxi4xGHIJKLMN, vxk4xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
227 const __m128i vp4xGHIJKLMNhi = _mm_mulhi_epi16(vxi4xGHIJKLMN, vxk4xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
236 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
237 …const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
238 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
239 …const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
240 const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
241 …const __m128i vk5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
244 …const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
245 …const __m128i vxk5x01234567 = _mm_unpacklo_epi8(vk5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
246 …const __m128i vxi5x89ABCDEF = _mm_unpacklo_epi8(vi5x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
247 …const __m128i vxk5x89ABCDEF = _mm_unpacklo_epi8(vk5x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
248 …const __m128i vxi5xGHIJKLMN = _mm_unpacklo_epi8(vi5xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
249 …const __m128i vxk5xGHIJKLMN = _mm_unpacklo_epi8(vk5xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
251 const __m128i vp5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
252 const __m128i vp5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
253 const __m128i vp5x89ABCDEFlo = _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
254 const __m128i vp5x89ABCDEFhi = _mm_mulhi_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
255 const __m128i vp5xGHIJKLMNlo = _mm_mullo_epi16(vxi5xGHIJKLMN, vxk5xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
256 const __m128i vp5xGHIJKLMNhi = _mm_mulhi_epi16(vxi5xGHIJKLMN, vxk5xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
265 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
266 …const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
267 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
268 …const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
269 const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
270 …const __m128i vk6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
273 …const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
274 …const __m128i vxk6x01234567 = _mm_unpacklo_epi8(vk6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
275 …const __m128i vxi6x89ABCDEF = _mm_unpacklo_epi8(vi6x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
276 …const __m128i vxk6x89ABCDEF = _mm_unpacklo_epi8(vk6x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
277 …const __m128i vxi6xGHIJKLMN = _mm_unpacklo_epi8(vi6xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
278 …const __m128i vxk6xGHIJKLMN = _mm_unpacklo_epi8(vk6xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
280 const __m128i vp6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
281 const __m128i vp6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
282 const __m128i vp6x89ABCDEFlo = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
283 const __m128i vp6x89ABCDEFhi = _mm_mulhi_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
284 const __m128i vp6xGHIJKLMNlo = _mm_mullo_epi16(vxi6xGHIJKLMN, vxk6xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
285 const __m128i vp6xGHIJKLMNhi = _mm_mulhi_epi16(vxi6xGHIJKLMN, vxk6xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
294 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
295 …const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
296 const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
297 …const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
298 const __m128i vi7xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i7 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
299 …const __m128i vk7xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
302 …const __m128i vxi7x01234567 = _mm_unpacklo_epi8(vi7x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
303 …const __m128i vxk7x01234567 = _mm_unpacklo_epi8(vk7x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
304 …const __m128i vxi7x89ABCDEF = _mm_unpacklo_epi8(vi7x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
305 …const __m128i vxk7x89ABCDEF = _mm_unpacklo_epi8(vk7x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
306 …const __m128i vxi7xGHIJKLMN = _mm_unpacklo_epi8(vi7xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
307 …const __m128i vxk7xGHIJKLMN = _mm_unpacklo_epi8(vk7xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
309 const __m128i vp7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
310 const __m128i vp7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
311 const __m128i vp7x89ABCDEFlo = _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
312 const __m128i vp7x89ABCDEFhi = _mm_mulhi_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
313 const __m128i vp7xGHIJKLMNlo = _mm_mullo_epi16(vxi7xGHIJKLMN, vxk7xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
314 const __m128i vp7xGHIJKLMNhi = _mm_mulhi_epi16(vxi7xGHIJKLMN, vxk7xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
323 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
324 …const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
325 const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
326 …const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
327 const __m128i vi8xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i8 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
328 …const __m128i vk8xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
331 …const __m128i vxi8x01234567 = _mm_unpacklo_epi8(vi8x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
332 …const __m128i vxk8x01234567 = _mm_unpacklo_epi8(vk8x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
333 …const __m128i vxi8x89ABCDEF = _mm_unpacklo_epi8(vi8x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
334 …const __m128i vxk8x89ABCDEF = _mm_unpacklo_epi8(vk8x89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
335 …const __m128i vxi8xGHIJKLMN = _mm_unpacklo_epi8(vi8xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
336 …const __m128i vxk8xGHIJKLMN = _mm_unpacklo_epi8(vk8xGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
338 const __m128i vp8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
339 const __m128i vp8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
340 const __m128i vp8x89ABCDEFlo = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
341 const __m128i vp8x89ABCDEFhi = _mm_mulhi_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
342 const __m128i vp8xGHIJKLMNlo = _mm_mullo_epi16(vxi8xGHIJKLMN, vxk8xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
343 const __m128i vp8xGHIJKLMNhi = _mm_mulhi_epi16(vxi8xGHIJKLMN, vxk8xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
354 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
355 const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
357 const __m128i vnmask0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
358 const __m128i vnmask4567 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
359 const __m128i vnmask89AB = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc89AB); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
360 const __m128i vnmaskCDEF = _mm_cmpgt_epi32(_mm_setzero_si128(), vaccCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
361 const __m128i vnmaskGHIJ = _mm_cmpgt_epi32(_mm_setzero_si128(), vaccGHIJ); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
362 const __m128i vnmaskKLMN = _mm_cmpgt_epi32(_mm_setzero_si128(), vaccKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
364 const __m128i vabsacc0123 = _mm_sub_epi32(_mm_xor_si128(vacc0123, vnmask0123), vnmask0123); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
365 const __m128i vabsacc4567 = _mm_sub_epi32(_mm_xor_si128(vacc4567, vnmask4567), vnmask4567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
366 const __m128i vabsacc89AB = _mm_sub_epi32(_mm_xor_si128(vacc89AB, vnmask89AB), vnmask89AB); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
367 const __m128i vabsaccCDEF = _mm_sub_epi32(_mm_xor_si128(vaccCDEF, vnmaskCDEF), vnmaskCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
368 const __m128i vabsaccGHIJ = _mm_sub_epi32(_mm_xor_si128(vaccGHIJ, vnmaskGHIJ), vnmaskGHIJ); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
369 const __m128i vabsaccKLMN = _mm_sub_epi32(_mm_xor_si128(vaccKLMN, vnmaskKLMN), vnmaskKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
371 const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
372 const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
373 const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
374 const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
375 const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
376 const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
377 const __m128i vabsacc9B = _mm_shuffle_epi32(vabsacc89AB, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
378 const __m128i vabsprod8A = _mm_mul_epu32(vabsacc89AB, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
379 const __m128i vabsprod9B = _mm_mul_epu32(vabsacc9B, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
380 const __m128i vabsaccDF = _mm_shuffle_epi32(vabsaccCDEF, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
381 const __m128i vabsprodCE = _mm_mul_epu32(vabsaccCDEF, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
382 const __m128i vabsprodDF = _mm_mul_epu32(vabsaccDF, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
383 const __m128i vabsaccHJ = _mm_shuffle_epi32(vabsaccGHIJ, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
384 const __m128i vabsprodGI = _mm_mul_epu32(vabsaccGHIJ, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
385 const __m128i vabsprodHJ = _mm_mul_epu32(vabsaccHJ, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
386 const __m128i vabsaccLN = _mm_shuffle_epi32(vabsaccKLMN, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
387 const __m128i vabsprodKM = _mm_mul_epu32(vabsaccKLMN, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
388 const __m128i vabsprodLN = _mm_mul_epu32(vabsaccLN, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
390 const __m128i vnmask02 = _mm_shuffle_epi32(vnmask0123, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
391 const __m128i vnmask13 = _mm_shuffle_epi32(vnmask0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
392 const __m128i vnmask46 = _mm_shuffle_epi32(vnmask4567, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
393 const __m128i vnmask57 = _mm_shuffle_epi32(vnmask4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
394 const __m128i vnmask8A = _mm_shuffle_epi32(vnmask89AB, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
395 const __m128i vnmask9B = _mm_shuffle_epi32(vnmask89AB, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
396 const __m128i vnmaskCE = _mm_shuffle_epi32(vnmaskCDEF, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
397 const __m128i vnmaskDF = _mm_shuffle_epi32(vnmaskCDEF, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
398 const __m128i vnmaskGI = _mm_shuffle_epi32(vnmaskGHIJ, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
399 const __m128i vnmaskHJ = _mm_shuffle_epi32(vnmaskGHIJ, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
400 const __m128i vnmaskKM = _mm_shuffle_epi32(vnmaskKLMN, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
401 const __m128i vnmaskLN = _mm_shuffle_epi32(vnmaskKLMN, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
403 const __m128i vprod02 = _mm_sub_epi64(_mm_xor_si128(vabsprod02, vnmask02), vnmask02); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
404 const __m128i vprod13 = _mm_sub_epi64(_mm_xor_si128(vabsprod13, vnmask13), vnmask13); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
405 const __m128i vprod46 = _mm_sub_epi64(_mm_xor_si128(vabsprod46, vnmask46), vnmask46); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
406 const __m128i vprod57 = _mm_sub_epi64(_mm_xor_si128(vabsprod57, vnmask57), vnmask57); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
407 const __m128i vprod8A = _mm_sub_epi64(_mm_xor_si128(vabsprod8A, vnmask8A), vnmask8A); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
408 const __m128i vprod9B = _mm_sub_epi64(_mm_xor_si128(vabsprod9B, vnmask9B), vnmask9B); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
409 const __m128i vprodCE = _mm_sub_epi64(_mm_xor_si128(vabsprodCE, vnmaskCE), vnmaskCE); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
410 const __m128i vprodDF = _mm_sub_epi64(_mm_xor_si128(vabsprodDF, vnmaskDF), vnmaskDF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
411 const __m128i vprodGI = _mm_sub_epi64(_mm_xor_si128(vabsprodGI, vnmaskGI), vnmaskGI); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
412 const __m128i vprodHJ = _mm_sub_epi64(_mm_xor_si128(vabsprodHJ, vnmaskHJ), vnmaskHJ); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
413 const __m128i vprodKM = _mm_sub_epi64(_mm_xor_si128(vabsprodKM, vnmaskKM), vnmaskKM); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
414 const __m128i vprodLN = _mm_sub_epi64(_mm_xor_si128(vabsprodLN, vnmaskLN), vnmaskLN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
416 const __m128i vq31prod02 = _mm_srli_epi64(_mm_add_epi64(vprod02, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
417 const __m128i vq31prod13 = _mm_srli_epi64(_mm_add_epi64(vprod13, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
418 const __m128i vq31prod46 = _mm_srli_epi64(_mm_add_epi64(vprod46, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
419 const __m128i vq31prod57 = _mm_srli_epi64(_mm_add_epi64(vprod57, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
420 const __m128i vq31prod8A = _mm_srli_epi64(_mm_add_epi64(vprod8A, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
421 const __m128i vq31prod9B = _mm_srli_epi64(_mm_add_epi64(vprod9B, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
422 const __m128i vq31prodCE = _mm_srli_epi64(_mm_add_epi64(vprodCE, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
423 const __m128i vq31prodDF = _mm_srli_epi64(_mm_add_epi64(vprodDF, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
424 const __m128i vq31prodGI = _mm_srli_epi64(_mm_add_epi64(vprodGI, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
425 const __m128i vq31prodHJ = _mm_srli_epi64(_mm_add_epi64(vprodHJ, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
426 const __m128i vq31prodKM = _mm_srli_epi64(_mm_add_epi64(vprodKM, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
427 const __m128i vq31prodLN = _mm_srli_epi64(_mm_add_epi64(vprodLN, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
429 const __m128i vq31prod0213 = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
431 const __m128i vq31prod4657 = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
433 const __m128i vq31prod8A9B = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
435 const __m128i vq31prodCEDF = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
437 const __m128i vq31prodGIHJ = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
439 const __m128i vq31prodKMLN = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
442 const __m128i vq31prod0123 = _mm_shuffle_epi32(vq31prod0213, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
443 const __m128i vq31prod4567 = _mm_shuffle_epi32(vq31prod4657, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
444 const __m128i vq31prod89AB = _mm_shuffle_epi32(vq31prod8A9B, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
445 const __m128i vq31prodCDEF = _mm_shuffle_epi32(vq31prodCEDF, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
446 const __m128i vq31prodGHIJ = _mm_shuffle_epi32(vq31prodGIHJ, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
447 const __m128i vq31prodKLMN = _mm_shuffle_epi32(vq31prodKMLN, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
449 const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
450 const __m128i vrem0123 = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
452 const __m128i vrem4567 = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
454 const __m128i vrem89AB = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
456 const __m128i vremCDEF = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
458 const __m128i vremGHIJ = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
460 const __m128i vremKLMN = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
463 …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
464 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
478 …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
479 … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
480 … __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
481 … __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
483 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
484 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
489 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
490 __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
492 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
493 _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
499 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
500 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 4 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
503 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
504 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
507 …const __m128i vxi0x01234567 = _mm_unpacklo_epi8(vi0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
508 …const __m128i vxk0x01234567 = _mm_unpacklo_epi8(vk0x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
510 const __m128i vp0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
511 const __m128i vp0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
516 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
517 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 24)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
520 …const __m128i vxi1x01234567 = _mm_unpacklo_epi8(vi1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
521 …const __m128i vxk1x01234567 = _mm_unpacklo_epi8(vk1x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
523 const __m128i vp1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
524 const __m128i vp1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
529 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
530 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
533 …const __m128i vxi2x01234567 = _mm_unpacklo_epi8(vi2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
534 …const __m128i vxk2x01234567 = _mm_unpacklo_epi8(vk2x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
536 const __m128i vp2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
537 const __m128i vp2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
542 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
543 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 72)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
546 …const __m128i vxi3x01234567 = _mm_unpacklo_epi8(vi3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
547 …const __m128i vxk3x01234567 = _mm_unpacklo_epi8(vk3x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
549 const __m128i vp3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
550 const __m128i vp3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
555 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
556 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
559 …const __m128i vxi4x01234567 = _mm_unpacklo_epi8(vi4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
560 …const __m128i vxk4x01234567 = _mm_unpacklo_epi8(vk4x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
562 const __m128i vp4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
563 const __m128i vp4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
568 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
569 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 120)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
572 …const __m128i vxi5x01234567 = _mm_unpacklo_epi8(vi5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
573 …const __m128i vxk5x01234567 = _mm_unpacklo_epi8(vk5x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
575 const __m128i vp5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
576 const __m128i vp5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
581 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
582 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 144)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
585 …const __m128i vxi6x01234567 = _mm_unpacklo_epi8(vi6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
586 …const __m128i vxk6x01234567 = _mm_unpacklo_epi8(vk6x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
588 const __m128i vp6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
589 const __m128i vp6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
594 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
595 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 168)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
598 …const __m128i vxi7x01234567 = _mm_unpacklo_epi8(vi7x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
599 …const __m128i vxk7x01234567 = _mm_unpacklo_epi8(vk7x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
601 const __m128i vp7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
602 const __m128i vp7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
607 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
608 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 192)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
611 …const __m128i vxi8x01234567 = _mm_unpacklo_epi8(vi8x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
612 …const __m128i vxk8x01234567 = _mm_unpacklo_epi8(vk8x01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), … in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
614 const __m128i vp8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
615 const __m128i vp8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
623 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
624 const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
626 const __m128i vnmask0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
627 const __m128i vnmask4567 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
629 const __m128i vabsacc0123 = _mm_sub_epi32(_mm_xor_si128(vacc0123, vnmask0123), vnmask0123); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
630 const __m128i vabsacc4567 = _mm_sub_epi32(_mm_xor_si128(vacc4567, vnmask4567), vnmask4567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
632 const __m128i vabsacc13 = _mm_shuffle_epi32(vabsacc0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
633 const __m128i vabsacc57 = _mm_shuffle_epi32(vabsacc4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
635 const __m128i vabsprod02 = _mm_mul_epu32(vabsacc0123, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
636 const __m128i vabsprod13 = _mm_mul_epu32(vabsacc13, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
637 const __m128i vabsprod46 = _mm_mul_epu32(vabsacc4567, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
638 const __m128i vabsprod57 = _mm_mul_epu32(vabsacc57, vmultiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
640 const __m128i vnmask02 = _mm_shuffle_epi32(vnmask0123, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
641 const __m128i vnmask13 = _mm_shuffle_epi32(vnmask0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
642 const __m128i vnmask46 = _mm_shuffle_epi32(vnmask4567, _MM_SHUFFLE(2, 2, 0, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
643 const __m128i vnmask57 = _mm_shuffle_epi32(vnmask4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
645 const __m128i vprod02 = _mm_sub_epi64(_mm_xor_si128(vabsprod02, vnmask02), vnmask02); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
646 const __m128i vprod13 = _mm_sub_epi64(_mm_xor_si128(vabsprod13, vnmask13), vnmask13); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
647 const __m128i vprod46 = _mm_sub_epi64(_mm_xor_si128(vabsprod46, vnmask46), vnmask46); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
648 const __m128i vprod57 = _mm_sub_epi64(_mm_xor_si128(vabsprod57, vnmask57), vnmask57); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
650 const __m128i vq31prod02 = _mm_srli_epi64(_mm_add_epi64(vprod02, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
651 const __m128i vq31prod13 = _mm_srli_epi64(_mm_add_epi64(vprod13, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
652 const __m128i vq31prod46 = _mm_srli_epi64(_mm_add_epi64(vprod46, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
653 const __m128i vq31prod57 = _mm_srli_epi64(_mm_add_epi64(vprod57, vrounding), 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
655 const __m128i vq31prod0213 = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
657 const __m128i vq31prod4657 = _mm_castps_si128(_mm_shuffle_ps( in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
660 const __m128i vq31prod0123 = _mm_shuffle_epi32(vq31prod0213, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
661 const __m128i vq31prod4567 = _mm_shuffle_epi32(vq31prod4657, _MM_SHUFFLE(3, 1, 2, 0)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
663 … const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
664 const __m128i vrem0123 = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
666 const __m128i vrem4567 = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
669 …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
670 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
676 …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
677 … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
679 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
680 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
683 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()
686 _mm_storel_epi64((__m128i*) output, vout0123456701234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse2_mul16()