Lines Matching refs:__m128i
83 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
84 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 4 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
85 __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 8 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
86 __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 12 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
87 __m128i vaccGHIJ = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
88 __m128i vaccKLMN = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 20 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
91 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
92 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
93 …const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
94 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
95 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
96 const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(vi0x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
97 …const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
98 const __m128i vxk0x89ABCDEF = _mm_cvtepi8_epi16(vk0x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
99 const __m128i vi0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i0 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
100 const __m128i vxi0xGHIJKLMN = _mm_cvtepi8_epi16(vi0xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
101 …const __m128i vk0xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
102 const __m128i vxk0xGHIJKLMN = _mm_cvtepi8_epi16(vk0xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
106 const __m128i vp0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
107 const __m128i vp0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
108 const __m128i vp0x89ABCDEFlo = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
109 const __m128i vp0x89ABCDEFhi = _mm_mulhi_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
110 const __m128i vp0xGHIJKLMNlo = _mm_mullo_epi16(vxi0xGHIJKLMN, vxk0xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
111 const __m128i vp0xGHIJKLMNhi = _mm_mulhi_epi16(vxi0xGHIJKLMN, vxk0xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
120 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
121 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
122 …const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
123 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
124 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
125 const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(vi1x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
126 …const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
127 const __m128i vxk1x89ABCDEF = _mm_cvtepi8_epi16(vk1x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
128 const __m128i vi1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i1 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
129 const __m128i vxi1xGHIJKLMN = _mm_cvtepi8_epi16(vi1xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
130 …const __m128i vk1xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
131 const __m128i vxk1xGHIJKLMN = _mm_cvtepi8_epi16(vk1xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
135 const __m128i vp1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
136 const __m128i vp1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
137 const __m128i vp1x89ABCDEFlo = _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
138 const __m128i vp1x89ABCDEFhi = _mm_mulhi_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
139 const __m128i vp1xGHIJKLMNlo = _mm_mullo_epi16(vxi1xGHIJKLMN, vxk1xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
140 const __m128i vp1xGHIJKLMNhi = _mm_mulhi_epi16(vxi1xGHIJKLMN, vxk1xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
149 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
150 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
151 …const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
152 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
153 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
154 const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(vi2x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
155 …const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
156 const __m128i vxk2x89ABCDEF = _mm_cvtepi8_epi16(vk2x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
157 const __m128i vi2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i2 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
158 const __m128i vxi2xGHIJKLMN = _mm_cvtepi8_epi16(vi2xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
159 …const __m128i vk2xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
160 const __m128i vxk2xGHIJKLMN = _mm_cvtepi8_epi16(vk2xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
164 const __m128i vp2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
165 const __m128i vp2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
166 const __m128i vp2x89ABCDEFlo = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
167 const __m128i vp2x89ABCDEFhi = _mm_mulhi_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
168 const __m128i vp2xGHIJKLMNlo = _mm_mullo_epi16(vxi2xGHIJKLMN, vxk2xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
169 const __m128i vp2xGHIJKLMNhi = _mm_mulhi_epi16(vxi2xGHIJKLMN, vxk2xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
178 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
179 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
180 …const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
181 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
182 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
183 const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(vi3x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
184 …const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
185 const __m128i vxk3x89ABCDEF = _mm_cvtepi8_epi16(vk3x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
186 const __m128i vi3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i3 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
187 const __m128i vxi3xGHIJKLMN = _mm_cvtepi8_epi16(vi3xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
188 …const __m128i vk3xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
189 const __m128i vxk3xGHIJKLMN = _mm_cvtepi8_epi16(vk3xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
193 const __m128i vp3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
194 const __m128i vp3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
195 const __m128i vp3x89ABCDEFlo = _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
196 const __m128i vp3x89ABCDEFhi = _mm_mulhi_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
197 const __m128i vp3xGHIJKLMNlo = _mm_mullo_epi16(vxi3xGHIJKLMN, vxk3xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
198 const __m128i vp3xGHIJKLMNhi = _mm_mulhi_epi16(vxi3xGHIJKLMN, vxk3xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
207 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
208 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
209 …const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
210 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
211 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
212 const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(vi4x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
213 …const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
214 const __m128i vxk4x89ABCDEF = _mm_cvtepi8_epi16(vk4x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
215 const __m128i vi4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i4 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
216 const __m128i vxi4xGHIJKLMN = _mm_cvtepi8_epi16(vi4xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
217 …const __m128i vk4xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
218 const __m128i vxk4xGHIJKLMN = _mm_cvtepi8_epi16(vk4xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
222 const __m128i vp4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
223 const __m128i vp4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
224 const __m128i vp4x89ABCDEFlo = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
225 const __m128i vp4x89ABCDEFhi = _mm_mulhi_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
226 const __m128i vp4xGHIJKLMNlo = _mm_mullo_epi16(vxi4xGHIJKLMN, vxk4xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
227 const __m128i vp4xGHIJKLMNhi = _mm_mulhi_epi16(vxi4xGHIJKLMN, vxk4xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
236 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
237 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
238 …const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
239 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
240 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
241 const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(vi5x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
242 …const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
243 const __m128i vxk5x89ABCDEF = _mm_cvtepi8_epi16(vk5x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
244 const __m128i vi5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i5 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
245 const __m128i vxi5xGHIJKLMN = _mm_cvtepi8_epi16(vi5xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
246 …const __m128i vk5xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
247 const __m128i vxk5xGHIJKLMN = _mm_cvtepi8_epi16(vk5xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
251 const __m128i vp5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
252 const __m128i vp5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
253 const __m128i vp5x89ABCDEFlo = _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
254 const __m128i vp5x89ABCDEFhi = _mm_mulhi_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
255 const __m128i vp5xGHIJKLMNlo = _mm_mullo_epi16(vxi5xGHIJKLMN, vxk5xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
256 const __m128i vp5xGHIJKLMNhi = _mm_mulhi_epi16(vxi5xGHIJKLMN, vxk5xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
265 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
266 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
267 …const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
268 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
269 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
270 const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(vi6x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
271 …const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
272 const __m128i vxk6x89ABCDEF = _mm_cvtepi8_epi16(vk6x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
273 const __m128i vi6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i6 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
274 const __m128i vxi6xGHIJKLMN = _mm_cvtepi8_epi16(vi6xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
275 …const __m128i vk6xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
276 const __m128i vxk6xGHIJKLMN = _mm_cvtepi8_epi16(vk6xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
280 const __m128i vp6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
281 const __m128i vp6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
282 const __m128i vp6x89ABCDEFlo = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
283 const __m128i vp6x89ABCDEFhi = _mm_mulhi_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
284 const __m128i vp6xGHIJKLMNlo = _mm_mullo_epi16(vxi6xGHIJKLMN, vxk6xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
285 const __m128i vp6xGHIJKLMNhi = _mm_mulhi_epi16(vxi6xGHIJKLMN, vxk6xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
294 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
295 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
296 …const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
297 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
298 const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
299 const __m128i vxi7x89ABCDEF = _mm_cvtepi8_epi16(vi7x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
300 …const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
301 const __m128i vxk7x89ABCDEF = _mm_cvtepi8_epi16(vk7x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
302 const __m128i vi7xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i7 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
303 const __m128i vxi7xGHIJKLMN = _mm_cvtepi8_epi16(vi7xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
304 …const __m128i vk7xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
305 const __m128i vxk7xGHIJKLMN = _mm_cvtepi8_epi16(vk7xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
309 const __m128i vp7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
310 const __m128i vp7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
311 const __m128i vp7x89ABCDEFlo = _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
312 const __m128i vp7x89ABCDEFhi = _mm_mulhi_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
313 const __m128i vp7xGHIJKLMNlo = _mm_mullo_epi16(vxi7xGHIJKLMN, vxk7xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
314 const __m128i vp7xGHIJKLMNhi = _mm_mulhi_epi16(vxi7xGHIJKLMN, vxk7xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
323 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
324 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
325 …const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
326 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
327 const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
328 const __m128i vxi8x89ABCDEF = _mm_cvtepi8_epi16(vi8x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
329 …const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
330 const __m128i vxk8x89ABCDEF = _mm_cvtepi8_epi16(vk8x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
331 const __m128i vi8xGHIJKLMN = _mm_loadl_epi64((const __m128i*) (i8 + 16)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
332 const __m128i vxi8xGHIJKLMN = _mm_cvtepi8_epi16(vi8xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
333 …const __m128i vk8xGHIJKLMN = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int32_t… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
334 const __m128i vxk8xGHIJKLMN = _mm_cvtepi8_epi16(vk8xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
338 const __m128i vp8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
339 const __m128i vp8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
340 const __m128i vp8x89ABCDEFlo = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
341 const __m128i vp8x89ABCDEFhi = _mm_mulhi_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
342 const __m128i vp8xGHIJKLMNlo = _mm_mullo_epi16(vxi8xGHIJKLMN, vxk8xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
343 const __m128i vp8xGHIJKLMNhi = _mm_mulhi_epi16(vxi8xGHIJKLMN, vxk8xGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
354 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
355 const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
357 const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
358 const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
359 const __m128i vprod13 = _mm_add_epi64(_mm_mul_epi32(vacc13, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
360 const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
361 const __m128i vprod46 = _mm_add_epi64(_mm_mul_epi32(vacc4567, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
362 const __m128i vprod57 = _mm_add_epi64(_mm_mul_epi32(vacc57, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
363 const __m128i vacc9B = _mm_shuffle_epi32(vacc89AB, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
364 const __m128i vprod8A = _mm_add_epi64(_mm_mul_epi32(vacc89AB, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
365 const __m128i vprod9B = _mm_add_epi64(_mm_mul_epi32(vacc9B, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
366 const __m128i vaccDF = _mm_shuffle_epi32(vaccCDEF, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
367 const __m128i vprodCE = _mm_add_epi64(_mm_mul_epi32(vaccCDEF, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
368 const __m128i vprodDF = _mm_add_epi64(_mm_mul_epi32(vaccDF, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
369 const __m128i vaccHJ = _mm_shuffle_epi32(vaccGHIJ, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
370 const __m128i vprodGI = _mm_add_epi64(_mm_mul_epi32(vaccGHIJ, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
371 const __m128i vprodHJ = _mm_add_epi64(_mm_mul_epi32(vaccHJ, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
372 const __m128i vaccLN = _mm_shuffle_epi32(vaccKLMN, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
373 const __m128i vprodKM = _mm_add_epi64(_mm_mul_epi32(vaccKLMN, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
374 const __m128i vprodLN = _mm_add_epi64(_mm_mul_epi32(vaccLN, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
376 const __m128i vq31prod02 = _mm_srli_epi64(vprod02, 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
377 const __m128i vq31prod13 = _mm_add_epi64(vprod13, vprod13); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
378 const __m128i vq31prod46 = _mm_srli_epi64(vprod46, 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
379 const __m128i vq31prod57 = _mm_add_epi64(vprod57, vprod57); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
380 const __m128i vq31prod8A = _mm_srli_epi64(vprod8A, 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
381 const __m128i vq31prod9B = _mm_add_epi64(vprod9B, vprod9B); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
382 const __m128i vq31prodCE = _mm_srli_epi64(vprodCE, 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
383 const __m128i vq31prodDF = _mm_add_epi64(vprodDF, vprodDF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
384 const __m128i vq31prodGI = _mm_srli_epi64(vprodGI, 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
385 const __m128i vq31prodHJ = _mm_add_epi64(vprodHJ, vprodHJ); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
386 const __m128i vq31prodKM = _mm_srli_epi64(vprodKM, 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
387 const __m128i vq31prodLN = _mm_add_epi64(vprodLN, vprodLN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
389 const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
390 const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
391 const __m128i vq31prod89AB = _mm_blend_epi16(vq31prod8A, vq31prod9B, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
392 const __m128i vq31prodCDEF = _mm_blend_epi16(vq31prodCE, vq31prodDF, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
393 const __m128i vq31prodGHIJ = _mm_blend_epi16(vq31prodGI, vq31prodHJ, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
394 const __m128i vq31prodKLMN = _mm_blend_epi16(vq31prodKM, vq31prodLN, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
396 const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
397 const __m128i vrem0123 = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
399 const __m128i vrem4567 = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
401 const __m128i vrem89AB = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
403 const __m128i vremCDEF = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
405 const __m128i vremGHIJ = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
407 const __m128i vremKLMN = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
410 …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
411 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
425 …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
426 … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
427 … __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
428 … __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
430 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
431 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
436 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
437 __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
439 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
440 _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
446 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
447 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 4 * sizeof(int32_t))); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
450 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
451 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
452 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
453 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
457 const __m128i vp0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
458 const __m128i vp0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
463 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
464 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
465 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 24)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
466 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
470 const __m128i vp1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
471 const __m128i vp1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
476 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
477 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
478 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
479 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
483 const __m128i vp2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
484 const __m128i vp2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
489 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
490 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
491 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 72)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
492 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
496 const __m128i vp3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
497 const __m128i vp3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
502 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
503 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
504 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
505 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
509 const __m128i vp4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
510 const __m128i vp4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
515 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
516 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
517 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 120)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
518 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
522 const __m128i vp5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
523 const __m128i vp5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
528 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
529 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
530 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 144)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
531 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
535 const __m128i vp6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
536 const __m128i vp6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
541 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
542 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
543 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 168)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
544 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
548 const __m128i vp7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
549 const __m128i vp7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
554 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
555 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
556 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 192)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
557 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
561 const __m128i vp8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
562 const __m128i vp8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
570 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
571 const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
573 const __m128i vacc13 = _mm_shuffle_epi32(vacc0123, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
574 const __m128i vacc57 = _mm_shuffle_epi32(vacc4567, _MM_SHUFFLE(3, 3, 1, 1)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
576 const __m128i vprod02 = _mm_add_epi64(_mm_mul_epi32(vacc0123, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
577 const __m128i vprod46 = _mm_add_epi64(_mm_mul_epi32(vacc4567, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
579 const __m128i vprod13 = _mm_add_epi64(_mm_mul_epi32(vacc13, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
580 const __m128i vprod57 = _mm_add_epi64(_mm_mul_epi32(vacc57, vmultiplier), vrounding); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
582 const __m128i vq31prod02 = _mm_srli_epi64(vprod02, 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
583 const __m128i vq31prod13 = _mm_add_epi64(vprod13, vprod13); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
584 const __m128i vq31prod46 = _mm_srli_epi64(vprod46, 31); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
585 const __m128i vq31prod57 = _mm_add_epi64(vprod57, vprod57); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
587 const __m128i vq31prod0123 = _mm_blend_epi16(vq31prod02, vq31prod13, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
588 const __m128i vq31prod4567 = _mm_blend_epi16(vq31prod46, vq31prod57, 0xCC); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
590 … const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
591 const __m128i vrem0123 = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
593 const __m128i vrem4567 = in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
596 …const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_thresh… in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
597 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
603 …const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
604 … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
606 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
607 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
610 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()
613 _mm_storel_epi64((__m128i*) output, vout0123456701234567); in xnn_qs8_dwconv_minmax_ukernel_up24x9__sse41_mul16()