/external/XNNPACK/src/qs8-igemm/gen/ |
D | 1x16c4-minmax-neondot.c | 132 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() local 137 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() local 142 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() 144 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() 147 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() 155 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() 158 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot()
|
D | 1x16c2-minmax-neon-mull-padal-dup.c | 194 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() local 199 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() local 204 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 206 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 209 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 217 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 220 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
|
D | 1x16c8-minmax-neon-mull-padal.c | 199 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() local 204 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() local 209 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() 211 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() 214 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() 222 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() 225 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal()
|
D | 1x16-minmax-neon-mlal-lane.c | 260 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() local 265 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() local 270 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 272 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 275 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 283 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 286 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane()
|
D | 1x16-minmax-neon-mull-addw-dup.c | 257 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() local 262 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() local 267 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() 269 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() 272 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() 280 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() 283 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup()
|
D | 1x16c16-minmax-neon-mlal-padal.c | 215 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 220 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 225 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() 227 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() 230 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() 238 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() 241 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal()
|
D | 4x16c4-minmax-neondot.c | 252 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() local 266 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() local 277 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 282 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 288 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 299 …int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot() 307 …vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0… in xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot()
|
D | 1x16c8-minmax-neon-mlal-padal.c | 288 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 293 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 298 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 300 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 303 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 311 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 314 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal()
|
D | 2x16c2-minmax-neon-mull-padal-dup.c | 278 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() local 286 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() local 293 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 296 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 300 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 309 …int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8… in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 313 …vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0… in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
|
D | 1x16c2-minmax-neon-mlal-padal-dup.c | 282 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() local 287 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() local 292 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 294 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 297 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 305 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 308 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
|
D | 2x16c8-minmax-neon-mull-padal.c | 310 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() local 318 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() local 325 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 328 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 332 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 341 …int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8… in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 345 …vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0… in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 1x16c8-minmax-avx512skx.c | 123 …const __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7… in xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx() local 126 _mm_storeu_si128((__m128i*) c0, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx() 137 _mm_mask_storeu_epi8(c0, vmask, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx()
|
D | 2x16-minmax-neon-mlal-lane.c | 350 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() local 358 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() local 365 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 368 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 372 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 381 …int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8… in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 385 …vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0… in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane()
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 1x16c4-minmax-neondot.c | 122 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() local 127 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() local 132 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() 134 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() 138 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() 148 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() 151 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot()
|
D | 1x16c8-minmax-neon-mull-padal.c | 185 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() local 190 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() local 195 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() 197 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() 200 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() 208 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() 211 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal()
|
D | 1x16c2-minmax-neon-mull-padal-dup.c | 180 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() local 185 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() local 190 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 192 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 195 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 203 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 206 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup()
|
D | 1x16-minmax-neon-mull-addw-dup.c | 243 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() local 248 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() local 253 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() 255 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() 258 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() 266 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() 269 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup()
|
D | 1x16-minmax-neon-mlal-lane.c | 247 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() local 252 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() local 257 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 259 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 262 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 270 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 273 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane()
|
D | 1x16c16-minmax-neon-mlal-padal.c | 201 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 206 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 211 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() 213 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() 216 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() 224 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() 227 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal()
|
D | 4x16c4-minmax-neondot.c | 236 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() local 250 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() local 258 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 263 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 270 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 289 …int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot() 296 …vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0… in xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot()
|
D | 1x16c8-minmax-neon-mlal-padal.c | 274 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 279 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 284 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 286 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 289 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 297 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 300 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal()
|
D | 2x16c2-minmax-neon-mull-padal-dup.c | 262 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() local 270 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() local 276 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 279 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 283 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 294 …int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8… in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 298 …vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0… in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup()
|
D | 1x16c2-minmax-neon-mlal-padal-dup.c | 268 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() local 273 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() local 278 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 280 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 283 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 291 int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 294 vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup()
|
D | 2x16c8-minmax-neon-mull-padal.c | 294 int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() local 302 …int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCD… in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() local 308 vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 311 vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 315 vst1q_s8(c0 + 0, vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 326 …int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8… in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 330 …vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0… in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 1x16c8-minmax-avx512skx.c | 112 …const __m128i vout0x0123456789ABCDEF = _mm_shuffle_epi8(vout0x084C2A6E195D3B7F, _mm_set_epi8(15, 7… in xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx() local 115 _mm_storeu_si128((__m128i*) c0, vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx() 126 _mm_mask_storeu_epi8(c0, vmask, vout0x0123456789ABCDEF); in xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx()
|