/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x16c16-minmax-neon-mlal-padal.c | 141 const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 270 int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 271 int16x8_t vprod1x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 272 int16x8_t vprod2x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 273 vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 274 vprod1x14 = vmlal_s8(vprod1x14, vget_high_s8(vb14), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 275 vprod2x14 = vmlal_s8(vprod2x14, vget_high_s8(vb14), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 166 const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 337 int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 338 int16x8_t vprod1x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 339 int16x8_t vprod2x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 340 int16x8_t vprod3x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va3)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 341 vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 342 vprod1x14 = vmlal_s8(vprod1x14, vget_high_s8(vb14), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 343 vprod2x14 = vmlal_s8(vprod2x14, vget_high_s8(vb14), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 344 vprod3x14 = vmlal_s8(vprod3x14, vget_high_s8(vb14), vget_high_s8(va3)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c | 116 const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() local 203 int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 204 int16x8_t vprod1x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 205 vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 206 vprod1x14 = vmlal_s8(vprod1x14, vget_high_s8(vb14), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 1x16c16-minmax-neon-mlal-padal.c | 91 const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 136 int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() 137 vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mull-padal.c | 173 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() local 174 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 175 const int16x8_t vprod1x14 = vmull_s8(vb14, va1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 279 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local 280 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 281 const int16x8_t vprod1x14 = vmull_s8(vb14, va1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 282 const int16x8_t vprod2x14 = vmull_s8(vb14, va2); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 283 const int16x8_t vprod3x14 = vmull_s8(vb14, va3); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 226 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local 227 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 228 const int16x8_t vprod1x14 = vmull_s8(vb14, va1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 229 const int16x8_t vprod2x14 = vmull_s8(vb14, va2); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 1x16c8-minmax-neon-mull-padal.c | 120 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() local 121 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c | 312 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local 313 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 314 const int16x8_t vprod1x14 = vmull_s8(vb14, va1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 1x16c8-minmax-neon-mlal-padal.c | 209 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 210 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 518 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 519 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 520 const int16x8_t vprod1x14 = vmull_s8(vb14, va1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 521 const int16x8_t vprod2x14 = vmull_s8(vb14, va2); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 522 const int16x8_t vprod3x14 = vmull_s8(vb14, va3); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 415 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 416 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 417 const int16x8_t vprod1x14 = vmull_s8(vb14, va1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 418 const int16x8_t vprod2x14 = vmull_s8(vb14, va2); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x16c16-minmax-neon-mlal-padal.c | 126 const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 255 int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 256 int16x8_t vprod1x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 257 int16x8_t vprod2x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 258 vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 259 vprod1x14 = vmlal_s8(vprod1x14, vget_high_s8(vb14), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 260 vprod2x14 = vmlal_s8(vprod2x14, vget_high_s8(vb14), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 149 const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 320 int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 321 int16x8_t vprod1x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 322 int16x8_t vprod2x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 323 int16x8_t vprod3x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va3)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 324 vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 325 vprod1x14 = vmlal_s8(vprod1x14, vget_high_s8(vb14), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 326 vprod2x14 = vmlal_s8(vprod2x14, vget_high_s8(vb14), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 327 vprod3x14 = vmlal_s8(vprod3x14, vget_high_s8(vb14), vget_high_s8(va3)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c | 103 const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() local 190 int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 191 int16x8_t vprod1x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 192 vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 193 vprod1x14 = vmlal_s8(vprod1x14, vget_high_s8(vb14), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 1x16c16-minmax-neon-mlal-padal.c | 80 const int8x16_t vb14 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 125 int16x8_t vprod0x14 = vmull_s8(vget_low_s8(vb14), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() 126 vprod0x14 = vmlal_s8(vprod0x14, vget_high_s8(vb14), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal()
|
D | 1x16c8-minmax-neon-mull-padal.c | 109 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() local 110 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal()
|
D | 2x16c8-minmax-neon-mull-padal.c | 160 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() local 161 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 162 const int16x8_t vprod1x14 = vmull_s8(vb14, va1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 262 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local 263 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 264 const int16x8_t vprod1x14 = vmull_s8(vb14, va1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 265 const int16x8_t vprod2x14 = vmull_s8(vb14, va2); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 266 const int16x8_t vprod3x14 = vmull_s8(vb14, va3); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 211 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() local 212 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 213 const int16x8_t vprod1x14 = vmull_s8(vb14, va1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 214 const int16x8_t vprod2x14 = vmull_s8(vb14, va2); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 400 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 401 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 402 const int16x8_t vprod1x14 = vmull_s8(vb14, va1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 403 const int16x8_t vprod2x14 = vmull_s8(vb14, va2); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c | 299 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local 300 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 301 const int16x8_t vprod1x14 = vmull_s8(vb14, va1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 1x16c8-minmax-neon-mlal-padal.c | 198 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 199 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 501 const int8x8_t vb14 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 502 const int16x8_t vprod0x14 = vmull_s8(vb14, va0); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 503 const int16x8_t vprod1x14 = vmull_s8(vb14, va1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 504 const int16x8_t vprod2x14 = vmull_s8(vb14, va2); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 505 const int16x8_t vprod3x14 = vmull_s8(vb14, va3); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|