/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x16c16-minmax-neon-mlal-padal.c | 137 const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 234 int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 235 int16x8_t vprod1x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 236 int16x8_t vprod2x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 237 vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 238 vprod1x10 = vmlal_s8(vprod1x10, vget_high_s8(vb10), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 239 vprod2x10 = vmlal_s8(vprod2x10, vget_high_s8(vb10), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 162 const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 289 int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 290 int16x8_t vprod1x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 291 int16x8_t vprod2x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 292 int16x8_t vprod3x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va3)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 293 vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 294 vprod1x10 = vmlal_s8(vprod1x10, vget_high_s8(vb10), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 295 vprod2x10 = vmlal_s8(vprod2x10, vget_high_s8(vb10), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 296 vprod3x10 = vmlal_s8(vprod3x10, vget_high_s8(vb10), vget_high_s8(va3)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c | 112 const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() local 179 int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 180 int16x8_t vprod1x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 181 vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 182 vprod1x10 = vmlal_s8(vprod1x10, vget_high_s8(vb10), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 1x16c16-minmax-neon-mlal-padal.c | 87 const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 124 int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() 125 vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mull-padal.c | 153 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() local 154 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 155 const int16x8_t vprod1x10 = vmull_s8(vb10, va1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 243 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local 244 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 245 const int16x8_t vprod1x10 = vmull_s8(vb10, va1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 246 const int16x8_t vprod2x10 = vmull_s8(vb10, va2); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 247 const int16x8_t vprod3x10 = vmull_s8(vb10, va3); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 198 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local 199 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 200 const int16x8_t vprod1x10 = vmull_s8(vb10, va1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 201 const int16x8_t vprod2x10 = vmull_s8(vb10, va2); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 1x16c8-minmax-neon-mull-padal.c | 108 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() local 109 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c | 292 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local 293 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 294 const int16x8_t vprod1x10 = vmull_s8(vb10, va1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 1x16c8-minmax-neon-mlal-padal.c | 197 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 198 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 482 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 483 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 484 const int16x8_t vprod1x10 = vmull_s8(vb10, va1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 485 const int16x8_t vprod2x10 = vmull_s8(vb10, va2); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 486 const int16x8_t vprod3x10 = vmull_s8(vb10, va3); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 387 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 388 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 389 const int16x8_t vprod1x10 = vmull_s8(vb10, va1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 390 const int16x8_t vprod2x10 = vmull_s8(vb10, va2); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x16c16-minmax-neon-mlal-padal.c | 122 const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 219 int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 220 int16x8_t vprod1x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 221 int16x8_t vprod2x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 222 vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 223 vprod1x10 = vmlal_s8(vprod1x10, vget_high_s8(vb10), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 224 vprod2x10 = vmlal_s8(vprod2x10, vget_high_s8(vb10), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 145 const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 272 int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 273 int16x8_t vprod1x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 274 int16x8_t vprod2x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 275 int16x8_t vprod3x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va3)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 276 vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 277 vprod1x10 = vmlal_s8(vprod1x10, vget_high_s8(vb10), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 278 vprod2x10 = vmlal_s8(vprod2x10, vget_high_s8(vb10), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 279 vprod3x10 = vmlal_s8(vprod3x10, vget_high_s8(vb10), vget_high_s8(va3)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c | 99 const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() local 166 int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 167 int16x8_t vprod1x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 168 vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 169 vprod1x10 = vmlal_s8(vprod1x10, vget_high_s8(vb10), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 1x16c16-minmax-neon-mlal-padal.c | 76 const int8x16_t vb10 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 113 int16x8_t vprod0x10 = vmull_s8(vget_low_s8(vb10), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() 114 vprod0x10 = vmlal_s8(vprod0x10, vget_high_s8(vb10), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal()
|
D | 1x16c8-minmax-neon-mull-padal.c | 97 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() local 98 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal()
|
D | 2x16c8-minmax-neon-mull-padal.c | 140 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() local 141 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 142 const int16x8_t vprod1x10 = vmull_s8(vb10, va1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 226 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local 227 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 228 const int16x8_t vprod1x10 = vmull_s8(vb10, va1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 229 const int16x8_t vprod2x10 = vmull_s8(vb10, va2); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 230 const int16x8_t vprod3x10 = vmull_s8(vb10, va3); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 183 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() local 184 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 185 const int16x8_t vprod1x10 = vmull_s8(vb10, va1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 186 const int16x8_t vprod2x10 = vmull_s8(vb10, va2); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 372 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 373 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 374 const int16x8_t vprod1x10 = vmull_s8(vb10, va1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 375 const int16x8_t vprod2x10 = vmull_s8(vb10, va2); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c | 279 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local 280 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 281 const int16x8_t vprod1x10 = vmull_s8(vb10, va1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 1x16c8-minmax-neon-mlal-padal.c | 186 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 187 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 465 const int8x8_t vb10 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 466 const int16x8_t vprod0x10 = vmull_s8(vb10, va0); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 467 const int16x8_t vprod1x10 = vmull_s8(vb10, va1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 468 const int16x8_t vprod2x10 = vmull_s8(vb10, va2); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 469 const int16x8_t vprod3x10 = vmull_s8(vb10, va3); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|