/external/XNNPACK/src/qs8-gemm/gen/ |
D | 1x16c8-minmax-neon-mull-padal.c | 55 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() local 102 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() 125 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() 150 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal()
|
D | 1x16c8-minmax-neon-mlal-padal.c | 55 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 131 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 191 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 214 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() 239 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal()
|
D | 1x16c16-minmax-neon-mlal-padal.c | 55 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 118 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() 141 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() 166 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mull-padal.c | 61 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() local 77 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 148 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 180 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 217 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c | 61 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local 77 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 190 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 287 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 319 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 356 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c | 61 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() local 77 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 176 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 212 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 249 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 67 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() local 83 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 99 int32x4_t vacc2x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 194 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 235 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 284 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 67 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 83 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 99 int32x4_t vacc2x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 249 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 383 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 424 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 473 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 73 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local 89 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 105 int32x4_t vacc2x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 121 int32x4_t vacc3x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 240 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 290 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 351 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 3x16c16-minmax-neon-mlal-padal.c | 67 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 83 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 99 int32x4_t vacc2x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 234 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 283 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 332 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 73 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 89 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 105 int32x4_t vacc2x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 121 int32x4_t vacc3x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 292 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 354 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 415 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 73 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 89 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 105 int32x4_t vacc2x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 121 int32x4_t vacc3x11 = vacc0x11; in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 308 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 479 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 529 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 590 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 1x16c8-minmax-neon-mlal-padal.c | 58 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 142 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 202 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 228 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() 253 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal()
|
D | 1x16c8-minmax-neon-mull-padal.c | 58 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() local 113 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() 139 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() 164 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal()
|
D | 1x16c16-minmax-neon-mlal-padal.c | 58 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 129 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() 155 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() 180 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mull-padal.c | 62 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() local 78 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 161 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 196 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 233 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c | 62 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local 78 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 203 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 300 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 335 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 372 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 66 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local 82 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 98 int32x4_t vacc2x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 209 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 253 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 302 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c | 62 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() local 78 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 189 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 228 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 265 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 70 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local 86 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 102 int32x4_t vacc2x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 118 int32x4_t vacc3x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 257 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 310 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 371 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 3x16c16-minmax-neon-mlal-padal.c | 66 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 82 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 98 int32x4_t vacc2x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 249 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 301 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 350 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 66 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 82 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 98 int32x4_t vacc2x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 264 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 398 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 442 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 491 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 70 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 86 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 102 int32x4_t vacc2x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 118 int32x4_t vacc3x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 309 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 374 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 435 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 70 …int32x4_t vacc0x11 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + size… in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 86 int32x4_t vacc1x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 102 int32x4_t vacc2x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 118 int32x4_t vacc3x11 = vacc0x11; in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 325 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 496 vacc0x11 = vpadalq_s16(vacc0x11, vprod0x11); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 549 const int32x4_t vsum0xAB = vpaddq_s32(vacc0x10, vacc0x11); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 610 const int32x2_t vpsum0xB = vadd_s32(vget_low_s32(vacc0x11), vget_high_s32(vacc0x11)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|