/external/XNNPACK/src/qs8-gemm/gen/
D | 1x16-minmax-neon-mull-addw-dup.c | 46 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() local 61 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() 71 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() 81 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() 91 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() 101 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() 111 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() 121 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() 131 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() 146 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mull_addw_dup() [all …]
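These are the "mull + addw, dup" GEMM hits: vacc0xCDEF is row 0's int32x4_t accumulator for output columns 0xC-0xF, loaded from the bias at the front of the packed weights and then updated once per k step. Below is a minimal sketch of that accumulation idiom, not the generated kernel's code; the helper and variable names (a_byte, b_col8_15) are invented for illustration.

  #include <arm_neon.h>
  #include <stdint.h>

  /* Sketch of the mull/addw/dup idiom: broadcast one int8 activation, multiply it against
     eight int8 weights into int16 products, then widen-and-add into two int32x4 accumulators. */
  static inline void qs8_accumulate_addw_dup(
      int32x4_t* acc_89AB, int32x4_t* acc_CDEF,  /* accumulators for columns 8..11 and 12..15 */
      int8_t a_byte,                             /* one activation byte for this k step */
      const int8_t* b_col8_15)                   /* 8 packed weights for columns 8..15 */
  {
    const int8x8_t va = vdup_n_s8(a_byte);                      /* "dup": broadcast the activation */
    const int8x8_t vb = vld1_s8(b_col8_15);
    const int16x8_t vprod = vmull_s8(vb, va);                   /* "mull": int8 x int8 -> int16 */
    *acc_89AB = vaddw_s16(*acc_89AB, vget_low_s16(vprod));      /* "addw": widen int16 and add */
    *acc_CDEF = vaddw_s16(*acc_CDEF, vget_high_s16(vprod));     /* the vacc0xCDEF update seen above */
  }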
|
D | 1x16-minmax-neon-mlal-lane.c | 46 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() local 62 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 72 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 82 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 92 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 103 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 113 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 123 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 133 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() 149 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_1x16__neon_mlal_lane() [all …]
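The mlal-lane hits use a different inner step: activations and weights are sign-extended to int16 once, then vmlal_lane_s16 multiply-accumulates one activation lane per k step directly into the int32 accumulators. A hedged sketch of a single k step, with invented names, follows.

  #include <arm_neon.h>

  /* Sketch of the mlal-lane idiom: vxa0 holds 8 widened activations, vxb89ABCDEF holds the
     8 widened weights of columns 8..15 for one k step. Illustrative only. */
  static inline void qs8_accumulate_mlal_lane_k0(
      int32x4_t* acc_89AB, int32x4_t* acc_CDEF,
      int16x8_t vxa0, int16x8_t vxb89ABCDEF)
  {
    /* k step 0 uses lane 0 of the low activation half; steps 1..7 use the other lanes/half. */
    *acc_89AB = vmlal_lane_s16(*acc_89AB, vget_low_s16(vxb89ABCDEF),  vget_low_s16(vxa0), 0);
    *acc_CDEF = vmlal_lane_s16(*acc_CDEF, vget_high_s16(vxb89ABCDEF), vget_low_s16(vxa0), 0);
  }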
|
D | 1x16c2-minmax-neon-mull-padal-dup.c | 47 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() local 100 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 101 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 102 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 103 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 123 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 138 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 153 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 161 vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 168 vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() [all …]
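The c2 "padal" hits accumulate via vpadalq_s16, which adds adjacent int16 product pairs into the int32 lanes, and the hits at lines 161/168 show the start of the shared requantization tail. A hedged sketch of both pieces, treating the fixed-point parameters as opaque inputs; the helper names are invented.

  #include <arm_neon.h>

  /* "padal": pairwise-add the eight int16 products and accumulate into four int32 lanes. */
  static inline int32x4_t qs8_padal(int32x4_t acc, int16x8_t vprod) {
    return vpadalq_s16(acc, vprod);
  }

  /* Requantization steps matching the vqrdmulhq/vsraq/vrshlq hits: Q31 fixed-point multiply,
     a pre-shift correction, then a rounding arithmetic right shift (vright_shift is negative). */
  static inline int32x4_t qs8_requantize(int32x4_t acc, int32x4_t vmultiplier,
                                         int32x4_t vzero_shift_mask, int32x4_t vright_shift) {
    acc = vqrdmulhq_s32(acc, vmultiplier);
    acc = vsraq_n_s32(acc, vbicq_s32(acc, vzero_shift_mask), 31);
    return vrshlq_s32(acc, vright_shift);
  }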
|
D | 2x16-minmax-neon-mlal-lane.c | 52 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() local 56 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 76 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 90 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 104 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 118 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 133 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 147 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 161 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() 175 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mlal_lane() [all …]
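In the 2x16/3x16/4x16 hits every additional output row starts from the same bias: vacc0xCDEF is loaded from the packed weights w and then copied into vacc1xCDEF, vacc2xCDEF, vacc3xCDEF. A small sketch of that setup for two rows, mirroring the pointer arithmetic in the hits (names invented):

  #include <arm_neon.h>
  #include <stdint.h>

  /* Both rows' accumulators for columns 12..15 start from the same 4 bias values. */
  static inline void qs8_init_two_row_acc(const void** w, int32x4_t* acc0_CDEF, int32x4_t* acc1_CDEF) {
    *acc0_CDEF = vld1q_s32((const int32_t*) *w);
    *w = (const void*) ((uintptr_t) *w + 4 * sizeof(int32_t));   /* advance past the bias */
    *acc1_CDEF = *acc0_CDEF;                                      /* row 1 reuses row 0's bias */
  }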
|
D | 2x16-minmax-neon-mull-addw-dup.c | 52 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() local 56 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 75 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 91 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 107 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 123 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 139 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 155 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 171 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() 187 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7)); in xnn_qs8_gemm_minmax_ukernel_2x16__neon_mull_addw_dup() [all …]
|
D | 1x16c4-minmax-neondot.c | 49 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() local 72 vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb0123xCDEF, va0x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() 76 vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() 95 vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb0123xCDEF, va0x01234567, 0); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() 105 const int32x4_t vproduct0xCDEF = vqrdmulhq_n_s32(vacc0xCDEF, params->neon.multiplier); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() 110 vacc0xCDEF = vsraq_n_s32(vproduct0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() 115 vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift); in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() 120 …const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), v… in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot() 125 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_… in xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot()
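The neondot hits rely on the Armv8.2 dot-product extension: vdotq_lane_s32 folds four int8 x int8 products per lane straight into the int32 accumulator, and the tail at lines 120/125 saturating-narrows two int32x4 accumulators to int16 and adds the output zero point. A hedged sketch of both, guarded by the dot-product feature macro; names are invented.

  #include <arm_neon.h>

  #if defined(__ARM_FEATURE_DOTPROD)
  /* One dot-product step: vb holds 4 k-steps x 4 columns of weights; lane 0 of va selects the
     matching 4 activation bytes. Requires a dot-product-capable target (e.g. -march=armv8.2-a+dotprod). */
  static inline int32x4_t qs8_dot_step(int32x4_t acc_CDEF, int8x16_t vb0123xCDEF, int8x8_t va0) {
    return vdotq_lane_s32(acc_CDEF, vb0123xCDEF, va0, 0);
  }
  #endif

  /* Saturating narrow of two int32x4 accumulators to int16x8 plus the output zero point,
     as in the vqmovn/vcombine/vqaddq_s16 hits. */
  static inline int16x8_t qs8_narrow(int32x4_t acc_89AB, int32x4_t acc_CDEF, int16x8_t vzero_point) {
    return vqaddq_s16(vcombine_s16(vqmovn_s32(acc_89AB), vqmovn_s32(acc_CDEF)), vzero_point);
  }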
|
D | 1x16c2-minmax-neon-mlal-padal-dup.c | 47 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() local 87 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 103 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 119 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 135 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 188 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 189 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 190 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 191 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 211 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() [all …]
|
D | 3x16-minmax-neon-mlal-lane.c | 58 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() local 62 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 66 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 90 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 108 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 126 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 144 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 163 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 181 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() 199 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mlal_lane() [all …]
|
D | 3x16-minmax-neon-mull-addw-dup.c | 58 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() local 62 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 66 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 89 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 111 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 133 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 155 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 177 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 199 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() 221 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6)); in xnn_qs8_gemm_minmax_ukernel_3x16__neon_mull_addw_dup() [all …]
|
D | 2x16c2-minmax-neon-mull-padal-dup.c | 53 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() local 57 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 111 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 112 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 113 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 114 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 167 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 190 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 213 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 229 vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() [all …]
|
D | 2x16c2-minmax-neon-mlal-padal-dup.c | 53 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() local 57 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 110 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 138 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 166 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 194 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 249 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 250 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 251 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() 252 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3); in xnn_qs8_gemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup() [all …]
|
D | 4x16-minmax-neon-mlal-lane.c | 64 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() local 68 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 72 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 76 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 104 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 126 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 148 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 170 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 193 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() 215 vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-neon-mull-addw-dup.c | 64 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() local 68 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 72 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 76 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 103 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 131 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 159 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 187 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 215 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() 243 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5)); in xnn_qs8_gemm_minmax_ukernel_4x16__neon_mull_addw_dup() [all …]
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 1x16-minmax-neon-mlal-lane.c | 49 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() local 73 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 83 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 93 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 103 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 114 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 124 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 134 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 144 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() 160 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mlal_lane() [all …]
|
D | 1x16-minmax-neon-mull-addw-dup.c | 49 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() local 72 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() 82 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() 92 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() 102 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() 112 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() 122 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() 132 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() 142 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() 157 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_1x16__neon_mull_addw_dup() [all …]
|
D | 1x16c4-minmax-neondot.c | 50 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() local 81 vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb0123xCDEF, va0x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() 85 vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb4567xCDEF, va0x01234567, 1); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() 104 vacc0xCDEF = vdotq_lane_s32(vacc0xCDEF, vb0123xCDEF, va0x01234567, 0); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() 113 vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() 120 vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() 125 vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift); in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() 130 …const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), v… in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot() 135 …x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_… in xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot()
|
D | 1x16c2-minmax-neon-mull-padal-dup.c | 50 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() local 111 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 112 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 113 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 114 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 134 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 149 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 164 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 175 vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() 182 vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup() [all …]
|
D | 2x16-minmax-neon-mlal-lane.c | 53 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() local 57 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 89 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 103 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 117 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 131 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 146 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 160 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 174 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() 188 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mlal_lane() [all …]
|
D | 2x16-minmax-neon-mull-addw-dup.c | 53 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() local 57 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 88 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 104 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 120 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 136 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 152 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 168 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 184 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() 200 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc7)); in xnn_qs8_igemm_minmax_ukernel_2x16__neon_mull_addw_dup() [all …]
|
D | 1x16c2-minmax-neon-mlal-padal-dup.c | 50 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() local 98 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 114 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 130 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 146 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 199 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 200 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 201 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 202 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() 222 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup() [all …]
|
D | 3x16-minmax-neon-mull-addw-dup.c | 57 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() local 61 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 65 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 104 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 126 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 148 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 170 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 192 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 214 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() 236 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc6)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mull_addw_dup() [all …]
|
D | 3x16-minmax-neon-mlal-lane.c | 57 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() local 61 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 65 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 105 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 123 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 141 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 159 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 178 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 196 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() 214 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_3x16__neon_mlal_lane() [all …]
|
D | 2x16c2-minmax-neon-mull-padal-dup.c | 54 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() local 58 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 124 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 125 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 126 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 127 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 180 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 203 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 226 vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() 245 vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier); in xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup() [all …]
|
D | 4x16-minmax-neon-mlal-lane.c | 61 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() local 65 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 69 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 73 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 121 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 143 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 165 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 187 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 210 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() 232 … vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mlal_lane() [all …]
|
D | 4x16-minmax-neon-mull-addw-dup.c | 61 int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() local 65 int32x4_t vacc1xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 69 int32x4_t vacc2xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 73 int32x4_t vacc3xCDEF = vacc0xCDEF; in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 120 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc0)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 148 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc1)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 176 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc2)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 204 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc3)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 232 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc4)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() 260 vacc0xCDEF = vaddw_s16(vacc0xCDEF, vget_high_s16(vprod0x89ABCDEFc5)); in xnn_qs8_igemm_minmax_ukernel_4x16__neon_mull_addw_dup() [all …]
|