/external/XNNPACK/src/qu8-igemm/ |
D | 4x8-minmax-neon.c | in xnn_qu8_igemm_minmax_ukernel_4x8__neon():
      96  const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));  [local]
      97  const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
     110  const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));  [local]
     111  const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
     124  const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));  [local]
     125  const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
     138  const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));  [local]
     139  const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
     152  const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));  [local]
     153  const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
     [all …]
|
D | 8x8-minmax-neon.c | in xnn_qu8_igemm_minmax_ukernel_8x8__neon():
     144  const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));  [local]
     145  const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
     166  const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));  [local]
     167  const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
     188  const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));  [local]
     189  const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
     210  const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));  [local]
     211  const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
     232  const uint8x8_t vb01234567 = vld1_u8(w); w = (void*) ((uintptr_t) w + sizeof(uint8x8_t));  [local]
     233  const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
     [all …]
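The repeated pair of matches above is the weight-loading step of the QU8 IGEMM micro-kernels: eight unsigned 8-bit weights are loaded, the filter zero point is removed with a widening vsubl_u8, and the result is reinterpreted as signed 16-bit for the subsequent multiply-accumulates. A minimal standalone sketch of just that step; the helper name and pointer-passing convention are illustrative, not XNNPACK's:

    #include <arm_neon.h>
    #include <stdint.h>

    // Load one 8-wide strip of packed qu8 weights, subtract the filter zero
    // point with a widening vsubl_u8, and reinterpret as s16 for the
    // following multiply-accumulates.
    static int16x8_t load_widen_qu8_weights(const void** w, uint8x8_t vb_zero_point) {
      const uint8x8_t vb01234567 = vld1_u8((const uint8_t*) *w);
      *w = (const void*) ((uintptr_t) *w + sizeof(uint8x8_t));
      // Both operands are in [0, 255], so the u16 difference always fits in
      // s16 and the reinterpret is lossless.
      return vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
    }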
|
/external/XNNPACK/src/f16-vbinary/gen/ |
D | vmax-neonfp16arith-x16.c | in xnn_f16_vmax_ukernel__neonfp16arith_x16():
      38  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      42  float16x8_t vy01234567 = vmaxq_f16(va01234567, vb01234567);
      52  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      54  float16x8_t vy01234567 = vmaxq_f16(va01234567, vb01234567);
      59  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      61  float16x8_t vy01234567 = vmaxq_f16(va01234567, vb01234567);
|
D | vmin-neonfp16arith-x16.c | in xnn_f16_vmin_ukernel__neonfp16arith_x16():
      38  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      42  float16x8_t vy01234567 = vminq_f16(va01234567, vb01234567);
      52  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      54  float16x8_t vy01234567 = vminq_f16(va01234567, vb01234567);
      59  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      61  float16x8_t vy01234567 = vminq_f16(va01234567, vb01234567);
|
D | vsqrdiff-neonfp16arith-x16.c | in xnn_f16_vsqrdiff_ukernel__neonfp16arith_x16():
      38  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      42  float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567);
      54  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      56  float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567);
      62  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      64  float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567);
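Note that the vsqrdiff matches show only a subtraction: the squaring step does not mention vb01234567, so the search does not list it. The element-wise operation presumably completes as a self-multiply; a hedged sketch of the full step, assuming an FP16-arithmetic NEON target:

    #include <arm_neon.h>   // requires __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

    // Squared difference for one group of 8 half-precision lanes. The
    // vmulq_f16 here is the inferred completion of the vsubq_f16 lines
    // matched above; the helper name is illustrative.
    static float16x8_t vsqrdiff_f16_sketch(float16x8_t va01234567, float16x8_t vb01234567) {
      float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567);
      return vmulq_f16(vy01234567, vy01234567);
    }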
|
D | vdiv-minmax-neonfp16arith-x16.c | in xnn_f16_vdiv_minmax_ukernel__neonfp16arith_x16():
      40  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      44  float16x8_t vy01234567 = vdivq_f16(va01234567, vb01234567);
      59  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      61  float16x8_t vy01234567 = vdivq_f16(va01234567, vb01234567);
      68  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      70  float16x8_t vy01234567 = vdivq_f16(va01234567, vb01234567);
|
D | vsub-minmax-neonfp16arith-x16.c | in xnn_f16_vsub_minmax_ukernel__neonfp16arith_x16():
      40  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      44  float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567);
      59  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      61  float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567);
      68  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      70  float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567);
|
D | vmul-minmax-neonfp16arith-x16.c | in xnn_f16_vmul_minmax_ukernel__neonfp16arith_x16():
      40  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      44  float16x8_t vy01234567 = vmulq_f16(va01234567, vb01234567);
      59  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      61  float16x8_t vy01234567 = vmulq_f16(va01234567, vb01234567);
      68  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      70  float16x8_t vy01234567 = vmulq_f16(va01234567, vb01234567);
|
D | vadd-minmax-neonfp16arith-x16.c | in xnn_f16_vadd_minmax_ukernel__neonfp16arith_x16():
      40  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      44  float16x8_t vy01234567 = vaddq_f16(va01234567, vb01234567);
      59  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      61  float16x8_t vy01234567 = vaddq_f16(va01234567, vb01234567);
      68  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      70  float16x8_t vy01234567 = vaddq_f16(va01234567, vb01234567);
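All of the -minmax-…-x16 entries above share one loop shape: each main-loop iteration loads 16 half-precision elements per input (the matches show only the first vector, vb01234567; a second vector for lanes 8–15 is implied by the x16 unroll), applies the binary op, clamps to the output min/max, and stores. A compilable sketch of that shape for addition; the function name, parameter layout, and scalar tail are illustrative, not XNNPACK's:

    #include <arm_neon.h>   // requires __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    #include <stddef.h>

    // y = clamp(a + b, vy_min, vy_max) over n half-precision values,
    // 16 per main iteration as in the x16 kernels listed above.
    void vadd_minmax_f16_x16_sketch(size_t n, const float16_t* a, const float16_t* b,
                                    float16_t* y, float16x8_t vy_min, float16x8_t vy_max) {
      for (; n >= 16; n -= 16) {
        const float16x8_t va01234567 = vld1q_f16(a);
        const float16x8_t va89ABCDEF = vld1q_f16(a + 8); a += 16;
        const float16x8_t vb01234567 = vld1q_f16(b);
        const float16x8_t vb89ABCDEF = vld1q_f16(b + 8); b += 16;
        float16x8_t vy01234567 = vaddq_f16(va01234567, vb01234567);
        float16x8_t vy89ABCDEF = vaddq_f16(va89ABCDEF, vb89ABCDEF);
        vy01234567 = vminq_f16(vmaxq_f16(vy01234567, vy_min), vy_max);
        vy89ABCDEF = vminq_f16(vmaxq_f16(vy89ABCDEF, vy_min), vy_max);
        vst1q_f16(y, vy01234567);
        vst1q_f16(y + 8, vy89ABCDEF); y += 16;
      }
      const float16_t vmin = vgetq_lane_f16(vy_min, 0);
      const float16_t vmax = vgetq_lane_f16(vy_max, 0);
      for (; n != 0; n--) {  // simplified scalar tail, not the real kernels' vector tail
        float16_t vy = *a++ + *b++;
        vy = vy > vmin ? vy : vmin;
        vy = vy < vmax ? vy : vmax;
        *y++ = vy;
      }
    }

The -x8 entries below are the same kernels at half the unroll: one float16x8_t per input per iteration, so vb01234567 is the only weight-side vector that appears.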
|
D | vmin-neonfp16arith-x8.c | in xnn_f16_vmin_ukernel__neonfp16arith_x8():
      38  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      40  float16x8_t vy01234567 = vminq_f16(va01234567, vb01234567);
      48  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      50  float16x8_t vy01234567 = vminq_f16(va01234567, vb01234567);
|
D | vmax-neonfp16arith-x8.c | in xnn_f16_vmax_ukernel__neonfp16arith_x8():
      38  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      40  float16x8_t vy01234567 = vmaxq_f16(va01234567, vb01234567);
      48  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      50  float16x8_t vy01234567 = vmaxq_f16(va01234567, vb01234567);
|
D | vsqrdiff-neonfp16arith-x8.c | in xnn_f16_vsqrdiff_ukernel__neonfp16arith_x8():
      38  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      40  float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567);
      49  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      51  float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567);
|
D | vdiv-minmax-neonfp16arith-x8.c | in xnn_f16_vdiv_minmax_ukernel__neonfp16arith_x8():
      40  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      42  float16x8_t vy01234567 = vdivq_f16(va01234567, vb01234567);
      53  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      55  float16x8_t vy01234567 = vdivq_f16(va01234567, vb01234567);
|
D | vadd-minmax-neonfp16arith-x8.c | in xnn_f16_vadd_minmax_ukernel__neonfp16arith_x8():
      40  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      42  float16x8_t vy01234567 = vaddq_f16(va01234567, vb01234567);
      53  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      55  float16x8_t vy01234567 = vaddq_f16(va01234567, vb01234567);
|
D | vsub-minmax-neonfp16arith-x8.c | in xnn_f16_vsub_minmax_ukernel__neonfp16arith_x8():
      40  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      42  float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567);
      53  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      55  float16x8_t vy01234567 = vsubq_f16(va01234567, vb01234567);
|
D | vmul-minmax-neonfp16arith-x8.c | in xnn_f16_vmul_minmax_ukernel__neonfp16arith_x8():
      40  const float16x8_t vb01234567 = vld1q_f16(b); b += 8;  [local]
      42  float16x8_t vy01234567 = vmulq_f16(va01234567, vb01234567);
      53  const float16x8_t vb01234567 = vld1q_f16(b);  [local]
      55  float16x8_t vy01234567 = vmulq_f16(va01234567, vb01234567);
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 8x8inc-minmax-fma3-broadcast.c | in xnn_f32_gemminc_minmax_ukernel_8x8__fma3_broadcast():
     115  const __m256 vb01234567 = _mm256_load_ps(w);  [local]
     118  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
     119  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
     120  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
     121  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
     122  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
     123  vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
     124  vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
     125  vacc7x01234567 = _mm256_fmadd_ps(va7, vb01234567, vacc7x01234567);
|
D | 7x8inc-minmax-fma3-broadcast.c | in xnn_f32_gemminc_minmax_ukernel_7x8__fma3_broadcast():
     106  const __m256 vb01234567 = _mm256_load_ps(w);  [local]
     109  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
     110  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
     111  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
     112  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
     113  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
     114  vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
     115  vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
|
D | 7x8inc-minmax-avx-broadcast.c | in xnn_f32_gemminc_minmax_ukernel_7x8__avx_broadcast():
     106  const __m256 vb01234567 = _mm256_load_ps(w);  [local]
     109  vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
     110  vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
     111  vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
     112  vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
     113  vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
     114  vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
     115  vacc6x01234567 = _mm256_add_ps(vacc6x01234567, _mm256_mul_ps(va6, vb01234567));
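The gen-inc kernels accumulate on top of partial results passed in from a previous pass ("inc"); each inner-loop step loads one row of 8 packed weights and folds it into every output row with a broadcast activation. A sketch of one such k-step for the FMA3 variants; XNNPACK unrolls the 7 rows by hand, so the loop here is only a compact equivalent, and the names and signature are illustrative:

    #include <immintrin.h>
    #include <stddef.h>

    // One k-step of a 7x8 broadcast GEMM micro-kernel: 8 packed weights are
    // loaded once, then folded into each output row via a broadcast
    // activation. Compile with -mfma.
    static void gemm_7x8_kstep_fma3(const float* w, const float* a, size_t a_stride,
                                    __m256 vacc[7]) {
      const __m256 vb01234567 = _mm256_load_ps(w);  // 32-byte-aligned packed weights
      for (size_t m = 0; m < 7; m++) {
        const __m256 vam = _mm256_broadcast_ss(a + m * a_stride);
        vacc[m] = _mm256_fmadd_ps(vam, vb01234567, vacc[m]);
      }
    }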
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 8x8-minmax-fma3-broadcast.c | in xnn_f32_igemm_minmax_ukernel_8x8__fma3_broadcast():
     130  const __m256 vb01234567 = _mm256_load_ps(w);  [local]
     150  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
     151  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
     152  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
     153  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
     154  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
     155  vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
     156  vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
     157  vacc7x01234567 = _mm256_fmadd_ps(va7, vb01234567, vacc7x01234567);
|
D | 7x8-minmax-avx-broadcast.c | in xnn_f32_igemm_minmax_ukernel_7x8__avx_broadcast():
     120  const __m256 vb01234567 = _mm256_load_ps(w);  [local]
     138  vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
     139  vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
     140  vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
     141  vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
     142  vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
     143  vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
     144  vacc6x01234567 = _mm256_add_ps(vacc6x01234567, _mm256_mul_ps(va6, vb01234567));
|
D | 7x8-minmax-fma3-broadcast.c | in xnn_f32_igemm_minmax_ukernel_7x8__fma3_broadcast():
     120  const __m256 vb01234567 = _mm256_load_ps(w);  [local]
     138  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
     139  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
     140  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
     141  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
     142  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
     143  vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
     144  vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
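The igemm variants are the indirect-GEMM form used for convolution: activation rows arrive through an indirection buffer of row pointers rather than a dense A matrix, which is why the weight load (line 120/130) sits farther from the fmadds than in the plain gemm kernels. A much-simplified sketch of that outer structure; the signature, pointer handling, and loop shape are illustrative assumptions, not XNNPACK's exact code:

    #include <immintrin.h>
    #include <stddef.h>

    // Indirect GEMM, simplified: for each of ks kernel-size steps, fetch the
    // 7 activation-row pointers for this output tile, then run the dense
    // broadcast loop over kc channels. Compile with -mfma.
    void igemm_7x8_fma3_sketch(size_t ks, size_t kc, const float** indirection,
                               const float* w, __m256 vacc[7]) {
      do {
        const float* a[7];
        for (size_t m = 0; m < 7; m++) {
          a[m] = *indirection++;  // one activation-row pointer per output row
        }
        for (size_t k = 0; k < kc; k++) {
          const __m256 vb01234567 = _mm256_load_ps(w); w += 8;
          for (size_t m = 0; m < 7; m++) {
            vacc[m] = _mm256_fmadd_ps(_mm256_broadcast_ss(a[m] + k), vb01234567, vacc[m]);
          }
        }
      } while (--ks != 0);
    }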
|
/external/XNNPACK/src/f32-gemm/gen/ |
D | 8x8-minmax-fma3-broadcast.c | in xnn_f32_gemm_minmax_ukernel_8x8__fma3_broadcast():
     113  const __m256 vb01234567 = _mm256_load_ps(w);  [local]
     116  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
     117  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
     118  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
     119  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
     120  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
     121  vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
     122  vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
     123  vacc7x01234567 = _mm256_fmadd_ps(va7, vb01234567, vacc7x01234567);
|
D | 7x8-minmax-avx-broadcast.c | in xnn_f32_gemm_minmax_ukernel_7x8__avx_broadcast():
     104  const __m256 vb01234567 = _mm256_load_ps(w);  [local]
     107  vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
     108  vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
     109  vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
     110  vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
     111  vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
     112  vacc5x01234567 = _mm256_add_ps(vacc5x01234567, _mm256_mul_ps(va5, vb01234567));
     113  vacc6x01234567 = _mm256_add_ps(vacc6x01234567, _mm256_mul_ps(va6, vb01234567));
|
D | 7x8-minmax-fma3-broadcast.c | in xnn_f32_gemm_minmax_ukernel_7x8__fma3_broadcast():
     104  const __m256 vb01234567 = _mm256_load_ps(w);  [local]
     107  vacc0x01234567 = _mm256_fmadd_ps(va0, vb01234567, vacc0x01234567);
     108  vacc1x01234567 = _mm256_fmadd_ps(va1, vb01234567, vacc1x01234567);
     109  vacc2x01234567 = _mm256_fmadd_ps(va2, vb01234567, vacc2x01234567);
     110  vacc3x01234567 = _mm256_fmadd_ps(va3, vb01234567, vacc3x01234567);
     111  vacc4x01234567 = _mm256_fmadd_ps(va4, vb01234567, vacc4x01234567);
     112  vacc5x01234567 = _mm256_fmadd_ps(va5, vb01234567, vacc5x01234567);
     113  vacc6x01234567 = _mm256_fmadd_ps(va6, vb01234567, vacc6x01234567);
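Comparing the -avx- and -fma3- listings above shows the only inner-loop difference between the two instruction-set variants: without FMA, each accumulation is a separate _mm256_mul_ps and _mm256_add_ps (two roundings); with FMA3 it is a single _mm256_fmadd_ps (one rounding, and typically higher throughput). A minimal side-by-side sketch, with illustrative helper names:

    #include <immintrin.h>

    // The same accumulation step in both instruction sets. The AVX form
    // rounds twice (after the multiply and after the add); the FMA3 form
    // fuses the two and rounds once.
    static __m256 acc_avx(__m256 vacc, __m256 va, __m256 vb) {
      return _mm256_add_ps(vacc, _mm256_mul_ps(va, vb));
    }

    static __m256 acc_fma3(__m256 vacc, __m256 va, __m256 vb) {
      return _mm256_fmadd_ps(va, vb, vacc);  // requires -mfma
    }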
|