Lines matching refs: _pM128i

406 #   define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))  macro
411 # define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a)) macro
415 #define _pM128(a) _mm_castsi128_ps(_pM128i(a))
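
The two _pM128i definitions above (lines 406 and 411) do the same job: they promote a 64-bit NEON-style operand (an emulated int8x8_t / int16x4_t / ... value) into the low half of an __m128i so that ordinary SSE intrinsics can operate on it; both _mm_cvtsi64_si128 and _mm_loadl_epi64 zero the upper 64 bits. Every match below builds on that. A minimal sketch of the idea, not the library's actual code (the *_sketch names and the 8-byte struct layout are assumptions):

    #include <emmintrin.h>   /* SSE2 */
    #include <stdint.h>
    #include <stdio.h>

    typedef struct { int8_t lane[8]; } int8x8_sketch;              /* hypothetical 64-bit vector */

    #define pM128i_sketch(a) _mm_loadl_epi64((__m128i const*)&(a)) /* like the line-411 variant */

    static int8x8_sketch vadd_s8_sketch(int8x8_sketch a, int8x8_sketch b)
    {
        /* promote both 64-bit operands, add as packed int8, keep only the low 8 lanes */
        __m128i r = _mm_add_epi8(pM128i_sketch(a), pM128i_sketch(b));
        int8x8_sketch res;
        _mm_storel_epi64((__m128i*)&res, r);                        /* write the low 64 bits back */
        return res;
    }

    int main(void)
    {
        int8x8_sketch a = {{1, 2, 3, 4, 5, 6, 7, 8}};
        int8x8_sketch b = {{10, 20, 30, 40, 50, 60, 70, 80}};
        int8x8_sketch c = vadd_s8_sketch(a, b);
        for (int i = 0; i < 8; ++i) printf("%d ", c.lane[i]);       /* prints 11 22 33 ... 88 */
        printf("\n");
        return 0;
    }
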
2789 return64(_mm_add_epi8(_pM128i(a),_pM128i(b))); in vadd_s8()
2797 return64(_mm_add_epi16(_pM128i(a),_pM128i(b))); in vadd_s16()
2805 return64(_mm_add_epi32(_pM128i(a),_pM128i(b))); in vadd_s32()
2880 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, in vaddl_s8()
2881 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, in vaddl_s8()
2889 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 in vaddl_s16()
2890 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1 in vaddl_s16()
2899 a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1 in vaddl_s32()
2900 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 in vaddl_s32()
2908 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1 in vaddl_u8()
2909 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1 in vaddl_u8()
2917 a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1 in vaddl_u16()
2918 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1 in vaddl_u16()
2927 a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1 in vaddl_u32()
2928 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 in vaddl_u32()
2938 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, in vaddw_s8()
2946 b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1, in vaddw_s16()
2954 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 in vaddw_s32()
2962 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1 in vaddw_u8()
2970 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1 in vaddw_u16()
2978 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 in vaddw_u32()
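
The vaddl_* / vaddw_* matches follow a single pattern: _pM128i places each 64-bit operand in an SSE register, a sign- or zero-extension widens the lanes (the upper-case _MM_CVTEPI8_EPI16-style wrappers are flagged //SSE4.1 in the listing, presumably mapping to _mm_cvtepi8_epi16 and friends when that ISA level is available), and the addition is done at the doubled width so it cannot overflow. A minimal sketch under the assumption that the operands already sit in the low 64 bits of an __m128i (vaddl_s8_sketch is an illustrative name, not the library's function):

    #include <smmintrin.h>   /* SSE4.1: _mm_cvtepi8_epi16 */

    /* Sketch only: widen eight int8 lanes (held in the low 64 bits of each input)
     * to int16 and add without overflow, as the vaddl_s8 matches above do. */
    static __m128i vaddl_s8_sketch(__m128i a_low64, __m128i b_low64)
    {
        __m128i a16 = _mm_cvtepi8_epi16(a_low64);   /* sign-extend 8 x int8 -> 8 x int16 */
        __m128i b16 = _mm_cvtepi8_epi16(b_low64);
        return _mm_add_epi16(a16, b16);             /* full-width int16x8 result */
    }
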
2988 return64(vhaddq_s8(_pM128i(a), _pM128i(b))); in vhadd_s8()
2996 return64( vhaddq_s16(_pM128i(a), _pM128i(b))); in vhadd_s16()
3004 return64( vhaddq_s32(_pM128i(a), _pM128i(b))); in vhadd_s32()
3012 return64( vhaddq_u8(_pM128i(a), _pM128i(b))); in vhadd_u8()
3020 return64( vhaddq_u16(_pM128i(a), _pM128i(b))); in vhadd_u16()
3028 return64( vhaddq_u32(_pM128i(a), _pM128i(b))); in vhadd_u32()
3104 return64(vrhaddq_s8(_pM128i(a), _pM128i(b))); in vrhadd_s8()
3112 return64(vrhaddq_s16(_pM128i(a), _pM128i(b))); in vrhadd_s16()
3120 return64(vrhaddq_s32(_pM128i(a), _pM128i(b))); in vrhadd_s32()
3128 return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!! in vrhadd_u8()
3136 return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!! in vrhadd_u16()
3144 return64(vrhaddq_u32(_pM128i(a), _pM128i(b))); in vrhadd_u32()
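
The unsigned rounding-halving adds above (vrhadd_u8 / vrhadd_u16) map directly onto the SSE averaging instructions because _mm_avg_epu8 / _mm_avg_epu16 compute (a + b + 1) >> 1 per lane, which is exactly NEON's vrhadd rounding; the "result rounding!!!" comments flag that this rounding is intentional. A scalar reference for a single uint8 lane, for comparison only:

    #include <stdint.h>

    /* Rounding-halving add for one uint8 lane: (a + b + 1) >> 1,
     * the same result _mm_avg_epu8 produces per lane. */
    static inline uint8_t rhadd_u8_ref(uint8_t a, uint8_t b)
    {
        return (uint8_t)(((uint16_t)a + (uint16_t)b + 1u) >> 1);
    }
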
3213 return64(_mm_adds_epi8(_pM128i(a),_pM128i(b))); in vqadd_s8()
3221 return64(_mm_adds_epi16(_pM128i(a),_pM128i(b))); in vqadd_s16()
3229 return64(vqaddq_s32(_pM128i(a), _pM128i(b))); in vqadd_s32()
3252 return64(_mm_adds_epu8(_pM128i(a),_pM128i(b))); in vqadd_u8()
3260 return64(_mm_adds_epu16(_pM128i(a),_pM128i(b))); in vqadd_u16()
3268 return64(vqaddq_u32(_pM128i(a), _pM128i(b))); in vqadd_u32()
3528 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits in vmul_s8()
3529 b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits in vmul_s8()
3558 a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); in vmul_u8()
3559 b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); in vmul_u8()
3570 return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b))); in vmul_u16()
3589 a64 = _pM128i(a); in vmul_p8()
3590 b64 = _pM128i(b); in vmul_p8()
3689 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 in vmull_s8()
3690 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 in vmull_s8()
3699 a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1 in vmull_s16()
3700 b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1 in vmull_s16()
3704 a128 = _pM128i(a); in vmull_s16()
3705 b128 = _pM128i(b); in vmull_s16()
3716 a128 = _pM128i(a); in vmull_s32()
3717 b128 = _pM128i(b); in vmull_s32()
3728 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 in vmull_u8()
3729 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 in vmull_u8()
3738 a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1 in vmull_u16()
3739 b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1 in vmull_u16()
3743 a128 = _pM128i(a); in vmull_u16()
3744 b128 = _pM128i(b); in vmull_u16()
3756 a128 = _pM128i(a); in vmull_u32()
3757 b128 = _pM128i(b); in vmull_u32()
3769 a128 = _pM128i(a); in vmull_p8()
3770 b128 = _pM128i(b); in vmull_p8()
3816 b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits in vmla_s8()
3817 c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits in vmla_s8()
3820 res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits in vmla_s8()
3828 return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c))); in vmla_s16()
3837 res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1 in vmla_s32()
3838 res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits in vmla_s32()
3861 b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits in vmla_u8()
3862 c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits in vmla_u8()
3866 res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits in vmla_u8()
4019 res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64)); in vmls_s8()
4027 return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c))); in vmls_s16()
4036 res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1 in vmls_s32()
4037 res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only in vmls_s32()
4059 res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64)); in vmls_u8()
4277 return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b))); in vqrdmulh_s16()
4367 return64(_mm_sub_epi8(_pM128i(a),_pM128i(b))); in vsub_s8()
4375 return64(_mm_sub_epi16(_pM128i(a),_pM128i(b))); in vsub_s16()
4383 return64(_mm_sub_epi32(_pM128i(a),_pM128i(b))); in vsub_s32()
4458 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, in vsubl_s8()
4459 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, in vsubl_s8()
4467 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 in vsubl_s16()
4468 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, in vsubl_s16()
4477 a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1 in vsubl_s32()
4478 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1, in vsubl_s32()
4486 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1, in vsubl_u8()
4487 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, in vsubl_u8()
4495 a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1 in vsubl_u16()
4496 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1, in vsubl_u16()
4505 a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1 in vsubl_u32()
4506 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1, in vsubl_u32()
4516 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, in vsubw_s8()
4524 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, in vsubw_s16()
4532 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 in vsubw_s32()
4540 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, in vsubw_u8()
4548 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1, in vsubw_u16()
4556 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 in vsubw_u32()
4566 return64(_mm_subs_epi8(_pM128i(a),_pM128i(b))); in vqsub_s8()
4574 return64(_mm_subs_epi16(_pM128i(a),_pM128i(b))); in vqsub_s16()
4582 return64(vqsubq_s32(_pM128i(a), _pM128i(b))); in vqsub_s32()
4606 return64(_mm_subs_epu8(_pM128i(a),_pM128i(b))); in vqsub_u8()
4614 return64(_mm_subs_epu16(_pM128i(a),_pM128i(b))); in vqsub_u16()
4622 return64(vqsubq_u32(_pM128i(a), _pM128i(b))); in vqsub_u32()
4733 r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1 in vhsub_s8()
4743 return64(vhsubq_s16(_pM128i(a), _pM128i(b))); in vhsub_s16()
4752 return64(vhsubq_s32(_pM128i(a), _pM128i(b))); in vhsub_s32()
4760 return64(vhsubq_u8(_pM128i(a), _pM128i(b))); in vhsub_u8()
4767 return64(vhsubq_u16(_pM128i(a), _pM128i(b))); in vhsub_u16()
4774 return64(vhsubq_u32(_pM128i(a), _pM128i(b))); in vhsub_u32()
5023 return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b))); in vceq_s8()
5031 return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b))); in vceq_s16()
5039 return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b))); in vceq_s32()
5056 return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b))); in vceq_u8()
5064 return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b))); in vceq_u16()
5072 return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b))); in vceq_u32()
5118 return64(vcgeq_s8(_pM128i(a), _pM128i(b))); in vcge_s8()
5126 return64(vcgeq_s16(_pM128i(a), _pM128i(b))); in vcge_s16()
5134 return64(vcgeq_s32(_pM128i(a), _pM128i(b))); in vcge_s32()
5151 return64(vcgeq_u8(_pM128i(a), _pM128i(b))); in vcge_u8()
5159 return64(vcgeq_u16(_pM128i(a), _pM128i(b))); in vcge_u16()
5168 return64(vcgeq_u32 (_pM128i(a), _pM128i(b))); in vcge_u32()
5263 return64(vcleq_s8(_pM128i(a), _pM128i(b))); in vcle_s8()
5271 return64(vcleq_s16(_pM128i(a), _pM128i(b))); in vcle_s16()
5279 return64(vcleq_s32(_pM128i(a), _pM128i(b))); in vcle_s32()
5387 return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b))); in vcgt_s8()
5395 return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b))); in vcgt_s16()
5403 return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b))); in vcgt_s32()
5420 return64(vcgtq_u8(_pM128i(a), _pM128i(b))); in vcgt_u8()
5428 return64(vcgtq_u16(_pM128i(a), _pM128i(b))); in vcgt_u16()
5436 return64(vcgtq_u32(_pM128i(a), _pM128i(b))); in vcgt_u32()
5654 return64(vtstq_s8(_pM128i(a), _pM128i(b))); in vtst_s8()
5662 return64(vtstq_s16(_pM128i(a), _pM128i(b))); in vtst_s16()
5670 return64(vtstq_s32(_pM128i(a), _pM128i(b))); in vtst_s32()
5739 return64(vabdq_s8(_pM128i(a), _pM128i(b))); in vabd_s8()
5746 return64(vabdq_s16(_pM128i(a), _pM128i(b))); in vabd_s16()
5762 return64(vabdq_u8(_pM128i(a), _pM128i(b))); in vabd_u8()
5769 return64(vabdq_u16(_pM128i(a), _pM128i(b))); in vabd_u16()
5776 return64(vabdq_u32(_pM128i(a), _pM128i(b))); in vabd_u32()
5871 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, in vabdl_s8()
5872 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, in vabdl_s8()
5881 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 in vabdl_s16()
5882 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, in vabdl_s16()
5931 return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c))); in vaba_s8()
5938 return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c))); in vaba_s16()
5945 return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c))); in vaba_s32()
5952 return64(vabaq_u8(_pM128i(a),_pM128i(b), _pM128i(c))); in vaba_u8()
5960 return64(vabaq_u16(_pM128i(a), _pM128i(b), _pM128i(c))); in vaba_u16()
5967 return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c))); in vaba_u32()
6024 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, in vabal_s8()
6025 c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1, in vabal_s8()
6034 b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1 in vabal_s16()
6035 c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1 in vabal_s16()
6052 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, in vabal_u8()
6053 c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1, in vabal_u8()
6062 b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1 in vabal_u16()
6063 c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1 in vabal_u16()
6086 res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits in vmax_s8()
6094 return64(_mm_max_epi16(_pM128i(a),_pM128i(b))); in vmax_s16()
6102 res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits in vmax_s32()
6110 return64(_mm_max_epu8(_pM128i(a),_pM128i(b))); in vmax_u8()
6118 return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b))); in vmax_u16()
6127 …res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective… in vmax_u32()
6175 res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits in vmin_s8()
6183 return64(_mm_min_epi16(_pM128i(a),_pM128i(b))); in vmin_s16()
6192 res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits in vmin_s32()
6200 return64(_mm_min_epu8(_pM128i(a),_pM128i(b))); in vmin_u8()
6208 return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b))); in vmin_u16()
6217 …res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective… in vmin_u32()
6266 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 in vpadd_s8()
6267 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 in vpadd_s8()
6278 hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b)); in vpadd_s16()
6289 hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b)); in vpadd_s32()
6303 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 in vpadd_u8()
6304 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 in vpadd_u8()
6320 as = _mm_sub_epi16 (_pM128i(a), c32767); in vpadd_u16()
6321 bs = _mm_sub_epi16 (_pM128i(b), c32767); in vpadd_u16()
6334 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1 in vpadd_u32()
6364 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 in vpaddl_s8()
6375 r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a)); in vpaddl_s16()
6395 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits in vpaddl_u8()
6498 return64(vpadalq_s8(_pM128i(a), _pM128i(b))); in vpadal_s8()
6505 return64(vpadalq_s16(_pM128i(a), _pM128i(b))); in vpadal_s16()
6521 return64(vpadalq_u8(_pM128i(a), _pM128i(b))); in vpadal_u8()
6529 return64(vpadalq_u16(_pM128i(a), _pM128i(b))); in vpadal_u16()
6601 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab in vpmax_s8()
6615 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab in vpmax_s16()
6639 ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab in vpmax_u8()
6653 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab in vpmax_u16()
6690 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab in vpmin_s8()
6704 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab in vpmin_s16()
6728 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab in vpmin_u8()
6742 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab in vpmin_u16()
7545 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7555 return64(_mm_srai_epi16(_pM128i(a), b));
7563 return64(_mm_srai_epi32(_pM128i(a), b));
7582 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7592 return64(_mm_srli_epi16(_pM128i(a), b));
7600 return64(_mm_srli_epi32(_pM128i(a), b));
7608 return64(_mm_srli_epi64(_pM128i(a), b));
7676 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7686 return64(_mm_slli_epi16(_pM128i(a), b));
7694 return64(_mm_slli_epi32(_pM128i(a), b));
7702 return64(_mm_slli_epi64(_pM128i(a), b));
7714 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7772 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
7785 return64(vrshrq_n_s16(_pM128i(a), b));
7793 return64(vrshrq_n_s32(_pM128i(a), b));
7818 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
7831 return64(vrshrq_n_u16(_pM128i(a), b));
7839 return64(vrshrq_n_u32(_pM128i(a), b));
7847 return64(vrshrq_n_u64(_pM128i(a), b));
8215 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8227 a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8238 return64(vqshlq_n_s32 (_pM128i(a), b));
8265 a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
8277 a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
8287 return64(vqshlq_n_u32(_pM128i(a), b));
8436 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8447 a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
8457 return64( vqshluq_n_s32(_pM128i(a), b));
8919 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
8927 r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
8935 r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
8945 r = _mm_unpacklo_epi8(_pM128i(a), zero);
8955 r = _mm_unpacklo_epi16(_pM128i(a), zero);
8965 r = _mm_unpacklo_epi32(_pM128i(a), zero);
8982 return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
8990 return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
8998 return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
9107 return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
9115 return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
9123 return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
11138 v0 = _mm_unpacklo_epi8(_pM128i(val.val[0]), _pM128i(val.val[1]));
11146 v0 = _mm_unpacklo_epi16(_pM128i(val.val[0]), _pM128i(val.val[1]));
11154 v0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1]));
11338 tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]) );
11340 val2 = _pM128i(val.val[2]);
11358 tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]));
11360 val2 = _pM128i(val.val[2]);
11375 val0 = _mm_unpacklo_epi64(_pM128i(val.val[1]), _pM128i(val.val[2])); //val[0]: 1,4,2,5
11379 val0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), val0); //0,1,3,2
11535 …sh0 = _mm_unpacklo_epi8(_pM128i(val.val[0]),_pM128i(val.val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4…
11536 …sh1 = _mm_unpacklo_epi8(_pM128i(val.val[2]),_pM128i(val.val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d…
11547 sh0 = _mm_unpacklo_epi16(_pM128i(val.val[0]),_pM128i(val.val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
11548 sh1 = _mm_unpacklo_epi16(_pM128i(val.val[2]),_pM128i(val.val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
11560 sh0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1])); //0,1,4,5
11561 sh1 = _mm_unpacklo_epi32(_pM128i(val.val[2]), _pM128i(val.val[3])); //2,3,6,7
12448 return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
12459 return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
12467 return _mm_shuffle_epi32 (_pM128i(vec), (1 | (1 << 2) | (1 << 4) | (1 << 6)) );
12469 return _mm_shuffle_epi32 (_pM128i(vec), 0);
12500 vec128 = _pM128i(vec);
12514 return _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ); in vcombine_s8()
12533 res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) ); in vcombine_f32()
12930 return _MM_CVTEPI8_EPI16(_pM128i(a)); //SSE4.1 in vmovl_s8()
12936 return _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1 in vmovl_s16()
12942 return _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1 in vmovl_s32()
12948 return _MM_CVTEPU8_EPI16(_pM128i(a)); //SSE4.1 in vmovl_u8()
12954 return _MM_CVTEPU16_EPI32(_pM128i(a)); //SSE4.1 in vmovl_u16()
12960 return _MM_CVTEPU32_EPI64(_pM128i(a)); //SSE4.1 in vmovl_u32()
13103 b128 = _pM128i(b); in vtbl1_u8()
13106 bmask = _mm_shuffle_epi8(_pM128i(a),bmask); in vtbl1_u8()
13122 b128 = _pM128i(b); in vtbl2_u8()
13125 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]), _pM128i(a.val[1])); in vtbl2_u8()
13144 b128 = _pM128i(b); in vtbl3_u8()
13148 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1])); in vtbl3_u8()
13150 sh1 = _mm_shuffle_epi8(_pM128i(a.val[2]), bmask); //for bi>15 bi is wrapped (bi-=15) in vtbl3_u8()
13169 b128 = _pM128i(b); in vtbl4_u8()
13173 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1])); in vtbl4_u8()
13174 a23 = _mm_unpacklo_epi64(_pM128i(a.val[2]),_pM128i(a.val[3])); in vtbl4_u8()
13198 c128 = _pM128i(c); in vtbx1_u8()
13200 c7 = _mm_and_si128(maskgt,_pM128i(a)); in vtbx1_u8()
13201 sh = _mm_shuffle_epi8(_pM128i(b),c128); in vtbx1_u8()
13219 c128 = _pM128i(c); in vtbx2_u8()
13221 c15 = _mm_and_si128(maskgt15, _pM128i(a)); in vtbx2_u8()
13222 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]), _pM128i(b.val[1])); in vtbx2_u8()
13243 c128 = _pM128i(c); in vtbx3_u8()
13246 c23 = _mm_and_si128(maskgt23, _pM128i(a)); in vtbx3_u8()
13247 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1])); in vtbx3_u8()
13249 sh1 = _mm_shuffle_epi8(_pM128i(b.val[2]), c128); //for bi>15 bi is wrapped (bi-=15) in vtbx3_u8()
13270 c128 = _pM128i(c); in vtbx4_u8()
13273 c31 = _mm_and_si128(maskgt31, _pM128i(a)); in vtbx4_u8()
13275 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1])); in vtbx4_u8()
13276 b23 = _mm_unpacklo_epi64(_pM128i(b.val[2]),_pM128i(b.val[3])); in vtbx4_u8()
13874 return64(vqdmulhq_n_s16(_pM128i(vec1), val2)); in vqdmulh_n_s16()
13881 return64(vqdmulhq_n_s32(_pM128i(vec1), val2)); in vqdmulh_n_s32()
13940 vlane = vgetq_lane_s32(_pM128i(val2), val3);
14019 vlane = vgetq_lane_s32(_pM128i(val2), val3);
14396 res = vrev64q_s8(_pM128i(vec)); in vrev64_s8()
14405 res = vrev64q_s16(_pM128i(vec)); in vrev64_s16()
14488 res = vrev32q_s8(_pM128i(vec)); in vrev32_s8()
14497 res = vrev32q_s16(_pM128i(vec)); in vrev32_s16()
14546 res = vrev16q_s8(_pM128i(vec)); in vrev16_s8()
14580 res = _mm_abs_epi8(_pM128i(a)); in vabs_s8()
14590 res = _mm_abs_epi16(_pM128i(a)); in vabs_s16()
14599 res = _mm_abs_epi32(_pM128i(a)); in vabs_s32()
14654 res = vqabsq_s8(_pM128i(a)); in vqabs_s8()
14663 res = vqabsq_s16(_pM128i(a)); in vqabs_s16()
14672 res = vqabsq_s32(_pM128i(a)); in vqabs_s32()
14715 res = vnegq_s8(_pM128i(a)); in vneg_s8()
14724 res = vnegq_s16(_pM128i(a)); in vneg_s16()
14733 res = vnegq_s32(_pM128i(a)); in vneg_s32()
14787 res = vqnegq_s8(_pM128i(a)); in vqneg_s8()
14796 res = vqnegq_s16(_pM128i(a)); in vqneg_s16()
14805 res = vqnegq_s32(_pM128i(a)); in vqneg_s32()
14845 res = vclzq_s8(_pM128i(a)); in vclz_s8()
14854 res = vclzq_s16(_pM128i(a)); in vclz_s16()
14863 res = vclzq_s32(_pM128i(a)); in vclz_s32()
14976 res = vclsq_s8(_pM128i(a)); in vcls_s8()
14985 res = vclsq_s16(_pM128i(a)); in vcls_s16()
14994 res = vclsq_s32(_pM128i(a)); in vcls_s32()
15059 res = vcntq_u8(_pM128i(a)); in vcnt_u8()
15102 res = vmvnq_s8(_pM128i(a)); in vmvn_s8()
15111 res = vmvnq_s16(_pM128i(a)); in vmvn_s16()
15120 res = vmvnq_s32(_pM128i(a)); in vmvn_s32()
15178 return64(_mm_and_si128(_pM128i(a),_pM128i(b))); in vand_s8()
15185 return64(_mm_and_si128(_pM128i(a),_pM128i(b))); in vand_s16()
15192 return64(_mm_and_si128(_pM128i(a),_pM128i(b))); in vand_s32()
15247 return64(_mm_or_si128(_pM128i(a),_pM128i(b))); in vorr_s8()
15255 return64(_mm_or_si128(_pM128i(a),_pM128i(b))); in vorr_s16()
15263 return64(_mm_or_si128(_pM128i(a),_pM128i(b))); in vorr_s32()
15317 return64(_mm_xor_si128(_pM128i(a),_pM128i(b))); in veor_s8()
15377 return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap" in vbic_s8()
15437 return64(vornq_s8(_pM128i(a), _pM128i(b))); in vorn_s8()
15445 return64(vornq_s16(_pM128i(a), _pM128i(b))); in vorn_s16()
15453 return64(vornq_s32(_pM128i(a), _pM128i(b))); in vorn_s32()
15557 res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c)); in vbsl_s8()
15662 tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7 in vtrn_s8()
15674 tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3 in vtrn_s16()
15685 val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1 in vtrn_s32()
15792 val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); in vzip_s8()
15802 val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); in vzip_s16()
15889 tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7 in vuzp_s8()
15901 tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3 in vuzp_s16()
15912 val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1 in vuzp_s32()