/external/XNNPACK/src/qs8-vaddc/gen/ |
D | minmax-sse2-mul16-ld64-x8.c | 37 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8() local 40 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8() 42 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8() 43 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8() 45 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8() 47 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8() 71 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8() local 73 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8() 75 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8() 76 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8() [all …]
|
D | minmax-sse2-mul16-ld64-x16.c | 37 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16() local 41 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16() 44 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16() 45 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16() 49 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16() 52 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16() 86 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16() local 89 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16() 91 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16() 92 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16() [all …]
|
D | minmax-sse2-mul16-ld64-x24.c | 37 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24() local 42 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24() 46 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24() 47 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24() 53 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24() 57 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24() 103 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24() local 106 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24() 108 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24() 109 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24() [all …]
|
D | minmax-sse2-mul16-ld64-x32.c | 37 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32() local 43 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32() 48 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32() 49 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32() 57 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32() 62 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32() 118 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32() local 121 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32() 123 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32() 124 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x32() [all …]
|
D | minmax-sse41-mul16-ld64-x8.c | 37 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8() local 41 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8() 42 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8() 44 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8() 46 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8() 70 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8() local 73 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8() 74 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8() 76 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8() 78 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8()
|
D | minmax-sse41-mul16-ld64-x16.c | 37 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16() local 42 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16() 43 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16() 47 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16() 50 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16() 84 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16() local 88 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16() 89 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16() 91 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16() 93 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x16()
|
D | minmax-sse41-mul16-ld64-x24.c | 37 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24() local 43 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24() 44 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24() 50 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24() 54 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24() 100 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24() local 104 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24() 105 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24() 107 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24() 109 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x24()
|
D | minmax-wasmsimd-x8.c | 36 const v128_t vx01234567 = wasm_i16x8_load_8x8(input_x); in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8() local 39 …2x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8() 40 …x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8() 61 const v128_t vx01234567 = wasm_i16x8_load_8x8(input_x); in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8() local 63 …2x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8() 64 …x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8()
|
D | minmax-sse41-mul16-ld64-x32.c | 37 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32() local 44 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32() 45 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32() 53 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32() 58 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32() 114 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32() local 118 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32() 119 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32() 121 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32() 123 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x32()
|
D | minmax-wasmsimd-x16.c | 36 const v128_t vx01234567 = wasm_i16x8_load_8x8(input_x); in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x16() local 40 …2x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x16() 41 …x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x16() 69 const v128_t vx01234567 = wasm_i16x8_load_8x8(input_x); in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x16() local 72 …2x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x16() 73 …x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x16()
|
D | minmax-wasmsimd-x24.c | 36 const v128_t vx01234567 = wasm_i16x8_load_8x8(input_x); in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x24() local 41 …2x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x24() 42 …x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x24() 81 const v128_t vx01234567 = wasm_i16x8_load_8x8(input_x); in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x24() local 84 …2x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x24() 85 …x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x24()
|
/external/XNNPACK/src/qs8-vadd/gen/ |
D | minmax-sse2-mul16-ld64-x8.c | 37 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8() local 42 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8() 45 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8() 47 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8() 50 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8() 53 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8() 81 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8() local 84 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8() 87 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8() 89 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8() [all …]
|
D | minmax-sse2-mul16-ld64-x16.c | 37 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16() local 44 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16() 49 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16() 51 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16() 58 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16() 63 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16() 104 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16() local 109 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16() 112 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16() 114 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16() [all …]
|
D | minmax-sse2-mul16-ld64-x24.c | 37 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24() local 46 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24() 53 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24() 55 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24() 66 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24() 73 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24() 129 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24() local 134 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24() 137 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24() 139 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24() [all …]
|
D | minmax-sse41-mul16-ld64-x8.c | 37 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8() local 43 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8() 45 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8() 48 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8() 51 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8() 79 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8() local 83 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8() 85 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8() 88 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8() 91 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8()
|
D | minmax-sse2-mul16-ld64-x32.c | 37 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32() local 48 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32() 57 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32() 59 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32() 74 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32() 83 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32() 152 __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32() local 157 vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567)); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32() 160 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32() 162 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32() [all …]
|
D | minmax-sse41-mul16-ld64-x16.c | 37 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16() local 45 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16() 47 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16() 54 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16() 59 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16() 100 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16() local 106 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16() 108 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16() 111 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16() 114 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16()
|
D | minmax-wasmsimd-x8.c | 35 const v128_t vx01234567 = wasm_i16x8_load_8x8(input_x); in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8() local 40 …2x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8() 41 …x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8() 65 const v128_t vx01234567 = wasm_i16x8_load_8x8(input_x); in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8() local 68 …2x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8() 69 …x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8()
|
D | minmax-sse41-mul16-ld64-x24.c | 37 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24() local 47 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24() 49 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24() 60 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24() 67 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24() 123 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24() local 129 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24() 131 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24() 134 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24() 137 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24()
|
D | minmax-wasmsimd-x16.c | 35 const v128_t vx01234567 = wasm_i16x8_load_8x8(input_x); in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16() local 42 …2x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16() 43 …x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16() 76 const v128_t vx01234567 = wasm_i16x8_load_8x8(input_x); in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16() local 81 …2x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16() 82 …x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vx01234567), vx_multiplier)… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16()
|
D | minmax-sse41-mul16-ld64-x32.c | 37 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32() local 49 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32() 51 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32() 66 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32() 75 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32() 144 const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32() local 150 __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32() 152 const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32() 155 … vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32() 158 …vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), v… in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32()
|
/external/XNNPACK/src/f32-vlrelu/gen/ |
D | vlrelu-avx-x8.c | 31 const __m256 vx01234567 = _mm256_loadu_ps(x); in xnn_f32_vlrelu_ukernel__avx_x8() local 34 __m256 vacc01234567 = _mm256_mul_ps(vx01234567, vslope); in xnn_f32_vlrelu_ukernel__avx_x8() 36 vacc01234567 = _mm256_blendv_ps(vx01234567, vacc01234567, vx01234567); in xnn_f32_vlrelu_ukernel__avx_x8()
|
D | vlrelu-avx-x16.c | 31 const __m256 vx01234567 = _mm256_loadu_ps(x); in xnn_f32_vlrelu_ukernel__avx_x16() local 35 __m256 vacc01234567 = _mm256_mul_ps(vx01234567, vslope); in xnn_f32_vlrelu_ukernel__avx_x16() 38 vacc01234567 = _mm256_blendv_ps(vx01234567, vacc01234567, vx01234567); in xnn_f32_vlrelu_ukernel__avx_x16()
|
/external/XNNPACK/src/f16-hswish/gen/ |
D | hswish-neonfp16arith-x16.c | 36 float16x8_t vx01234567 = vld1q_f16(x); x += 8; in xnn_f16_hswish_ukernel__neonfp16arith_x16() local 39 float16x8_t vacc01234567 = vaddq_f16(vx01234567, vthree); in xnn_f16_hswish_ukernel__neonfp16arith_x16() 40 vx01234567 = vmulq_f16(vx01234567, vsixth); in xnn_f16_hswish_ukernel__neonfp16arith_x16() 50 vacc01234567 = vmulq_f16(vacc01234567, vx01234567); in xnn_f16_hswish_ukernel__neonfp16arith_x16()
|
/external/XNNPACK/src/f32-vunary/gen/ |
D | vsqr-avx-x8.c | 32 const __m256 vx01234567 = _mm256_loadu_ps(x); in xnn_f32_vsqr_ukernel__avx_x8() local 35 const __m256 vy01234567 = _mm256_mul_ps(vx01234567, vx01234567); in xnn_f32_vsqr_ukernel__avx_x8()
|