Home
last modified time | relevance | path

Searched refs:ABC (Results 1 – 25 of 548) sorted by relevance

12345678910>>...22

/external/XNNPACK/src/qs8-dwconv/
Dunipass-sse-mul16.c.in6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
46 __m128i vacc${ABC[0:4]} = _mm_loadu_si128((const __m128i*) w);
48 …__m128i vacc${ABC[C:C+4]} = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + ${C} * sizeof(int32_…
54 const __m128i vi${K}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${K});
56 const __m128i vi${K}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${K} + ${C}));
58 const __m128i vxi${K}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(vi${K}x${ABC[C:C+8]});
59 …const __m128i vk${K}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${CHANNEL_T…
61 const __m128i vxk${K}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(vk${K}x${ABC[C:C+8]});
66 …const __m128i vxi${K}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${K}x${ABC[C:C+8]}, _mm_cmpgt_epi8(_mm_s…
67 …const __m128i vxk${K}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vk${K}x${ABC[C:C+8]}, _mm_cmpgt_epi8(_mm_s…
[all …]
Dunipass-avx512skx-mul32.c.in6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
61 __m512i vacc${ABC[0:16]} = _mm512_loadu_si512(w);
63 …__m512i vacc${ABC[C:C+16]} = _mm512_loadu_si512((const void*) ((uintptr_t) w + ${C} * sizeof(int32…
69 … const __m512i vi${K}x${ABC[0:16]} = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i${K}));
71 …const __m512i vi${K}x${ABC[C:C+16]} = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i${K}…
72 …const __m512i vk${K}x${ABC[C:C+16]} = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintp…
76 …vacc${ABC[C:C+16]} = _mm512_add_epi32(vacc${ABC[C:C+16]}, _mm512_mullo_epi32(vi${K}x${ABC[C:C+16]}…
81 …const __m512i vacc${ABC[C+1:C+16:2]} = _mm512_shuffle_epi32(vacc${ABC[C:C+16]}, _MM_SHUFFLE(3, 3, …
84 …const __m512i vprod${ABC[C:C+16:2]} = _mm512_add_epi64(_mm512_mul_epi32(vacc${ABC[C:C+16]}, vmulti…
85 …const __m512i vprod${ABC[C+1:C+16:2]} = _mm512_add_epi64(_mm512_mul_epi32(vacc${ABC[C+1:C+16:2]}, …
[all …]
Dunipass-wasmsimd-mul16.c.in6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
44 v128_t vacc${ABC[0:4]} = wasm_v128_load(w);
46 … v128_t vacc${ABC[C:C+4]} = wasm_v128_load((const void*) ((uintptr_t) w + ${C} * sizeof(int32_t)));
52 const v128_t vi${K}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${K});
54 const v128_t vi${K}x${ABC[C:C+8]} = wasm_i16x8_load_8x8(i${K} + ${C});
55 …const v128_t vk${K}x${ABC[C:C+8]} = wasm_i16x8_load_8x8((const void*) ((uintptr_t) w + ${CHANNEL_T…
59 … const v128_t vprod${K}x${ABC[C:C+8]} = wasm_i16x8_mul(vi${K}x${ABC[C:C+8]}, vk${K}x${ABC[C:C+8]});
62 …vacc${ABC[C:C+4]} = wasm_i32x4_add(vacc${ABC[C:C+4]}, wasm_i32x4_widen_low_i16x8(vprod${K}x${ABC[C…
63 …vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vacc${ABC[C+4:C+8]}, wasm_i32x4_widen_high_i16x8(vprod${K}x${
68 const v128_t vsign${ABC[C:C+4]} = wasm_i32x4_shr(vacc${ABC[C:C+4]}, 31);
[all …]
Dunipass-avx2-mul16.c.in6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
44 __m256i vacc${ABC[0:8]} = _mm256_loadu_si256((const __m256i*) w);
46 …__m256i vacc${ABC[C:C+8]} = _mm256_loadu_si256((const __m256i*) ((uintptr_t) w + ${C} * sizeof(int…
52 … const __m256i vi${K}x${ABC[0:16]} = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i${K}));
54 …const __m256i vi${K}x${ABC[C:C+16]} = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (i${K}…
55 …const __m256i vk${K}x${ABC[C:C+16]} = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uint…
59 …const __m256i vprod${K}x${ABC[C:C+16]} = _mm256_mullo_epi16(vi${K}x${ABC[C:C+16]}, vk${K}x${ABC[C…
60 … const __m128i vprod${K}x${ABC[C+8:C+16]} = _mm256_extracti128_si256(vprod${K}x${ABC[C:C+16]}, 1);
61 …vacc${ABC[C:C+8]} = _mm256_add_epi32(vacc${ABC[C:C+8]}, _mm256_cvtepi16_epi32(_mm256_castsi256_si1…
62 …vacc${ABC[C+8:C+16]} = _mm256_add_epi32(vacc${ABC[C+8:C+16]}, _mm256_cvtepi16_epi32(vprod${K}x${AB…
[all …]
Dunipass-neon-mul16.c.in6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
55 …int32x4_t vacc${ABC[C:C+4]} = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)…
60 const int16x8_t vi${K}x${ABC[C:C+8]} = vmovl_s8(vld1_s8(i${K})); i${K} += 8;
61 …const int16x8_t vk${K}x${ABC[C:C+8]} = vmovl_s8(vld1_s8(w)); w = (const void*) ((uintptr_t) w + 8 …
64 …vacc${ABC[C:C+4]} = vmlal_s16(vacc${ABC[C:C+4]}, vget_low_s16(vi${K}x${ABC[C:C+8]}), vget_low_s16(…
65 …vacc${ABC[C+4:C+8]} = vmlal_s16(vacc${ABC[C+4:C+8]}, vget_high_s16(vi${K}x${ABC[C:C+8]}), vget_hig…
68 vacc${ABC[C:C+4]} = vqrdmulhq_s32(vacc${ABC[C:C+4]}, vmultiplier);
71 …vacc${ABC[C:C+4]} = vsraq_n_s32(vacc${ABC[C:C+4]}, vbicq_s32(vacc${ABC[C:C+4]}, vzero_shift_mask),…
74 vacc${ABC[C:C+4]} = vrshlq_s32(vacc${ABC[C:C+4]}, vright_shift);
78 …const int16x8_t vacc${ABC[C:C+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${ABC[C:C+4]}), vacc…
[all …]
Dunipass-avx2-mul32.c.in6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
44 __m256i vacc${ABC[0:8]} = _mm256_loadu_si256((const __m256i*) w);
46 …__m256i vacc${ABC[C:C+8]} = _mm256_loadu_si256((const __m256i*) ((uintptr_t) w + ${C} * sizeof(int…
52 … const __m256i vi${K}x${ABC[0:8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i${K}));
54 …const __m256i vi${K}x${ABC[C:C+8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i${K} …
55 …const __m256i vk${K}x${ABC[C:C+8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintp…
59 …vacc${ABC[C:C+8]} = _mm256_add_epi32(vacc${ABC[C:C+8]}, _mm256_mullo_epi32(vi${K}x${ABC[C:C+8]}, v…
67 …const __m256i vacc${ABC[C+1:C+8:2]} = _mm256_shuffle_epi32(vacc${ABC[C:C+8]}, _MM_SHUFFLE(3, 3, 1,…
70 …const __m256i vprod${ABC[C:C+8:2]} = _mm256_add_epi64(_mm256_mul_epi32(vacc${ABC[C:C+8]}, vmultipl…
71 …const __m256i vprod${ABC[C+1:C+8:2]} = _mm256_add_epi64(_mm256_mul_epi32(vacc${ABC[C+1:C+8:2]}, vm…
[all …]
/external/XNNPACK/src/qs8-vadd/
Dsse-mul16-ld64.c.in9 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
39 const __m128i vx${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x));
40 const __m128i vy${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_y));
42 …const __m128i vx${ABC[N:N+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_x + ${N}…
43 …const __m128i vy${ABC[N:N+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_y + ${N}…
45 __m128i vx${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_x);
46 __m128i vy${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_y);
48 __m128i vx${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_x + ${N}));
49 __m128i vy${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_y + ${N}));
55 …vx${ABC[N:N+8]} = _mm_unpacklo_epi8(vx${ABC[N:N+8]}, _mm_cmpgt_epi8(_mm_setzero_si128(), vx${ABC[N…
[all …]
Dwasmsimd.c.in8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
34 const v128_t vx${ABC[0:8]} = wasm_i16x8_load_8x8(input_x);
35 const v128_t vy${ABC[0:8]} = wasm_i16x8_load_8x8(input_y);
37 const v128_t vx${ABC[N:N+8]} = wasm_i16x8_load_8x8(input_x + ${N});
38 const v128_t vy${ABC[N:N+8]} = wasm_i16x8_load_8x8(input_y + ${N});
43 …v128_t vacc${ABC[N:N+4]} = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low…
44 …v128_t vacc${ABC[N+4:N+8]} = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_h…
47 …vacc${ABC[N:N+4]} = wasm_i32x4_add(vacc${ABC[N:N+4]}, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vy…
48 …vacc${ABC[N+4:N+8]} = wasm_i32x4_add(vacc${ABC[N+4:N+8]}, wasm_i32x4_mul(wasm_i32x4_widen_high_i16…
51 …const v128_t vrem${ABC[N:N+4]} = wasm_i32x4_add(wasm_v128_and(vacc${ABC[N:N+4]}, vremainder_mask),…
[all …]
Dsse-mul32-ld32.c.in8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
44 const __m128i vx${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(input_x));
45 const __m128i vy${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(input_y));
47 const __m128i vx${ABC[N:N+4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(input_x + ${N}));
48 const __m128i vy${ABC[N:N+4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(input_y + ${N}));
54 … __m128i vacc${ABC[N:N+4]} = _mm_macc_epi32(vx${ABC[N:N+4]}, vx_multiplier, vzero_point_product);
57 vacc${ABC[N:N+4]} = _mm_macc_epi32(vy${ABC[N:N+4]}, vy_multiplier, vacc${ABC[N:N+4]});
60 …__m128i vacc${ABC[N:N+4]} = _mm_add_epi32(vzero_point_product, _mm_mullo_epi32(vx${ABC[N:N+4]}, vx…
63 …vacc${ABC[N:N+4]} = _mm_add_epi32(vacc${ABC[N:N+4]}, _mm_mullo_epi32(vy${ABC[N:N+4]}, vy_multiplie…
66 …__m128i vrem${ABC[N:N+4]} = _mm_add_epi32(_mm_and_si128(vacc${ABC[N:N+4]}, vremainder_mask), _mm_c…
[all …]
Davx2-mul32-ld64.c.in8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
40 const __m256i vx${ABC[0:8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_x));
41 const __m256i vy${ABC[0:8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_y));
43 …const __m256i vx${ABC[N:N+8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_x + $…
44 …const __m256i vy${ABC[N:N+8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_y + $…
49 …__m256i vacc${ABC[N:N+8]} = _mm256_add_epi32(vzero_point_product, _mm256_mullo_epi32(vx${ABC[N:N+8…
52 …vacc${ABC[N:N+8]} = _mm256_add_epi32(vacc${ABC[N:N+8]}, _mm256_mullo_epi32(vy${ABC[N:N+8]}, vy_mul…
55 …const __m256i vrem${ABC[N:N+8]} = _mm256_add_epi32(_mm256_and_si256(vacc${ABC[N:N+8]}, vremainder_…
58 …vacc${ABC[N:N+8]} = _mm256_sub_epi32(_mm256_sra_epi32(vacc${ABC[N:N+8]}, vshift), _mm256_cmpgt_epi…
62 …_m256i vout${ABC[N:N+4]}${ABC[N+8:N+12]}${ABC[N+4:N+8]}${ABC[N+12:N+16]} = _mm256_adds_epi16(_mm25…
[all …]
Dneon-ld64.c.in8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
35 const int8x8_t vx${ABC[N:N+8]} = vld1_s8(input_x); input_x += 8;
36 const int8x8_t vy${ABC[N:N+8]} = vld1_s8(input_y); input_y += 8;
39 const int16x8_t vex${ABC[N:N+8]} = vsubl_s8(vx${ABC[N:N+8]}, vx_zero_point);
40 const int16x8_t vey${ABC[N:N+8]} = vsubl_s8(vy${ABC[N:N+8]}, vy_zero_point);
43 … int32x4_t vacc${ABC[N:N+4]} = vmulq_s32(vmovl_s16(vget_low_s16(vex${ABC[N:N+8]})), vx_multiplier);
44 …int32x4_t vacc${ABC[N+4:N+8]} = vmulq_s32(vmovl_s16(vget_high_s16(vex${ABC[N:N+8]})), vx_multiplie…
47 …vacc${ABC[N:N+4]} = vmlaq_s32(vacc${ABC[N:N+4]}, vmovl_s16(vget_low_s16(vey${ABC[N:N+8]})), vy_mul…
48 …vacc${ABC[N+4:N+8]} = vmlaq_s32(vacc${ABC[N+4:N+8]}, vmovl_s16(vget_high_s16(vey${ABC[N:N+8]})), v…
51 …vacc${ABC[N:N+4]} = vsraq_n_s32(vacc${ABC[N:N+4]}, vbicq_s32(vacc${ABC[N:N+4]}, vzero_shift_mask),…
[all …]
/external/XNNPACK/src/qs8-gavgpool/
Dunipass-sse.c.in12 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
53 … const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
55 …const __m128i vxi${M}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i${M} + …
57 const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
59 const __m128i vi${M}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${M} + ${C}));
65 …const __m128i vxi${M}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M}x${ABC[C:C+8]}, _mm_cmpgt_epi8(_mm_s…
69 …__m128i vacc${A}x${ABC[C:C+8]} = _mm_add_epi16(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]});
73 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = _mm_add_epi16(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi${…
76 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
82 …vacc${A}x${ABC[C:C+8]} = _mm_add_epi16(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
[all …]
Dunipass-neon.c.in11 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
61 const int8x8_t vi${M}x${ABC[C:C+8]} = vld1_s8(i${M}); i${M} += 8;
65 … int16x8_t vacc${A}x${ABC[C:C+8]} = vaddl_s8(vi${A*2}x${ABC[C:C+8]}, vi${A*2+1}x${ABC[C:C+8]});
69 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = vaddw_s8(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vi${M}x${A…
72 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
78 … vacc${A}x${ABC[C:C+8]} = vaddq_s16(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
82 int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vbias, vget_low_s16(vacc0x${ABC[C:C+8]}));
83 int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vbias, vget_high_s16(vacc0x${ABC[C:C+8]}));
86 …const int32x4_t vsgnacc${ABC[C:C+4]} = vreinterpretq_s32_u32(vcltq_s32(vacc${ABC[C:C+4]}, vmovq_n_…
90 …const int64x2_t vprod${ABC[C:C+2]} = vmull_s32(vget_low_s32(vacc${ABC[C:C+4]}), vget_low_s32(vmult…
[all …]
Dmultipass-sse.c.in15 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
52 … const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M}));
54 …const __m128i vxi${M}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i${M} + …
56 const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M});
58 const __m128i vi${M}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${M} + ${C}));
64 …const __m128i vxi${M}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M}x${ABC[C:C+8]}, _mm_cmpgt_epi8(_mm_s…
68 …__m128i vacc${A}x${ABC[C:C+8]} = _mm_add_epi16(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]});
72 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = _mm_add_epi16(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi${…
75 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
81 …vacc${A}x${ABC[C:C+8]} = _mm_add_epi16(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
[all …]
Dunipass-wasmsimd.c.in11 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
51 const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
53 const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load_8x8(i${M} + ${C});
58 …v128_t vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]});
62 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi$…
65 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
71 …vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]}…
75 …const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_low_i16x8(vacc0x${ABC[C:C+…
76 …const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_high_i16x8(vacc0x${ABC[C…
79 const v128_t vabsacc${ABC[C:C+4]} = wasm_i32x4_abs(vacc${ABC[C:C+4]});
[all …]
Dmultipass-neon.c.in14 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
50 const int8x8_t vi${M}x${ABC[C:C+8]} = vld1_s8(i${M}); i${M} += 8;
54 … int16x8_t vacc${A}x${ABC[C:C+8]} = vaddl_s8(vi${A*2}x${ABC[C:C+8]}, vi${A*2+1}x${ABC[C:C+8]});
58 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = vaddw_s8(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vi${M}x${A…
61 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
67 … vacc${A}x${ABC[C:C+8]} = vaddq_s16(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
71 const int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vbias, vget_low_s16(vacc0x${ABC[C:C+8]}));
72 const int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vbias, vget_high_s16(vacc0x${ABC[C:C+8]}));
75 vst1q_s32(b, vacc${ABC[C:C+4]}); b += 4;
81 const int8x8_t vi${M}x${ABC[0:8]} = vld1_s8(i${M}); i${M} += 8;
[all …]
Dmultipass-wasmsimd.c.in14 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
49 const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
51 const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load_8x8(i${M} + ${C});
56 …v128_t vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]});
60 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi$…
63 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
69 …vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]}…
73 …const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_low_i16x8(vacc0x${ABC[C:C+…
74 …const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_high_i16x8(vacc0x${ABC[C…
76 wasm_v128_store(b, vacc${ABC[0:4]});
[all …]
/external/XNNPACK/src/qs8-vaddc/
Dwasmsimd.c.in8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
35 const v128_t vx${ABC[0:8]} = wasm_i16x8_load_8x8(input_x);
37 const v128_t vx${ABC[N:N+8]} = wasm_i16x8_load_8x8(input_x + ${N});
41 …v128_t vacc${ABC[N:N+4]} = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low…
42 …v128_t vacc${ABC[N+4:N+8]} = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_h…
45 …const v128_t vrem${ABC[N:N+4]} = wasm_i32x4_add(wasm_v128_and(vacc${ABC[N:N+4]}, vremainder_mask),…
48 …vacc${ABC[N:N+4]} = wasm_i32x4_sub(wasm_i32x4_shr(vacc${ABC[N:N+4]}, vshift), wasm_i32x4_gt(vrem${
51 …v128_t vout${ABC[N:N+8]} = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc${ABC[N:N+4]}, vacc…
55 … v128_t vout${ABC[N:N+16]} = wasm_i8x16_narrow_i16x8(vout${ABC[N:N+8]}, vout${ABC[N+8:N+16]});
57 …v128_t vout${ABC[N:N+8]}${ABC[N:N+8]} = wasm_i8x16_narrow_i16x8(vout${ABC[N:N+8]}, vout${ABC[N:N+8…
[all …]
Dsse-mul16-ld64.c.in9 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
39 const __m128i vx${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x));
41 …const __m128i vx${ABC[N:N+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_x + ${N}…
43 __m128i vx${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_x);
45 __m128i vx${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_x + ${N}));
50 …vx${ABC[N:N+8]} = _mm_unpacklo_epi8(vx${ABC[N:N+8]}, _mm_cmpgt_epi8(_mm_setzero_si128(), vx${ABC[N…
53 __m128i vxprod${ABC[N:N+8]}hi = _mm_mulhi_epu16(vx${ABC[N:N+8]}, vx_multiplier_lo);
54 const __m128i vxprod${ABC[N:N+8]}lo = _mm_mullo_epi16(vx${ABC[N:N+8]}, vx_multiplier_lo);
57 …vxprod${ABC[N:N+8]}hi = _mm_add_epi16(vxprod${ABC[N:N+8]}hi, _mm_mullo_epi16(vx${ABC[N:N+8]}, vx_m…
60 …vxprod${ABC[N:N+8]}hi = _mm_sub_epi16(vxprod${ABC[N:N+8]}hi, _mm_and_si128(_mm_srai_epi16(vx${ABC[…
[all …]
Dsse-mul32-ld32.c.in8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
45 const __m128i vx${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(input_x));
47 const __m128i vx${ABC[N:N+4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(input_x + ${N}));
53 … __m128i vacc${ABC[N:N+4]} = _mm_macc_epi32(vx${ABC[N:N+4]}, vx_multiplier, vzero_point_product);
56 …__m128i vacc${ABC[N:N+4]} = _mm_add_epi32(vzero_point_product, _mm_mullo_epi32(vx${ABC[N:N+4]}, vx…
59 …__m128i vrem${ABC[N:N+4]} = _mm_add_epi32(_mm_and_si128(vacc${ABC[N:N+4]}, vremainder_mask), _mm_c…
62 …vacc${ABC[N:N+4]} = _mm_sub_epi32(_mm_sra_epi32(vacc${ABC[N:N+4]}, vshift), _mm_cmpgt_epi32(vrem${
65 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[N:N+4]}, vacc${ABC[N+4:N+8]})…
68 vout${ABC[N:N+8]} = _mm_max_epi16(vout${ABC[N:N+8]}, voutput_min);
71 vout${ABC[N:N+8]} = _mm_min_epi16(vout${ABC[N:N+8]}, voutput_max);
[all …]
Davx2-mul32-ld64.c.in8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
41 const __m256i vx${ABC[0:8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_x));
43 …const __m256i vx${ABC[N:N+8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_x + $…
47 …__m256i vacc${ABC[N:N+8]} = _mm256_add_epi32(vzero_point_product, _mm256_mullo_epi32(vx${ABC[N:N+8…
50 …const __m256i vrem${ABC[N:N+8]} = _mm256_add_epi32(_mm256_and_si256(vacc${ABC[N:N+8]}, vremainder_…
53 …vacc${ABC[N:N+8]} = _mm256_sub_epi32(_mm256_sra_epi32(vacc${ABC[N:N+8]}, vshift), _mm256_cmpgt_epi…
57 …_m256i vout${ABC[N:N+4]}${ABC[N+8:N+12]}${ABC[N+4:N+8]}${ABC[N+12:N+16]} = _mm256_adds_epi16(_mm25…
59 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[N:N+8]…
61 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[N:N+8]…
65 …${ABC[N:N+4]}${ABC[N+8:N+12]}${ABC[N+4:N+8]}${ABC[N+12:N+16]} = _mm256_min_epi16(_mm256_max_epi16(…
[all …]
Dneon-ld64.c.in8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
40 const int8x8_t vx${ABC[N:N+8]} = vld1_s8(input_x); input_x += 8;
43 const int16x8_t vex${ABC[N:N+8]} = vsubl_s8(vx${ABC[N:N+8]}, vx_zero_point);
46 …int32x4_t vacc${ABC[N:N+4]} = vmlaq_s32(vy_bias, vmovl_s16(vget_low_s16(vex${ABC[N:N+8]})), vx_mul…
47 …int32x4_t vacc${ABC[N+4:N+8]} = vmlaq_s32(vy_bias, vmovl_s16(vget_high_s16(vex${ABC[N:N+8]})), vx_…
50 …vacc${ABC[N:N+4]} = vsraq_n_s32(vacc${ABC[N:N+4]}, vbicq_s32(vacc${ABC[N:N+4]}, vzero_shift_mask),…
53 vacc${ABC[N:N+4]} = vrshlq_s32(vacc${ABC[N:N+4]}, vright_shift);
56 …const int16x8_t vacc${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${ABC[N:N+4]}), vqmovn_…
60 …int8x16_t vout${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${ABC[N:N+8]}), vqmovn_s16(vacc${ABC[N+8…
62 int8x8_t vout${ABC[N:N+8]} = vqmovn_s16(vacc${ABC[N:N+8]});
[all …]
/external/XNNPACK/src/f32-conv-hwc/
D3x3s2p1c3-neon-x2.c.in8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
90 float32x4_t vo0x0c${ABC[0:4]} = vld1q_f32(w);
92 float32x4_t vo0x0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
95 float32x4_t vo${Y}x0c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]};
98 float32x4_t vo${Y}x1c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]};
101 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
109 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_l…
113 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_h…
116 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
120 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_l…
[all …]
D3x3s2p0p1c3-neon-x2.c.in8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
90 float32x4_t vo0x0c${ABC[0:4]} = vld1q_f32(w);
92 float32x4_t vo0x0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
95 float32x4_t vo${Y}x0c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]};
98 float32x4_t vo${Y}x1c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]};
101 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
109 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_l…
113 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_h…
116 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
120 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_l…
[all …]
/external/XNNPACK/src/f32-vsqrt/
Dneonfma-nr2fma1adj.c.in8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
31 const float32x4_t vx${ABC[N:N+4]} = vld1q_f32(x); x += 4;
34 const float32x4_t vrsqrtx${ABC[N:N+4]} = vrsqrteq_f32(vx${ABC[N:N+4]});
37 float32x4_t vsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vx${ABC[N:N+4]});
38 float32x4_t vhalfrsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vhalf);
41 …float32x4_t vresidual${ABC[N:N+4]} = vfmsq_f32(vhalf, vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]…
44 …vhalfrsqrtx${ABC[N:N+4]} = vfmaq_f32(vhalfrsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vhalfrsqrtx…
45 … vsqrtx${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]});
48 vresidual${ABC[N:N+4]} = vfmsq_f32(vhalf, vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]});
51 …vhalfrsqrtx${ABC[N:N+4]} = vfmaq_f32(vhalfrsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vhalfrsqrtx…
[all …]

12345678910>>...22