/external/XNNPACK/src/qs8-dwconv/ |
D | unipass-sse-mul16.c.in | 6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 46 __m128i vacc${ABC[0:4]} = _mm_loadu_si128((const __m128i*) w); 48 …__m128i vacc${ABC[C:C+4]} = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + ${C} * sizeof(int32_… 54 const __m128i vi${K}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${K}); 56 const __m128i vi${K}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${K} + ${C})); 58 const __m128i vxi${K}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(vi${K}x${ABC[C:C+8]}); 59 …const __m128i vk${K}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${CHANNEL_T… 61 const __m128i vxk${K}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(vk${K}x${ABC[C:C+8]}); 66 …const __m128i vxi${K}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${K}x${ABC[C:C+8]}, _mm_cmpgt_epi8(_mm_s… 67 …const __m128i vxk${K}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vk${K}x${ABC[C:C+8]}, _mm_cmpgt_epi8(_mm_s… [all …]
|
D | unipass-avx512skx-mul32.c.in | 6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 61 __m512i vacc${ABC[0:16]} = _mm512_loadu_si512(w); 63 …__m512i vacc${ABC[C:C+16]} = _mm512_loadu_si512((const void*) ((uintptr_t) w + ${C} * sizeof(int32… 69 … const __m512i vi${K}x${ABC[0:16]} = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) i${K})); 71 …const __m512i vi${K}x${ABC[C:C+16]} = _mm512_cvtepi8_epi32(_mm_loadu_si128((const __m128i*) (i${K}… 72 …const __m512i vk${K}x${ABC[C:C+16]} = _mm512_cvtepi8_epi32(_mm_load_si128((const __m128i*) ((uintp… 76 …vacc${ABC[C:C+16]} = _mm512_add_epi32(vacc${ABC[C:C+16]}, _mm512_mullo_epi32(vi${K}x${ABC[C:C+16]}… 81 …const __m512i vacc${ABC[C+1:C+16:2]} = _mm512_shuffle_epi32(vacc${ABC[C:C+16]}, _MM_SHUFFLE(3, 3, … 84 …const __m512i vprod${ABC[C:C+16:2]} = _mm512_add_epi64(_mm512_mul_epi32(vacc${ABC[C:C+16]}, vmulti… 85 …const __m512i vprod${ABC[C+1:C+16:2]} = _mm512_add_epi64(_mm512_mul_epi32(vacc${ABC[C+1:C+16:2]}, … [all …]
|
D | unipass-wasmsimd-mul16.c.in | 6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 44 v128_t vacc${ABC[0:4]} = wasm_v128_load(w); 46 … v128_t vacc${ABC[C:C+4]} = wasm_v128_load((const void*) ((uintptr_t) w + ${C} * sizeof(int32_t))); 52 const v128_t vi${K}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${K}); 54 const v128_t vi${K}x${ABC[C:C+8]} = wasm_i16x8_load_8x8(i${K} + ${C}); 55 …const v128_t vk${K}x${ABC[C:C+8]} = wasm_i16x8_load_8x8((const void*) ((uintptr_t) w + ${CHANNEL_T… 59 … const v128_t vprod${K}x${ABC[C:C+8]} = wasm_i16x8_mul(vi${K}x${ABC[C:C+8]}, vk${K}x${ABC[C:C+8]}); 62 …vacc${ABC[C:C+4]} = wasm_i32x4_add(vacc${ABC[C:C+4]}, wasm_i32x4_widen_low_i16x8(vprod${K}x${ABC[C… 63 …vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vacc${ABC[C+4:C+8]}, wasm_i32x4_widen_high_i16x8(vprod${K}x${… 68 const v128_t vsign${ABC[C:C+4]} = wasm_i32x4_shr(vacc${ABC[C:C+4]}, 31); [all …]
|
D | unipass-avx2-mul16.c.in | 6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 44 __m256i vacc${ABC[0:8]} = _mm256_loadu_si256((const __m256i*) w); 46 …__m256i vacc${ABC[C:C+8]} = _mm256_loadu_si256((const __m256i*) ((uintptr_t) w + ${C} * sizeof(int… 52 … const __m256i vi${K}x${ABC[0:16]} = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) i${K})); 54 …const __m256i vi${K}x${ABC[C:C+16]} = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) (i${K}… 55 …const __m256i vk${K}x${ABC[C:C+16]} = _mm256_cvtepi8_epi16(_mm_loadu_si128((const __m128i*) ((uint… 59 …const __m256i vprod${K}x${ABC[C:C+16]} = _mm256_mullo_epi16(vi${K}x${ABC[C:C+16]}, vk${K}x${ABC[C… 60 … const __m128i vprod${K}x${ABC[C+8:C+16]} = _mm256_extracti128_si256(vprod${K}x${ABC[C:C+16]}, 1); 61 …vacc${ABC[C:C+8]} = _mm256_add_epi32(vacc${ABC[C:C+8]}, _mm256_cvtepi16_epi32(_mm256_castsi256_si1… 62 …vacc${ABC[C+8:C+16]} = _mm256_add_epi32(vacc${ABC[C+8:C+16]}, _mm256_cvtepi16_epi32(vprod${K}x${AB… [all …]
|
D | unipass-neon-mul16.c.in | 6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 55 …int32x4_t vacc${ABC[C:C+4]} = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)… 60 const int16x8_t vi${K}x${ABC[C:C+8]} = vmovl_s8(vld1_s8(i${K})); i${K} += 8; 61 …const int16x8_t vk${K}x${ABC[C:C+8]} = vmovl_s8(vld1_s8(w)); w = (const void*) ((uintptr_t) w + 8 … 64 …vacc${ABC[C:C+4]} = vmlal_s16(vacc${ABC[C:C+4]}, vget_low_s16(vi${K}x${ABC[C:C+8]}), vget_low_s16(… 65 …vacc${ABC[C+4:C+8]} = vmlal_s16(vacc${ABC[C+4:C+8]}, vget_high_s16(vi${K}x${ABC[C:C+8]}), vget_hig… 68 vacc${ABC[C:C+4]} = vqrdmulhq_s32(vacc${ABC[C:C+4]}, vmultiplier); 71 …vacc${ABC[C:C+4]} = vsraq_n_s32(vacc${ABC[C:C+4]}, vbicq_s32(vacc${ABC[C:C+4]}, vzero_shift_mask),… 74 vacc${ABC[C:C+4]} = vrshlq_s32(vacc${ABC[C:C+4]}, vright_shift); 78 …const int16x8_t vacc${ABC[C:C+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${ABC[C:C+4]}), vacc… [all …]
|
D | unipass-avx2-mul32.c.in | 6 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 44 __m256i vacc${ABC[0:8]} = _mm256_loadu_si256((const __m256i*) w); 46 …__m256i vacc${ABC[C:C+8]} = _mm256_loadu_si256((const __m256i*) ((uintptr_t) w + ${C} * sizeof(int… 52 … const __m256i vi${K}x${ABC[0:8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) i${K})); 54 …const __m256i vi${K}x${ABC[C:C+8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (i${K} … 55 …const __m256i vk${K}x${ABC[C:C+8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) ((uintp… 59 …vacc${ABC[C:C+8]} = _mm256_add_epi32(vacc${ABC[C:C+8]}, _mm256_mullo_epi32(vi${K}x${ABC[C:C+8]}, v… 67 …const __m256i vacc${ABC[C+1:C+8:2]} = _mm256_shuffle_epi32(vacc${ABC[C:C+8]}, _MM_SHUFFLE(3, 3, 1,… 70 …const __m256i vprod${ABC[C:C+8:2]} = _mm256_add_epi64(_mm256_mul_epi32(vacc${ABC[C:C+8]}, vmultipl… 71 …const __m256i vprod${ABC[C+1:C+8:2]} = _mm256_add_epi64(_mm256_mul_epi32(vacc${ABC[C+1:C+8:2]}, vm… [all …]
|
/external/XNNPACK/src/qs8-vadd/ |
D | sse-mul16-ld64.c.in | 9 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 39 const __m128i vx${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); 40 const __m128i vy${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_y)); 42 …const __m128i vx${ABC[N:N+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_x + ${N}… 43 …const __m128i vy${ABC[N:N+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_y + ${N}… 45 __m128i vx${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_x); 46 __m128i vy${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_y); 48 __m128i vx${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_x + ${N})); 49 __m128i vy${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_y + ${N})); 55 …vx${ABC[N:N+8]} = _mm_unpacklo_epi8(vx${ABC[N:N+8]}, _mm_cmpgt_epi8(_mm_setzero_si128(), vx${ABC[N… [all …]
|
D | wasmsimd.c.in | 8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 34 const v128_t vx${ABC[0:8]} = wasm_i16x8_load_8x8(input_x); 35 const v128_t vy${ABC[0:8]} = wasm_i16x8_load_8x8(input_y); 37 const v128_t vx${ABC[N:N+8]} = wasm_i16x8_load_8x8(input_x + ${N}); 38 const v128_t vy${ABC[N:N+8]} = wasm_i16x8_load_8x8(input_y + ${N}); 43 …v128_t vacc${ABC[N:N+4]} = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low… 44 …v128_t vacc${ABC[N+4:N+8]} = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_h… 47 …vacc${ABC[N:N+4]} = wasm_i32x4_add(vacc${ABC[N:N+4]}, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vy… 48 …vacc${ABC[N+4:N+8]} = wasm_i32x4_add(vacc${ABC[N+4:N+8]}, wasm_i32x4_mul(wasm_i32x4_widen_high_i16… 51 …const v128_t vrem${ABC[N:N+4]} = wasm_i32x4_add(wasm_v128_and(vacc${ABC[N:N+4]}, vremainder_mask),… [all …]
|
D | sse-mul32-ld32.c.in | 8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 44 const __m128i vx${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(input_x)); 45 const __m128i vy${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(input_y)); 47 const __m128i vx${ABC[N:N+4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(input_x + ${N})); 48 const __m128i vy${ABC[N:N+4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(input_y + ${N})); 54 … __m128i vacc${ABC[N:N+4]} = _mm_macc_epi32(vx${ABC[N:N+4]}, vx_multiplier, vzero_point_product); 57 vacc${ABC[N:N+4]} = _mm_macc_epi32(vy${ABC[N:N+4]}, vy_multiplier, vacc${ABC[N:N+4]}); 60 …__m128i vacc${ABC[N:N+4]} = _mm_add_epi32(vzero_point_product, _mm_mullo_epi32(vx${ABC[N:N+4]}, vx… 63 …vacc${ABC[N:N+4]} = _mm_add_epi32(vacc${ABC[N:N+4]}, _mm_mullo_epi32(vy${ABC[N:N+4]}, vy_multiplie… 66 …__m128i vrem${ABC[N:N+4]} = _mm_add_epi32(_mm_and_si128(vacc${ABC[N:N+4]}, vremainder_mask), _mm_c… [all …]
|
D | avx2-mul32-ld64.c.in | 8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 40 const __m256i vx${ABC[0:8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_x)); 41 const __m256i vy${ABC[0:8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_y)); 43 …const __m256i vx${ABC[N:N+8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_x + $… 44 …const __m256i vy${ABC[N:N+8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_y + $… 49 …__m256i vacc${ABC[N:N+8]} = _mm256_add_epi32(vzero_point_product, _mm256_mullo_epi32(vx${ABC[N:N+8… 52 …vacc${ABC[N:N+8]} = _mm256_add_epi32(vacc${ABC[N:N+8]}, _mm256_mullo_epi32(vy${ABC[N:N+8]}, vy_mul… 55 …const __m256i vrem${ABC[N:N+8]} = _mm256_add_epi32(_mm256_and_si256(vacc${ABC[N:N+8]}, vremainder_… 58 …vacc${ABC[N:N+8]} = _mm256_sub_epi32(_mm256_sra_epi32(vacc${ABC[N:N+8]}, vshift), _mm256_cmpgt_epi… 62 …_m256i vout${ABC[N:N+4]}${ABC[N+8:N+12]}${ABC[N+4:N+8]}${ABC[N+12:N+16]} = _mm256_adds_epi16(_mm25… [all …]
|
D | neon-ld64.c.in | 8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 35 const int8x8_t vx${ABC[N:N+8]} = vld1_s8(input_x); input_x += 8; 36 const int8x8_t vy${ABC[N:N+8]} = vld1_s8(input_y); input_y += 8; 39 const int16x8_t vex${ABC[N:N+8]} = vsubl_s8(vx${ABC[N:N+8]}, vx_zero_point); 40 const int16x8_t vey${ABC[N:N+8]} = vsubl_s8(vy${ABC[N:N+8]}, vy_zero_point); 43 … int32x4_t vacc${ABC[N:N+4]} = vmulq_s32(vmovl_s16(vget_low_s16(vex${ABC[N:N+8]})), vx_multiplier); 44 …int32x4_t vacc${ABC[N+4:N+8]} = vmulq_s32(vmovl_s16(vget_high_s16(vex${ABC[N:N+8]})), vx_multiplie… 47 …vacc${ABC[N:N+4]} = vmlaq_s32(vacc${ABC[N:N+4]}, vmovl_s16(vget_low_s16(vey${ABC[N:N+8]})), vy_mul… 48 …vacc${ABC[N+4:N+8]} = vmlaq_s32(vacc${ABC[N+4:N+8]}, vmovl_s16(vget_high_s16(vey${ABC[N:N+8]})), v… 51 …vacc${ABC[N:N+4]} = vsraq_n_s32(vacc${ABC[N:N+4]}, vbicq_s32(vacc${ABC[N:N+4]}, vzero_shift_mask),… [all …]
|
/external/XNNPACK/src/qs8-gavgpool/ |
D | unipass-sse.c.in | 12 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 53 … const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M})); 55 …const __m128i vxi${M}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i${M} + … 57 const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M}); 59 const __m128i vi${M}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${M} + ${C})); 65 …const __m128i vxi${M}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M}x${ABC[C:C+8]}, _mm_cmpgt_epi8(_mm_s… 69 …__m128i vacc${A}x${ABC[C:C+8]} = _mm_add_epi16(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]}); 73 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = _mm_add_epi16(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi${… 76 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]} 82 …vacc${A}x${ABC[C:C+8]} = _mm_add_epi16(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]}); [all …]
|
D | unipass-neon.c.in | 11 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 61 const int8x8_t vi${M}x${ABC[C:C+8]} = vld1_s8(i${M}); i${M} += 8; 65 … int16x8_t vacc${A}x${ABC[C:C+8]} = vaddl_s8(vi${A*2}x${ABC[C:C+8]}, vi${A*2+1}x${ABC[C:C+8]}); 69 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = vaddw_s8(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vi${M}x${A… 72 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]} 78 … vacc${A}x${ABC[C:C+8]} = vaddq_s16(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]}); 82 int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vbias, vget_low_s16(vacc0x${ABC[C:C+8]})); 83 int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vbias, vget_high_s16(vacc0x${ABC[C:C+8]})); 86 …const int32x4_t vsgnacc${ABC[C:C+4]} = vreinterpretq_s32_u32(vcltq_s32(vacc${ABC[C:C+4]}, vmovq_n_… 90 …const int64x2_t vprod${ABC[C:C+2]} = vmull_s32(vget_low_s32(vacc${ABC[C:C+4]}), vget_low_s32(vmult… [all …]
|
D | multipass-sse.c.in | 15 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 52 … const __m128i vxi${M}x${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) i${M})); 54 …const __m128i vxi${M}x${ABC[C:C+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (i${M} + … 56 const __m128i vi${M}x${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) i${M}); 58 const __m128i vi${M}x${ABC[C:C+8]} = _mm_loadl_epi64((const __m128i*) (i${M} + ${C})); 64 …const __m128i vxi${M}x${ABC[C:C+8]} = _mm_unpacklo_epi8(vi${M}x${ABC[C:C+8]}, _mm_cmpgt_epi8(_mm_s… 68 …__m128i vacc${A}x${ABC[C:C+8]} = _mm_add_epi16(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]}); 72 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = _mm_add_epi16(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi${… 75 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]} 81 …vacc${A}x${ABC[C:C+8]} = _mm_add_epi16(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]}); [all …]
|
D | unipass-wasmsimd.c.in | 11 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 51 const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M}); 53 const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load_8x8(i${M} + ${C}); 58 …v128_t vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]}); 62 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi$… 65 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]} 71 …vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]}… 75 …const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_low_i16x8(vacc0x${ABC[C:C+… 76 …const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_high_i16x8(vacc0x${ABC[C… 79 const v128_t vabsacc${ABC[C:C+4]} = wasm_i32x4_abs(vacc${ABC[C:C+4]}); [all …]
|
D | multipass-neon.c.in | 14 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 50 const int8x8_t vi${M}x${ABC[C:C+8]} = vld1_s8(i${M}); i${M} += 8; 54 … int16x8_t vacc${A}x${ABC[C:C+8]} = vaddl_s8(vi${A*2}x${ABC[C:C+8]}, vi${A*2+1}x${ABC[C:C+8]}); 58 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = vaddw_s8(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vi${M}x${A… 61 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]} 67 … vacc${A}x${ABC[C:C+8]} = vaddq_s16(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]}); 71 const int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vbias, vget_low_s16(vacc0x${ABC[C:C+8]})); 72 const int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vbias, vget_high_s16(vacc0x${ABC[C:C+8]})); 75 vst1q_s32(b, vacc${ABC[C:C+4]}); b += 4; 81 const int8x8_t vi${M}x${ABC[0:8]} = vld1_s8(i${M}); i${M} += 8; [all …]
|
D | multipass-wasmsimd.c.in | 14 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 49 const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M}); 51 const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load_8x8(i${M} + ${C}); 56 …v128_t vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]}); 60 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi$… 63 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]} 69 …vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]}… 73 …const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_low_i16x8(vacc0x${ABC[C:C+… 74 …const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_high_i16x8(vacc0x${ABC[C… 76 wasm_v128_store(b, vacc${ABC[0:4]}); [all …]
|
/external/XNNPACK/src/qs8-vaddc/ |
D | wasmsimd.c.in | 8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 35 const v128_t vx${ABC[0:8]} = wasm_i16x8_load_8x8(input_x); 37 const v128_t vx${ABC[N:N+8]} = wasm_i16x8_load_8x8(input_x + ${N}); 41 …v128_t vacc${ABC[N:N+4]} = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low… 42 …v128_t vacc${ABC[N+4:N+8]} = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_h… 45 …const v128_t vrem${ABC[N:N+4]} = wasm_i32x4_add(wasm_v128_and(vacc${ABC[N:N+4]}, vremainder_mask),… 48 …vacc${ABC[N:N+4]} = wasm_i32x4_sub(wasm_i32x4_shr(vacc${ABC[N:N+4]}, vshift), wasm_i32x4_gt(vrem${… 51 …v128_t vout${ABC[N:N+8]} = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc${ABC[N:N+4]}, vacc… 55 … v128_t vout${ABC[N:N+16]} = wasm_i8x16_narrow_i16x8(vout${ABC[N:N+8]}, vout${ABC[N+8:N+16]}); 57 …v128_t vout${ABC[N:N+8]}${ABC[N:N+8]} = wasm_i8x16_narrow_i16x8(vout${ABC[N:N+8]}, vout${ABC[N:N+8… [all …]
|
D | sse-mul16-ld64.c.in | 9 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 39 const __m128i vx${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x)); 41 …const __m128i vx${ABC[N:N+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_x + ${N}… 43 __m128i vx${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_x); 45 __m128i vx${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_x + ${N})); 50 …vx${ABC[N:N+8]} = _mm_unpacklo_epi8(vx${ABC[N:N+8]}, _mm_cmpgt_epi8(_mm_setzero_si128(), vx${ABC[N… 53 __m128i vxprod${ABC[N:N+8]}hi = _mm_mulhi_epu16(vx${ABC[N:N+8]}, vx_multiplier_lo); 54 const __m128i vxprod${ABC[N:N+8]}lo = _mm_mullo_epi16(vx${ABC[N:N+8]}, vx_multiplier_lo); 57 …vxprod${ABC[N:N+8]}hi = _mm_add_epi16(vxprod${ABC[N:N+8]}hi, _mm_mullo_epi16(vx${ABC[N:N+8]}, vx_m… 60 …vxprod${ABC[N:N+8]}hi = _mm_sub_epi16(vxprod${ABC[N:N+8]}hi, _mm_and_si128(_mm_srai_epi16(vx${ABC[… [all …]
|
D | sse-mul32-ld32.c.in | 8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 45 const __m128i vx${ABC[0:4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(input_x)); 47 const __m128i vx${ABC[N:N+4]} = _mm_cvtepi8_epi32(_mm_loadu_si32(input_x + ${N})); 53 … __m128i vacc${ABC[N:N+4]} = _mm_macc_epi32(vx${ABC[N:N+4]}, vx_multiplier, vzero_point_product); 56 …__m128i vacc${ABC[N:N+4]} = _mm_add_epi32(vzero_point_product, _mm_mullo_epi32(vx${ABC[N:N+4]}, vx… 59 …__m128i vrem${ABC[N:N+4]} = _mm_add_epi32(_mm_and_si128(vacc${ABC[N:N+4]}, vremainder_mask), _mm_c… 62 …vacc${ABC[N:N+4]} = _mm_sub_epi32(_mm_sra_epi32(vacc${ABC[N:N+4]}, vshift), _mm_cmpgt_epi32(vrem${… 65 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[N:N+4]}, vacc${ABC[N+4:N+8]})… 68 vout${ABC[N:N+8]} = _mm_max_epi16(vout${ABC[N:N+8]}, voutput_min); 71 vout${ABC[N:N+8]} = _mm_min_epi16(vout${ABC[N:N+8]}, voutput_max); [all …]
|
D | avx2-mul32-ld64.c.in | 8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 41 const __m256i vx${ABC[0:8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) input_x)); 43 …const __m256i vx${ABC[N:N+8]} = _mm256_cvtepi8_epi32(_mm_loadl_epi64((const __m128i*) (input_x + $… 47 …__m256i vacc${ABC[N:N+8]} = _mm256_add_epi32(vzero_point_product, _mm256_mullo_epi32(vx${ABC[N:N+8… 50 …const __m256i vrem${ABC[N:N+8]} = _mm256_add_epi32(_mm256_and_si256(vacc${ABC[N:N+8]}, vremainder_… 53 …vacc${ABC[N:N+8]} = _mm256_sub_epi32(_mm256_sra_epi32(vacc${ABC[N:N+8]}, vshift), _mm256_cmpgt_epi… 57 …_m256i vout${ABC[N:N+4]}${ABC[N+8:N+12]}${ABC[N+4:N+8]}${ABC[N+12:N+16]} = _mm256_adds_epi16(_mm25… 59 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[N:N+8]… 61 …__m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(_mm256_castsi256_si128(vacc${ABC[N:N+8]… 65 …${ABC[N:N+4]}${ABC[N+8:N+12]}${ABC[N+4:N+8]}${ABC[N+12:N+16]} = _mm256_min_epi16(_mm256_max_epi16(… [all …]
|
D | neon-ld64.c.in | 8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 40 const int8x8_t vx${ABC[N:N+8]} = vld1_s8(input_x); input_x += 8; 43 const int16x8_t vex${ABC[N:N+8]} = vsubl_s8(vx${ABC[N:N+8]}, vx_zero_point); 46 …int32x4_t vacc${ABC[N:N+4]} = vmlaq_s32(vy_bias, vmovl_s16(vget_low_s16(vex${ABC[N:N+8]})), vx_mul… 47 …int32x4_t vacc${ABC[N+4:N+8]} = vmlaq_s32(vy_bias, vmovl_s16(vget_high_s16(vex${ABC[N:N+8]})), vx_… 50 …vacc${ABC[N:N+4]} = vsraq_n_s32(vacc${ABC[N:N+4]}, vbicq_s32(vacc${ABC[N:N+4]}, vzero_shift_mask),… 53 vacc${ABC[N:N+4]} = vrshlq_s32(vacc${ABC[N:N+4]}, vright_shift); 56 …const int16x8_t vacc${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${ABC[N:N+4]}), vqmovn_… 60 …int8x16_t vout${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${ABC[N:N+8]}), vqmovn_s16(vacc${ABC[N+8… 62 int8x8_t vout${ABC[N:N+8]} = vqmovn_s16(vacc${ABC[N:N+8]}); [all …]
|
/external/XNNPACK/src/f32-conv-hwc/ |
D | 3x3s2p1c3-neon-x2.c.in | 8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 90 float32x4_t vo0x0c${ABC[0:4]} = vld1q_f32(w); 92 float32x4_t vo0x0c${ABC[C:C+4]} = vld1q_f32(w + ${C}); 95 float32x4_t vo${Y}x0c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]}; 98 float32x4_t vo${Y}x1c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]}; 101 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE}); 109 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_l… 113 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_h… 116 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2}); 120 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_l… [all …]
|
D | 3x3s2p0p1c3-neon-x2.c.in | 8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 90 float32x4_t vo0x0c${ABC[0:4]} = vld1q_f32(w); 92 float32x4_t vo0x0c${ABC[C:C+4]} = vld1q_f32(w + ${C}); 95 float32x4_t vo${Y}x0c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]}; 98 float32x4_t vo${Y}x1c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]}; 101 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE}); 109 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_l… 113 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_h… 116 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2}); 120 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_l… [all …]
|
/external/XNNPACK/src/f32-vsqrt/ |
D | neonfma-nr2fma1adj.c.in | 8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 31 const float32x4_t vx${ABC[N:N+4]} = vld1q_f32(x); x += 4; 34 const float32x4_t vrsqrtx${ABC[N:N+4]} = vrsqrteq_f32(vx${ABC[N:N+4]}); 37 float32x4_t vsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vx${ABC[N:N+4]}); 38 float32x4_t vhalfrsqrtx${ABC[N:N+4]} = vmulq_f32(vrsqrtx${ABC[N:N+4]}, vhalf); 41 …float32x4_t vresidual${ABC[N:N+4]} = vfmsq_f32(vhalf, vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]… 44 …vhalfrsqrtx${ABC[N:N+4]} = vfmaq_f32(vhalfrsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vhalfrsqrtx… 45 … vsqrtx${ABC[N:N+4]} = vfmaq_f32(vsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vsqrtx${ABC[N:N+4]}); 48 vresidual${ABC[N:N+4]} = vfmsq_f32(vhalf, vsqrtx${ABC[N:N+4]}, vhalfrsqrtx${ABC[N:N+4]}); 51 …vhalfrsqrtx${ABC[N:N+4]} = vfmaq_f32(vhalfrsqrtx${ABC[N:N+4]}, vresidual${ABC[N:N+4]}, vhalfrsqrtx… [all …]
|