Lines Matching refs:ABC

14 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
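
For reference, `$ABC` is a template-time constant used to synthesize NEON variable-name suffixes: a slice like `${ABC[C:C+8]}` names the eight channel indices a register covers. This listing appears to come from a templated XNNPACK-style QS8 global-average-pooling NEON microkernel, where `${...}` expressions are evaluated during code generation. As a hypothetical expansion, for M = 0 and C = 8 the load on line 50 would generate:

    const int8x8_t vi0x89ABCDEF = vld1_s8(i0); i0 += 8;

since `${ABC[8:16]}` slices the string above to "89ABCDEF".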
50 const int8x8_t vi${M}x${ABC[C:C+8]} = vld1_s8(i${M}); i${M} += 8;
54 … int16x8_t vacc${A}x${ABC[C:C+8]} = vaddl_s8(vi${A*2}x${ABC[C:C+8]}, vi${A*2+1}x${ABC[C:C+8]});
58 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = vaddw_s8(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vi${M}x${A…
61 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
67 … vacc${A}x${ABC[C:C+8]} = vaddq_s16(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
71 const int32x4_t vacc${ABC[C:C+4]} = vaddw_s16(vbias, vget_low_s16(vacc0x${ABC[C:C+8]}));
72 const int32x4_t vacc${ABC[C+4:C+8]} = vaddw_s16(vbias, vget_high_s16(vacc0x${ABC[C:C+8]}));
75 vst1q_s32(b, vacc${ABC[C:C+4]}); b += 4;
81 const int8x8_t vi${M}x${ABC[0:8]} = vld1_s8(i${M}); i${M} += 8;
84 int16x8_t vacc${A}x${ABC[0:8]} = vaddl_s8(vi${A*2}x${ABC[0:8]}, vi${A*2+1}x${ABC[0:8]});
87 …vacc${M % ACCUMULATORS}x${ABC[0:8]} = vaddw_s8(vacc${M % ACCUMULATORS}x${ABC[0:8]}, vi${M}x${ABC[0…
90 // Add up all accumulators to vacc0x${ABC[0:8]}
95 … vacc${A}x${ABC[0:8]} = vaddq_s16(vacc${A}x${ABC[0:8]}, vacc${A + ACC_SLICE}x${ABC[0:8]});
98 const int32x4_t vacc${ABC[0:4]} = vaddw_s16(vbias, vget_low_s16(vacc0x${ABC[0:8]}));
99 const int32x4_t vacc${ABC[4:8]} = vaddw_s16(vbias, vget_high_s16(vacc0x${ABC[0:8]}));
101 vst1q_s32(b, vacc${ABC[0:4]}); b += 4;
102 vst1q_s32(b, vacc${ABC[4:8]}); b += 4;
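
Taken together, lines 50..102 form the kernel's first pass: ROW_TILE input rows are loaded as int8, pairwise-widened into ACCUMULATORS int16 accumulators, the accumulators are reduced into one, and the result is widened onto an int32 bias and spilled to the scratch buffer `b`. Below is a minimal sketch of what this might expand to, assuming ROW_TILE = 7, ACCUMULATORS = 2, and a single 8-channel group; the tile sizes, the function name, and `vbias` (a precomputed bias from the kernel parameters) are assumptions, not the generated code itself.

    #include <arm_neon.h>
    #include <stdint.h>

    // Hypothetical expansion of the first pass for one 8-channel group.
    static void first_pass_8ch(const int8_t* i0, const int8_t* i1,
                               const int8_t* i2, const int8_t* i3,
                               const int8_t* i4, const int8_t* i5,
                               const int8_t* i6, int32x4_t vbias, int32_t* b) {
      const int8x8_t vi0x01234567 = vld1_s8(i0);
      const int8x8_t vi1x01234567 = vld1_s8(i1);
      const int8x8_t vi2x01234567 = vld1_s8(i2);
      const int8x8_t vi3x01234567 = vld1_s8(i3);
      const int8x8_t vi4x01234567 = vld1_s8(i4);
      const int8x8_t vi5x01234567 = vld1_s8(i5);
      const int8x8_t vi6x01234567 = vld1_s8(i6);

      // Widening adds: the first two rows seed each int16 accumulator
      // (vaddl_s8); remaining rows are folded in round-robin (vaddw_s8).
      int16x8_t vacc0x01234567 = vaddl_s8(vi0x01234567, vi1x01234567);
      int16x8_t vacc1x01234567 = vaddl_s8(vi2x01234567, vi3x01234567);
      vacc0x01234567 = vaddw_s8(vacc0x01234567, vi4x01234567);
      vacc1x01234567 = vaddw_s8(vacc1x01234567, vi5x01234567);
      vacc0x01234567 = vaddw_s8(vacc0x01234567, vi6x01234567);

      // Reduce both accumulators into vacc0x01234567.
      vacc0x01234567 = vaddq_s16(vacc0x01234567, vacc1x01234567);

      // Widen to int32 on top of the bias and spill to the scratch buffer.
      const int32x4_t vacc0123 = vaddw_s16(vbias, vget_low_s16(vacc0x01234567));
      const int32x4_t vacc4567 = vaddw_s16(vbias, vget_high_s16(vacc0x01234567));
      vst1q_s32(b, vacc0123);
      vst1q_s32(b + 4, vacc4567);
    }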
117 const int8x8_t vi${M}x${ABC[C:C+8]} = vld1_s8(i${M}); i${M} += 8;
121 … int16x8_t vacc${A}x${ABC[C:C+8]} = vaddl_s8(vi${A*2}x${ABC[C:C+8]}, vi${A*2+1}x${ABC[C:C+8]});
125 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = vaddw_s8(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vi${M}x${A…
128 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
134 … vacc${A}x${ABC[C:C+8]} = vaddq_s16(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
137 int32x4_t vacc${ABC[0:4]} = vld1q_s32(b);
139 int32x4_t vacc${ABC[C:C+4]} = vld1q_s32(b + ${C});
142 vacc${ABC[C:C+4]} = vaddw_s16(vacc${ABC[C:C+4]}, vget_low_s16(vacc0x${ABC[C:C+8]}));
143 vacc${ABC[C+4:C+8]} = vaddw_s16(vacc${ABC[C+4:C+8]}, vget_high_s16(vacc0x${ABC[C:C+8]}));
146 vst1q_s32(b, vacc${ABC[C:C+4]}); b += 4;
152 const int8x8_t vi${M}x${ABC[0:8]} = vld1_s8(i${M}); i${M} += 8;
155 int16x8_t vacc${A}x${ABC[0:8]} = vaddl_s8(vi${A*2}x${ABC[0:8]}, vi${A*2+1}x${ABC[0:8]});
158 …vacc${M % ACCUMULATORS}x${ABC[0:8]} = vaddw_s8(vacc${M % ACCUMULATORS}x${ABC[0:8]}, vi${M}x${ABC[0…
161 // Add up all accumulators to vacc0x${ABC[0:8]}
166 … vacc${A}x${ABC[0:8]} = vaddq_s16(vacc${A}x${ABC[0:8]}, vacc${A + ACC_SLICE}x${ABC[0:8]});
169 int32x4_t vacc${ABC[0:4]} = vld1q_s32(b);
170 int32x4_t vacc${ABC[4:8]} = vld1q_s32(b + 4);
172 vacc${ABC[0:4]} = vaddw_s16(vacc${ABC[0:4]}, vget_low_s16(vacc0x${ABC[0:8]}));
173 vacc${ABC[4:8]} = vaddw_s16(vacc${ABC[4:8]}, vget_high_s16(vacc0x${ABC[0:8]}));
175 vst1q_s32(b, vacc${ABC[0:4]}); b += 4;
176 vst1q_s32(b, vacc${ABC[4:8]}); b += 4;
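
Lines 117..176 are the intermediate passes: the row accumulation is identical, but instead of adding a bias the kernel accumulates onto the int32 partial sums already in the scratch buffer (the last pass, lines 211..236, reloads through `buffer` the same way). A sketch of the reload-accumulate-store step for one 8-channel group, under the same assumptions as above; the function name is hypothetical.

    #include <arm_neon.h>
    #include <stdint.h>

    // Hypothetical middle-pass update: reload the int32 partial sums, widen
    // the freshly reduced int16 row sums into them, and store the updated
    // partials back through the same pointer.
    static int32_t* update_partials_8ch(int32_t* b, int16x8_t vacc0x01234567) {
      int32x4_t vacc0123 = vld1q_s32(b);
      int32x4_t vacc4567 = vld1q_s32(b + 4);
      vacc0123 = vaddw_s16(vacc0123, vget_low_s16(vacc0x01234567));
      vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc0x01234567));
      vst1q_s32(b, vacc0123); b += 4;
      vst1q_s32(b, vacc4567); b += 4;
      return b;
    }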
211 const int8x8_t vi${M}x${ABC[C:C+8]} = vld1_s8(i${M}); i${M} += 8;
215 … int16x8_t vacc${A}x${ABC[C:C+8]} = vaddl_s8(vi${A*2}x${ABC[C:C+8]}, vi${A*2+1}x${ABC[C:C+8]});
219 …vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = vaddw_s8(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vi${M}x${A…
222 // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
228 … vacc${A}x${ABC[C:C+8]} = vaddq_s16(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
232 int32x4_t vacc${ABC[C:C+4]} = vld1q_s32(buffer); buffer += 4;
235 vacc${ABC[C:C+4]} = vaddw_s16(vacc${ABC[C:C+4]}, vget_low_s16(vacc0x${ABC[C:C+8]}));
236 vacc${ABC[C+4:C+8]} = vaddw_s16(vacc${ABC[C+4:C+8]}, vget_high_s16(vacc0x${ABC[C:C+8]}));
239 …const int32x4_t vsgnacc${ABC[C:C+4]} = vreinterpretq_s32_u32(vcltq_s32(vacc${ABC[C:C+4]}, vmovq_n_…
243 …const int64x2_t vprod${ABC[C:C+2]} = vmull_s32(vget_low_s32(vacc${ABC[C:C+4]}), vget_low_s32(vmult…
244 const int64x2_t vprod${ABC[C+2:C+4]} = vmull_high_s32(vacc${ABC[C:C+4]}, vmultiplier);
247 …const int64x2_t vadjprod${ABC[C:C+2]} = vaddw_s32(vprod${ABC[C:C+2]}, vget_low_s32(vsgnacc${ABC[C:…
248 …const int64x2_t vadjprod${ABC[C+2:C+4]} = vaddw_high_s32(vprod${ABC[C+2:C+4]}, vsgnacc${ABC[C:C+4]…
251 const int64x2_t vprod${ABC[C:C+2]} = vmull_s32(vget_low_s32(vacc${ABC[C:C+4]}), vmultiplier);
252 … const int64x2_t vprod${ABC[C+2:C+4]} = vmull_s32(vget_high_s32(vacc${ABC[C:C+4]}), vmultiplier);
255 …const int64x2_t vadjprod${ABC[C:C+2]} = vaddw_s32(vprod${ABC[C:C+2]}, vget_low_s32(vsgnacc${ABC[C:…
256 …const int64x2_t vadjprod${ABC[C+2:C+4]} = vaddw_s32(vprod${ABC[C+2:C+4]}, vget_high_s32(vsgnacc${A…
260 const int64x2_t vacc${ABC[C:C+2]} = vrshlq_s64(vadjprod${ABC[C:C+2]}, vleft_shift);
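
Lines 239..260 implement the fixed-point requantization: `vcltq_s32` builds an all-ones (-1) lane mask for negative accumulators, each accumulator is widened into a 64-bit product with the multiplier, the mask is added so the subsequent rounding rounds ties away from zero, and `vrshlq_s64` with the negative `vleft_shift` count performs a rounding shift right. A hypothetical scalar model of one lane, assuming a right-shift amount between 1 and 31 so the rounding term is well defined:

    #include <stdint.h>

    // Hypothetical scalar model of the per-lane requantization above.
    static inline int32_t scale_acc(int32_t acc, int32_t multiplier,
                                    uint32_t shift) {
      const int64_t prod = (int64_t) acc * (int64_t) multiplier;
      // The -1 bias on negative accumulators makes the round-to-nearest
      // shift below resolve ties away from zero instead of upward.
      const int64_t adjprod = prod + (acc < 0 ? -1 : 0);
      // Rounding shift right: add half the discarded range, then shift.
      return (int32_t) ((adjprod + (INT64_C(1) << (shift - 1))) >> shift);
    }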
264 …vacc${ABC[C:C+4]} = vuzp1q_s32(vreinterpretq_s32_s64(vacc${ABC[C:C+2]}), vreinterpretq_s32_s64(vac…
267 …const int16x8_t vacc${ABC[C:C+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${ABC[C:C+4]}), vacc…
271 …int8x16_t vout${ABC[C:C+16]} = vqmovn_high_s16(vqmovn_s16(vacc${ABC[C:C+8]}), vacc${ABC[C+8:C+16]}…
273 int8x8_t vout${ABC[C:C+8]} = vqmovn_s16(vacc${ABC[C:C+8]});
276 … vacc${ABC[C:C+4]} = vcombine_s32(vmovn_s64(vacc${ABC[C:C+2]}), vmovn_s64(vacc${ABC[C+2:C+4]}));
279 …const int16x8_t vacc${ABC[C:C+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${ABC[C:C+4]}), vqmovn_…
283 …int8x16_t vout${ABC[C:C+16]} = vcombine_s8(vqmovn_s16(vacc${ABC[C:C+8]}), vqmovn_s16(vacc${ABC[C+8…
285 int8x8_t vout${ABC[C:C+8]} = vqmovn_s16(vacc${ABC[C:C+8]});
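
Lines 264..285 then narrow back to int8 along two paths: on AArch64 the low 32 bits of each 64-bit lane are gathered with `vuzp1q_s32` and narrowed with `vqmovn_s32`/`vqmovn_high_s32`, while the portable path uses `vmovn_s64` + `vcombine_s32`. Both saturate to int16, add the second (truncated) operand of `vqaddq_s16`, and saturate to int8. A sketch of the portable path for one 8-channel group, assuming that truncated operand is the output zero point, as is conventional in this kernel family; the function name is hypothetical.

    #include <arm_neon.h>

    // Hypothetical portable-NEON narrowing of four requantized int64x2_t
    // lane pairs to eight saturated int8 outputs.
    static int8x8_t narrow_8(int64x2_t vacc01, int64x2_t vacc23,
                             int64x2_t vacc45, int64x2_t vacc67,
                             int16x8_t voutput_zero_point) {
      // vmovn_s64 keeps the low 32 bits; the shifted products are assumed
      // to fit in int32 at this point.
      const int32x4_t vacc0123 =
          vcombine_s32(vmovn_s64(vacc01), vmovn_s64(vacc23));
      const int32x4_t vacc4567 =
          vcombine_s32(vmovn_s64(vacc45), vmovn_s64(vacc67));
      // Saturating narrow to int16, re-center on the output zero point,
      // then saturating narrow to int8.
      const int16x8_t vacc01234567 = vqaddq_s16(
          vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)),
          voutput_zero_point);
      return vqmovn_s16(vacc01234567);
    }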
290 vout${ABC[C:C+16]} = vmaxq_s8(vout${ABC[C:C+16]}, voutput_min);
292 vout${ABC[C:C+8]} = vmax_s8(vout${ABC[C:C+8]}, vget_low_s8(voutput_min));
294 vout${ABC[C:C+8]} = vmax_s8(vout${ABC[C:C+8]}, voutput_min);
298 vout${ABC[C:C+16]} = vminq_s8(vout${ABC[C:C+16]}, voutput_max);
300 vout${ABC[C:C+8]} = vmin_s8(vout${ABC[C:C+8]}, vget_low_s8(voutput_max));
302 vout${ABC[C:C+8]} = vmin_s8(vout${ABC[C:C+8]}, voutput_max);
306 vst1q_s8(output, vout${ABC[C:C+16]}); output += 16;
308 vst1_s8(output, vout${ABC[C:C+8]}); output += 8;
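
Lines 290..308 apply the activation clamp and store: the saturating narrows above already confine results to [-128, 127], so the min/max only tighten to the user-specified quantized range, and the 8-lane path reuses the low half of the same q-register constants via `vget_low_s8`. A minimal sketch of the 16-lane variant, with a hypothetical function name:

    #include <arm_neon.h>
    #include <stdint.h>

    // Hypothetical 16-lane clamp-and-store; voutput_min/voutput_max are the
    // quantized activation bounds from the kernel parameters.
    static void clamp_store16(int8_t* output, int8x16_t vout,
                              int8x16_t voutput_min, int8x16_t voutput_max) {
      vout = vmaxq_s8(vout, voutput_min);
      vout = vminq_s8(vout, voutput_max);
      vst1q_s8(output, vout);
    }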
316 const int8x8_t vi${M}x${ABC[0:8]} = vld1_s8(i${M}); i${M} += 8;
318 const int8x8_t vi${M}x${ABC[0:8]} = vld1_s8(i${M});
321 int16x8_t vacc${A}x${ABC[0:8]} = vaddl_s8(vi${A*2}x${ABC[0:8]}, vi${A*2+1}x${ABC[0:8]});
324 …vacc${M % ACCUMULATORS}x${ABC[0:8]} = vaddw_s8(vacc${M % ACCUMULATORS}x${ABC[0:8]}, vi${M}x${ABC[0…
327 // Add up all accumulators to vacc0x${ABC[0:8]}
332 … vacc${A}x${ABC[0:8]} = vaddq_s16(vacc${A}x${ABC[0:8]}, vacc${A + ACC_SLICE}x${ABC[0:8]});
335 int32x4_t vacc${ABC[0:4]} = vld1q_s32(buffer); buffer += 4;
336 int32x4_t vacc${ABC[4:8]} = vld1q_s32(buffer); buffer += 4;
338 vacc${ABC[0:4]} = vaddw_s16(vacc${ABC[0:4]}, vget_low_s16(vacc0x${ABC[0:8]}));
339 vacc${ABC[4:8]} = vaddw_s16(vacc${ABC[4:8]}, vget_high_s16(vacc0x${ABC[0:8]}));
341 …const int32x4_t vsgnacc${ABC[0:4]} = vreinterpretq_s32_u32(vcltq_s32(vacc${ABC[0:4]}, vmovq_n_s32(…
342 …const int32x4_t vsgnacc${ABC[4:8]} = vreinterpretq_s32_u32(vcltq_s32(vacc${ABC[4:8]}, vmovq_n_s32(…
345 …const int64x2_t vprod${ABC[0:2]} = vmull_s32(vget_low_s32(vacc${ABC[0:4]}), vget_low_s32(vmultipli…
346 const int64x2_t vprod${ABC[2:4]} = vmull_high_s32(vacc${ABC[0:4]}, vmultiplier);
347 …const int64x2_t vprod${ABC[4:6]} = vmull_s32(vget_low_s32(vacc${ABC[4:8]}), vget_low_s32(vmultipli…
348 const int64x2_t vprod${ABC[6:8]} = vmull_high_s32(vacc${ABC[4:8]}, vmultiplier);
350 …const int64x2_t vadjprod${ABC[0:2]} = vaddw_s32(vprod${ABC[0:2]}, vget_low_s32(vsgnacc${ABC[0:4]})…
351 const int64x2_t vadjprod${ABC[2:4]} = vaddw_high_s32(vprod${ABC[2:4]}, vsgnacc${ABC[0:4]});
352 …const int64x2_t vadjprod${ABC[4:6]} = vaddw_s32(vprod${ABC[4:6]}, vget_low_s32(vsgnacc${ABC[4:8]})…
353 const int64x2_t vadjprod${ABC[6:8]} = vaddw_high_s32(vprod${ABC[6:8]}, vsgnacc${ABC[4:8]});
355 const int64x2_t vprod${ABC[0:2]} = vmull_s32(vget_low_s32(vacc${ABC[0:4]}), vmultiplier);
356 const int64x2_t vprod${ABC[2:4]} = vmull_s32(vget_high_s32(vacc${ABC[0:4]}), vmultiplier);
357 const int64x2_t vprod${ABC[4:6]} = vmull_s32(vget_low_s32(vacc${ABC[4:8]}), vmultiplier);
358 const int64x2_t vprod${ABC[6:8]} = vmull_s32(vget_high_s32(vacc${ABC[4:8]}), vmultiplier);
360 …const int64x2_t vadjprod${ABC[0:2]} = vaddw_s32(vprod${ABC[0:2]}, vget_low_s32(vsgnacc${ABC[0:4]})…
361 …const int64x2_t vadjprod${ABC[2:4]} = vaddw_s32(vprod${ABC[2:4]}, vget_high_s32(vsgnacc${ABC[0:4]}…
362 …const int64x2_t vadjprod${ABC[4:6]} = vaddw_s32(vprod${ABC[4:6]}, vget_low_s32(vsgnacc${ABC[4:8]})…
363 …const int64x2_t vadjprod${ABC[6:8]} = vaddw_s32(vprod${ABC[6:8]}, vget_high_s32(vsgnacc${ABC[4:8]}…
366 const int64x2_t vacc${ABC[0:2]} = vrshlq_s64(vadjprod${ABC[0:2]}, vleft_shift);
367 const int64x2_t vacc${ABC[2:4]} = vrshlq_s64(vadjprod${ABC[2:4]}, vleft_shift);
368 const int64x2_t vacc${ABC[4:6]} = vrshlq_s64(vadjprod${ABC[4:6]}, vleft_shift);
369 const int64x2_t vacc${ABC[6:8]} = vrshlq_s64(vadjprod${ABC[6:8]}, vleft_shift);
372 …vacc${ABC[0:4]} = vuzp1q_s32(vreinterpretq_s32_s64(vacc${ABC[0:2]}), vreinterpretq_s32_s64(vacc${A…
373 …vacc${ABC[4:8]} = vuzp1q_s32(vreinterpretq_s32_s64(vacc${ABC[4:6]}), vreinterpretq_s32_s64(vacc${A…
375 …const int16x8_t vacc${ABC[0:8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${ABC[0:4]}), vacc${AB…
377 int8x8_t vout${ABC[0:8]} = vqmovn_s16(vacc${ABC[0:8]});
379 vacc${ABC[0:4]} = vcombine_s32(vmovn_s64(vacc${ABC[0:2]}), vmovn_s64(vacc${ABC[2:4]}));
380 vacc${ABC[4:8]} = vcombine_s32(vmovn_s64(vacc${ABC[4:6]}), vmovn_s64(vacc${ABC[6:8]}));
382 …const int16x8_t vacc${ABC[0:8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${ABC[0:4]}), vqmovn_s32(…
384 int8x8_t vout${ABC[0:8]} = vqmovn_s16(vacc${ABC[0:8]});
388 vout${ABC[0:8]} = vmax_s8(vout${ABC[0:8]}, vget_low_s8(voutput_min));
389 vout${ABC[0:8]} = vmin_s8(vout${ABC[0:8]}, vget_low_s8(voutput_max));
391 vout${ABC[0:8]} = vmax_s8(vout${ABC[0:8]}, voutput_min);
392 vout${ABC[0:8]} = vmin_s8(vout${ABC[0:8]}, voutput_max);
396 vst1_s8(output, vout${ABC[0:8]}); output += 8;
400 …vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_s8(vout${ABC[0:8]}), 0); outpu…
401 vout${ABC[0:8]} = vext_s8(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
404 …vst1_lane_u16(__builtin_assume_aligned(output, 1), vreinterpret_u16_s8(vout${ABC[0:8]}), 0); outpu…
405 vout${ABC[0:8]} = vext_s8(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
408 vst1_lane_s8(output, vout${ABC[0:8]}, 0); output += 1;
414 …vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_s8(vout${ABC[0:8]}), 0); outpu…
415 vout${ABC[0:8]} = vext_s8(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
418 …vst1_lane_u16(__builtin_assume_aligned(output, 1), vreinterpret_u16_s8(vout${ABC[0:8]}), 0); outpu…
419 vout${ABC[0:8]} = vext_s8(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
422 vst1_lane_s8(output, vout${ABC[0:8]}, 0);
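
Lines 400..422 handle the tail, where fewer than 8 outputs remain: 4-, 2-, and 1-byte stores are peeled off an int8x8_t via lane stores, with `vext_s8` rotating the consumed lanes out between steps (the `__builtin_assume_aligned(output, 1)` casts in the source mark the pointer as byte-aligned for the wider lane stores). A hypothetical standalone version of the same technique, with simplified pointer casts:

    #include <arm_neon.h>
    #include <stddef.h>
    #include <stdint.h>

    // Hypothetical tail store: write n < 8 trailing int8 results.
    static void store_tail(int8_t* output, int8x8_t vout, size_t n) {
      if (n & 4) {
        // Store lanes 0..3 as one 32-bit unit, then rotate them out.
        vst1_lane_u32((uint32_t*) output, vreinterpret_u32_s8(vout), 0);
        output += 4;
        vout = vext_s8(vout, vout, 4);
      }
      if (n & 2) {
        // Store the next two lanes as one 16-bit unit.
        vst1_lane_u16((uint16_t*) output, vreinterpret_u16_s8(vout), 0);
        output += 2;
        vout = vext_s8(vout, vout, 2);
      }
      if (n & 1) {
        // Store the final lane.
        vst1_lane_s8(output, vout, 0);
      }
    }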