Lines Matching refs:ABC
8 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
90 float32x4_t vo0x0c${ABC[0:4]} = vld1q_f32(w);
92 float32x4_t vo0x0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
95 float32x4_t vo${Y}x0c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]};
98 float32x4_t vo${Y}x1c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]};
101 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
109 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_l…
113 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_h…
116 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
120 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_l…
124 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_h…
127 const float32x4_t vk20c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 3});
131 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_l…
135 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_h…
138 const float32x4_t vk00c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 4});
146 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_h…
150 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_l…
153 const float32x4_t vk10c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 5});
157 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_h…
161 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_l…
164 const float32x4_t vk20c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 6});
168 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_h…
172 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_l…
175 const float32x4_t vk00c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 7});
179 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_h…
183 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_l…
186 const float32x4_t vk10c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 8});
190 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_h…
194 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_l…
197 const float32x4_t vk20c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 9});
201 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_h…
205 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_l…
208 const float32x4_t vk01c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 10});
212 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_l…
216 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_h…
219 const float32x4_t vk11c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 11});
223 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_l…
227 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_h…
230 const float32x4_t vk21c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 12});
234 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_l…
238 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_h…
241 const float32x4_t vk01c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 13});
245 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_l…
249 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_h…
252 const float32x4_t vk11c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 14});
256 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_l…
260 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_h…
263 const float32x4_t vk21c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 15});
267 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_l…
271 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_h…
274 const float32x4_t vk01c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 16});
282 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_h…
286 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_l…
289 const float32x4_t vk11c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 17});
293 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_h…
297 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_l…
300 const float32x4_t vk21c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 18});
304 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_h…
308 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_l…
311 const float32x4_t vk02c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 19});
315 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk02c0x${ABC[C:C+4]}, vget_h…
319 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk02c0x${ABC[C:C+4]}, vget_l…
322 const float32x4_t vk12c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 20});
326 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk12c0x${ABC[C:C+4]}, vget_h…
330 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk12c0x${ABC[C:C+4]}, vget_l…
333 const float32x4_t vk22c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 21});
337 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk22c0x${ABC[C:C+4]}, vget_h…
341 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk22c0x${ABC[C:C+4]}, vget_l…
344 const float32x4_t vk02c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 22});
348 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk02c1x${ABC[C:C+4]}, vget_l…
352 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk02c1x${ABC[C:C+4]}, vget_h…
355 const float32x4_t vk12c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 23});
359 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk12c1x${ABC[C:C+4]}, vget_l…
363 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk12c1x${ABC[C:C+4]}, vget_h…
366 const float32x4_t vk22c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 24});
370 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk22c1x${ABC[C:C+4]}, vget_l…
374 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk22c1x${ABC[C:C+4]}, vget_h…
377 const float32x4_t vk02c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 25});
381 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk02c2x${ABC[C:C+4]}, vget_l…
385 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk02c2x${ABC[C:C+4]}, vget_h…
388 const float32x4_t vk12c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 26});
392 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk12c2x${ABC[C:C+4]}, vget_l…
396 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk12c2x${ABC[C:C+4]}, vget_h…
399 const float32x4_t vk22c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 27});
403 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk22c2x${ABC[C:C+4]}, vget_l…
407 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk22c2x${ABC[C:C+4]}, vget_h…
418 vo${Y}x0c${ABC[C:C+4]} = vmaxq_f32(vo${Y}x0c${ABC[C:C+4]}, vmin);
422 vo${Y}x1c${ABC[C:C+4]} = vmaxq_f32(vo${Y}x1c${ABC[C:C+4]}, vmin);
426 vo${Y}x0c${ABC[C:C+4]} = vminq_f32(vo${Y}x0c${ABC[C:C+4]}, vmax);
430 vo${Y}x1c${ABC[C:C+4]} = vminq_f32(vo${Y}x1c${ABC[C:C+4]}, vmax);
434 vst1q_f32(o${Y}, vo${Y}x0c${ABC[0:4]});
436 vst1q_f32(o${Y} + 4, vo${Y}x0c${ABC[C:C+4]});
440 vst1q_f32(o${Y}, vo${Y}x1c${ABC[0:4]});
442 vst1q_f32(o${Y} + 4, vo${Y}x1c${ABC[C:C+4]});
451 float32x2_t vo${Y}x0c${ABC[0:2]} = vget_low_f32(vo${Y}x0c${ABC[0:4]});
453 float32x2_t vo${Y}x1c${ABC[0:2]} = vget_low_f32(vo${Y}x1c${ABC[0:4]});
458 … vst1q_f32((float*) ((uintptr_t) o${Y}_tmp + output_width_stride), vo${Y}x1c${ABC[C:C+4]});
459 … vo${Y}x1c${ABC[C:C+4]} = vo${Y}x1c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
462 vst1q_f32(o${Y}_tmp, vo${Y}x0c${ABC[C:C+4]}); o${Y}_tmp += 4;
463 … vo${Y}x0c${ABC[C:C+4]} = vo${Y}x0c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
466 … vst1_f32((float*) ((uintptr_t) o${Y}_tmp + output_width_stride), vo${Y}x1c${ABC[0:2]});
467 vo${Y}x1c${ABC[0:2]} = vget_high_f32(vo${Y}x1c${ABC[0:4]});
470 vst1_f32(o${Y}_tmp, vo${Y}x0c${ABC[0:2]}); o${Y}_tmp += 2;
471 vo${Y}x0c${ABC[0:2]} = vget_high_f32(vo${Y}x0c${ABC[0:4]});
474 vst1_lane_f32(o${Y}_tmp, vo${Y}x0c${ABC[0:2]}, 0);
477 … vst1_lane_f32((float*) ((uintptr_t) o${Y}_tmp + output_width_stride), vo${Y}x1c${ABC[0:2]}, 0);
486 float32x4_t vo0c${ABC[0:4]} = vld1q_f32(w);
488 float32x4_t vo0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
491 float32x4_t vo${Y}c${ABC[C:C+4]} = vo0c${ABC[C:C+4]};
494 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
498 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_low_f…
501 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
505 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_low_f…
508 const float32x4_t vk20c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 3});
512 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_low_f…
515 const float32x4_t vk00c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 4});
519 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_high_…
522 const float32x4_t vk10c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 5});
526 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_high_…
529 const float32x4_t vk20c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 6});
533 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_high_…
536 const float32x4_t vk00c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 7});
540 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_high_…
543 const float32x4_t vk10c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 8});
547 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_high_…
550 const float32x4_t vk20c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 9});
554 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_high_…
561 const float32x4_t vk01c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 10});
565 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_low_f…
568 const float32x4_t vk11c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 11});
572 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_low_f…
575 const float32x4_t vk21c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 12});
579 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_low_f…
582 const float32x4_t vk01c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 13});
586 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_low_f…
589 const float32x4_t vk11c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 14});
593 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_low_f…
596 const float32x4_t vk21c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 15});
600 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_low_f…
603 const float32x4_t vk01c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 16});
607 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_high_…
610 const float32x4_t vk11c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 17});
614 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_high_…
617 const float32x4_t vk21c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 18});
621 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_high_…
624 const float32x4_t vk02c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 19});
628 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c0x${ABC[C:C+4]}, vget_high_…
631 const float32x4_t vk12c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 20});
635 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c0x${ABC[C:C+4]}, vget_high_…
638 const float32x4_t vk22c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 21});
642 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c0x${ABC[C:C+4]}, vget_high_…
649 const float32x4_t vk02c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 22});
653 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c1x${ABC[C:C+4]}, vi${Y*2}x2…
656 const float32x4_t vk12c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 23});
660 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c1x${ABC[C:C+4]}, vi${Y*2+1}…
663 const float32x4_t vk22c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 24});
667 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c1x${ABC[C:C+4]}, vi${Y*2+2}…
670 const float32x4_t vk02c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 25});
674 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c2x${ABC[C:C+4]}, vi${Y*2}x2…
677 const float32x4_t vk12c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 26});
681 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c2x${ABC[C:C+4]}, vi${Y*2+1}…
684 const float32x4_t vk22c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 27});
688 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c2x${ABC[C:C+4]}, vi${Y*2+2}…
699 vo${Y}c${ABC[C:C+4]} = vmaxq_f32(vo${Y}c${ABC[C:C+4]}, vmin);
703 vo${Y}c${ABC[C:C+4]} = vminq_f32(vo${Y}c${ABC[C:C+4]}, vmax);
707 vst1q_f32(o${Y}, vo${Y}c${ABC[0:4]});
709 vst1q_f32(o${Y} + 4, vo${Y}c${ABC[C:C+4]});
718 float32x2_t vo${Y}c${ABC[0:2]} = vget_low_f32(vo${Y}c${ABC[0:4]});
723 vst1q_f32(o${Y}_tmp, vo${Y}c${ABC[C:C+4]}); o${Y}_tmp += 4;
724 … vo${Y}c${ABC[C:C+4]} = vo${Y}c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
727 vst1_f32(o${Y}_tmp, vo${Y}c${ABC[0:2]}); o${Y}_tmp += 2;
728 vo${Y}c${ABC[0:2]} = vget_high_f32(vo${Y}c${ABC[0:4]});
731 vst1_lane_f32(o${Y}_tmp, vo${Y}c${ABC[0:2]}, 0);
739 float32x4_t vo0c${ABC[0:4]} = vld1q_f32(w);
741 float32x4_t vo0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
744 float32x4_t vo${Y}c${ABC[C:C+4]} = vo0c${ABC[C:C+4]};
747 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
751 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_low_f…
754 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
758 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_low_f…
761 const float32x4_t vk20c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 3});
765 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_low_f…
768 const float32x4_t vk00c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 4});
772 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_high_…
775 const float32x4_t vk10c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 5});
779 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_high_…
782 const float32x4_t vk20c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 6});
786 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_high_…
789 const float32x4_t vk00c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 7});
793 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_high_…
796 const float32x4_t vk10c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 8});
800 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_high_…
803 const float32x4_t vk20c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 9});
807 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_high_…
814 const float32x4_t vk01c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 10});
818 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_low_f…
821 const float32x4_t vk11c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 11});
825 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_low_f…
828 const float32x4_t vk21c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 12});
832 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_low_f…
835 const float32x4_t vk01c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 13});
839 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_low_f…
842 const float32x4_t vk11c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 14});
846 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_low_f…
849 const float32x4_t vk21c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 15});
853 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_low_f…
856 const float32x4_t vk01c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 16});
860 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_high_…
863 const float32x4_t vk11c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 17});
867 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_high_…
870 const float32x4_t vk21c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 18});
874 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_high_…
882 vo${Y}c${ABC[C:C+4]} = vmaxq_f32(vo${Y}c${ABC[C:C+4]}, vmin);
886 vo${Y}c${ABC[C:C+4]} = vminq_f32(vo${Y}c${ABC[C:C+4]}, vmax);
890 vst1q_f32(o${Y}, vo${Y}c${ABC[0:4]});
892 vst1q_f32(o${Y} + 4, vo${Y}c${ABC[C:C+4]});
901 float32x2_t vo${Y}c${ABC[0:2]} = vget_low_f32(vo${Y}c${ABC[0:4]});
906 vst1q_f32(o${Y}_tmp, vo${Y}c${ABC[C:C+4]}); o${Y}_tmp += 4;
907 … vo${Y}c${ABC[C:C+4]} = vo${Y}c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
910 vst1_f32(o${Y}_tmp, vo${Y}c${ABC[0:2]}); o${Y}_tmp += 2;
911 vo${Y}c${ABC[0:2]} = vget_high_f32(vo${Y}c${ABC[0:4]});
914 vst1_lane_f32(o${Y}_tmp, vo${Y}c${ABC[0:2]}, 0);