Lines Matching +full:0 +full:c
6 $assert CHANNEL_TILE % 4 == 0
34 assert(input_width != 0);
37 assert(output_channels != 0);
82 size_t c = output_channels;
90 float32x4_t vo0x0c${ABC[0:4]} = vld1q_f32(w);
91 $for C in range(4, CHANNEL_TILE, 4):
92 float32x4_t vo0x0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
94 $for C in range(0, CHANNEL_TILE, 4):
95 float32x4_t vo${Y}x0c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]};
97 $for C in range(0, CHANNEL_TILE, 4):
98 float32x4_t vo${Y}x1c${ABC[C:C+4]} = vo0x0c${ABC[C:C+4]};
100 $for C in range(0, CHANNEL_TILE, 4):
101 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
107 $for C in range(0, CHANNEL_TILE, 4):
109 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_l…
111 $for C in range(0, CHANNEL_TILE, 4):
113 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_h…
115 $for C in range(0, CHANNEL_TILE, 4):
116 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
118 $for C in range(0, CHANNEL_TILE, 4):
120 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_l…
122 $for C in range(0, CHANNEL_TILE, 4):
124 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_h…
126 $for C in range(0, CHANNEL_TILE, 4):
127 const float32x4_t vk20c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 3});
129 $for C in range(0, CHANNEL_TILE, 4):
131 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_l…
133 $for C in range(0, CHANNEL_TILE, 4):
135 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_h…
137 $for C in range(0, CHANNEL_TILE, 4):
138 const float32x4_t vk00c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 4});
144 $for C in range(0, CHANNEL_TILE, 4):
146 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_h…
148 $for C in range(0, CHANNEL_TILE, 4):
150 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_l…
152 $for C in range(0, CHANNEL_TILE, 4):
153 const float32x4_t vk10c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 5});
155 $for C in range(0, CHANNEL_TILE, 4):
157 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_h…
159 $for C in range(0, CHANNEL_TILE, 4):
161 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_l…
163 $for C in range(0, CHANNEL_TILE, 4):
164 const float32x4_t vk20c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 6});
166 $for C in range(0, CHANNEL_TILE, 4):
168 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_h…
170 $for C in range(0, CHANNEL_TILE, 4):
172 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_l…
174 $for C in range(0, CHANNEL_TILE, 4):
175 const float32x4_t vk00c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 7});
177 $for C in range(0, CHANNEL_TILE, 4):
179 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_h…
181 $for C in range(0, CHANNEL_TILE, 4):
183 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_l…
185 $for C in range(0, CHANNEL_TILE, 4):
186 const float32x4_t vk10c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 8});
188 $for C in range(0, CHANNEL_TILE, 4):
190 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_h…
192 $for C in range(0, CHANNEL_TILE, 4):
194 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_l…
196 $for C in range(0, CHANNEL_TILE, 4):
197 const float32x4_t vk20c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 9});
199 $for C in range(0, CHANNEL_TILE, 4):
201 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_h…
203 $for C in range(0, CHANNEL_TILE, 4):
205 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_l…
207 $for C in range(0, CHANNEL_TILE, 4):
208 const float32x4_t vk01c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 10});
210 $for C in range(0, CHANNEL_TILE, 4):
212 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_l…
214 $for C in range(0, CHANNEL_TILE, 4):
216 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_h…
218 $for C in range(0, CHANNEL_TILE, 4):
219 const float32x4_t vk11c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 11});
221 $for C in range(0, CHANNEL_TILE, 4):
223 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_l…
225 $for C in range(0, CHANNEL_TILE, 4):
227 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_h…
229 $for C in range(0, CHANNEL_TILE, 4):
230 const float32x4_t vk21c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 12});
232 $for C in range(0, CHANNEL_TILE, 4):
234 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_l…
236 $for C in range(0, CHANNEL_TILE, 4):
238 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_h…
240 $for C in range(0, CHANNEL_TILE, 4):
241 const float32x4_t vk01c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 13});
243 $for C in range(0, CHANNEL_TILE, 4):
245 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_l…
247 $for C in range(0, CHANNEL_TILE, 4):
249 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_h…
251 $for C in range(0, CHANNEL_TILE, 4):
252 const float32x4_t vk11c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 14});
254 $for C in range(0, CHANNEL_TILE, 4):
256 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_l…
258 $for C in range(0, CHANNEL_TILE, 4):
260 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_h…
262 $for C in range(0, CHANNEL_TILE, 4):
263 const float32x4_t vk21c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 15});
265 $for C in range(0, CHANNEL_TILE, 4):
267 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_l…
269 $for C in range(0, CHANNEL_TILE, 4):
271 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_h…
273 $for C in range(0, CHANNEL_TILE, 4):
274 const float32x4_t vk01c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 16});
280 $for C in range(0, CHANNEL_TILE, 4):
282 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_h…
284 $for C in range(0, CHANNEL_TILE, 4):
286 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_l…
288 $for C in range(0, CHANNEL_TILE, 4):
289 const float32x4_t vk11c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 17});
291 $for C in range(0, CHANNEL_TILE, 4):
293 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_h…
295 $for C in range(0, CHANNEL_TILE, 4):
297 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_l…
299 $for C in range(0, CHANNEL_TILE, 4):
300 const float32x4_t vk21c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 18});
302 $for C in range(0, CHANNEL_TILE, 4):
304 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_h…
306 $for C in range(0, CHANNEL_TILE, 4):
308 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_l…
310 $for C in range(0, CHANNEL_TILE, 4):
311 const float32x4_t vk02c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 19});
313 $for C in range(0, CHANNEL_TILE, 4):
315 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk02c0x${ABC[C:C+4]}, vget_h…
317 $for C in range(0, CHANNEL_TILE, 4):
319 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk02c0x${ABC[C:C+4]}, vget_l…
321 $for C in range(0, CHANNEL_TILE, 4):
322 const float32x4_t vk12c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 20});
324 $for C in range(0, CHANNEL_TILE, 4):
326 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk12c0x${ABC[C:C+4]}, vget_h…
328 $for C in range(0, CHANNEL_TILE, 4):
330 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk12c0x${ABC[C:C+4]}, vget_l…
332 $for C in range(0, CHANNEL_TILE, 4):
333 const float32x4_t vk22c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 21});
335 $for C in range(0, CHANNEL_TILE, 4):
337 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk22c0x${ABC[C:C+4]}, vget_h…
339 $for C in range(0, CHANNEL_TILE, 4):
341 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk22c0x${ABC[C:C+4]}, vget_l…
343 $for C in range(0, CHANNEL_TILE, 4):
344 const float32x4_t vk02c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 22});
346 $for C in range(0, CHANNEL_TILE, 4):
348 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk02c1x${ABC[C:C+4]}, vget_l…
350 $for C in range(0, CHANNEL_TILE, 4):
352 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk02c1x${ABC[C:C+4]}, vget_h…
354 $for C in range(0, CHANNEL_TILE, 4):
355 const float32x4_t vk12c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 23});
357 $for C in range(0, CHANNEL_TILE, 4):
359 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk12c1x${ABC[C:C+4]}, vget_l…
361 $for C in range(0, CHANNEL_TILE, 4):
363 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk12c1x${ABC[C:C+4]}, vget_h…
365 $for C in range(0, CHANNEL_TILE, 4):
366 const float32x4_t vk22c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 24});
368 $for C in range(0, CHANNEL_TILE, 4):
370 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk22c1x${ABC[C:C+4]}, vget_l…
372 $for C in range(0, CHANNEL_TILE, 4):
374 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk22c1x${ABC[C:C+4]}, vget_h…
376 $for C in range(0, CHANNEL_TILE, 4):
377 const float32x4_t vk02c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 25});
379 $for C in range(0, CHANNEL_TILE, 4):
381 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk02c2x${ABC[C:C+4]}, vget_l…
383 $for C in range(0, CHANNEL_TILE, 4):
385 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk02c2x${ABC[C:C+4]}, vget_h…
387 $for C in range(0, CHANNEL_TILE, 4):
388 const float32x4_t vk12c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 26});
390 $for C in range(0, CHANNEL_TILE, 4):
392 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk12c2x${ABC[C:C+4]}, vget_l…
394 $for C in range(0, CHANNEL_TILE, 4):
396 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk12c2x${ABC[C:C+4]}, vget_h…
398 $for C in range(0, CHANNEL_TILE, 4):
399 const float32x4_t vk22c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 27});
401 $for C in range(0, CHANNEL_TILE, 4):
403 …vo${Y}x0c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x0c${ABC[C:C+4]}, vk22c2x${ABC[C:C+4]}, vget_l…
405 $for C in range(0, CHANNEL_TILE, 4):
407 …vo${Y}x1c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}x1c${ABC[C:C+4]}, vk22c2x${ABC[C:C+4]}, vget_h…
416 $for C in range(0, CHANNEL_TILE, 4):
418 vo${Y}x0c${ABC[C:C+4]} = vmaxq_f32(vo${Y}x0c${ABC[C:C+4]}, vmin);
420 $for C in range(0, CHANNEL_TILE, 4):
422 vo${Y}x1c${ABC[C:C+4]} = vmaxq_f32(vo${Y}x1c${ABC[C:C+4]}, vmin);
424 $for C in range(0, CHANNEL_TILE, 4):
426 vo${Y}x0c${ABC[C:C+4]} = vminq_f32(vo${Y}x0c${ABC[C:C+4]}, vmax);
428 $for C in range(0, CHANNEL_TILE, 4):
430 vo${Y}x1c${ABC[C:C+4]} = vminq_f32(vo${Y}x1c${ABC[C:C+4]}, vmax);
432 if XNN_LIKELY(c >= ${CHANNEL_TILE}) {
434 vst1q_f32(o${Y}, vo${Y}x0c${ABC[0:4]});
435 $for C in range(4, CHANNEL_TILE, 4):
436 vst1q_f32(o${Y} + 4, vo${Y}x0c${ABC[C:C+4]});
440 vst1q_f32(o${Y}, vo${Y}x1c${ABC[0:4]});
441 $for C in range(4, CHANNEL_TILE, 4):
442 vst1q_f32(o${Y} + 4, vo${Y}x1c${ABC[C:C+4]});
451 float32x2_t vo${Y}x0c${ABC[0:2]} = vget_low_f32(vo${Y}x0c${ABC[0:4]});
453 float32x2_t vo${Y}x1c${ABC[0:2]} = vget_low_f32(vo${Y}x1c${ABC[0:4]});
454 if (c & ${1 << LOG2_CHANNEL_TILE}) {
456 $for C in range(0, 1 << (LOG2_CHANNEL_TILE - 1), 4):
458 … vst1q_f32((float*) ((uintptr_t) o${Y}_tmp + output_width_stride), vo${Y}x1c${ABC[C:C+4]});
459 … vo${Y}x1c${ABC[C:C+4]} = vo${Y}x1c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
462 vst1q_f32(o${Y}_tmp, vo${Y}x0c${ABC[C:C+4]}); o${Y}_tmp += 4;
463 … vo${Y}x0c${ABC[C:C+4]} = vo${Y}x0c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
466 … vst1_f32((float*) ((uintptr_t) o${Y}_tmp + output_width_stride), vo${Y}x1c${ABC[0:2]});
467 vo${Y}x1c${ABC[0:2]} = vget_high_f32(vo${Y}x1c${ABC[0:4]});
470 vst1_f32(o${Y}_tmp, vo${Y}x0c${ABC[0:2]}); o${Y}_tmp += 2;
471 vo${Y}x0c${ABC[0:2]} = vget_high_f32(vo${Y}x0c${ABC[0:4]});
472 $elif LOG2_CHANNEL_TILE == 0:
474 vst1_lane_f32(o${Y}_tmp, vo${Y}x0c${ABC[0:2]}, 0);
477 … vst1_lane_f32((float*) ((uintptr_t) o${Y}_tmp + output_width_stride), vo${Y}x1c${ABC[0:2]}, 0);
486 float32x4_t vo0c${ABC[0:4]} = vld1q_f32(w);
487 $for C in range(4, CHANNEL_TILE, 4):
488 float32x4_t vo0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
490 $for C in range(0, CHANNEL_TILE, 4):
491 float32x4_t vo${Y}c${ABC[C:C+4]} = vo0c${ABC[C:C+4]};
493 $for C in range(0, CHANNEL_TILE, 4):
494 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
496 $for C in range(0, CHANNEL_TILE, 4):
498 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_low_f…
500 $for C in range(0, CHANNEL_TILE, 4):
501 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
503 $for C in range(0, CHANNEL_TILE, 4):
505 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_low_f…
507 $for C in range(0, CHANNEL_TILE, 4):
508 const float32x4_t vk20c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 3});
510 $for C in range(0, CHANNEL_TILE, 4):
512 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_low_f…
514 $for C in range(0, CHANNEL_TILE, 4):
515 const float32x4_t vk00c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 4});
517 $for C in range(0, CHANNEL_TILE, 4):
519 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_high_…
521 $for C in range(0, CHANNEL_TILE, 4):
522 const float32x4_t vk10c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 5});
524 $for C in range(0, CHANNEL_TILE, 4):
526 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_high_…
528 $for C in range(0, CHANNEL_TILE, 4):
529 const float32x4_t vk20c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 6});
531 $for C in range(0, CHANNEL_TILE, 4):
533 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_high_…
535 $for C in range(0, CHANNEL_TILE, 4):
536 const float32x4_t vk00c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 7});
538 $for C in range(0, CHANNEL_TILE, 4):
540 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_high_…
542 $for C in range(0, CHANNEL_TILE, 4):
543 const float32x4_t vk10c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 8});
545 $for C in range(0, CHANNEL_TILE, 4):
547 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_high_…
549 $for C in range(0, CHANNEL_TILE, 4):
550 const float32x4_t vk20c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 9});
552 $for C in range(0, CHANNEL_TILE, 4):
554 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_high_…
560 $for C in range(0, CHANNEL_TILE, 4):
561 const float32x4_t vk01c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 10});
563 $for C in range(0, CHANNEL_TILE, 4):
565 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_low_f…
567 $for C in range(0, CHANNEL_TILE, 4):
568 const float32x4_t vk11c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 11});
570 $for C in range(0, CHANNEL_TILE, 4):
572 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_low_f…
574 $for C in range(0, CHANNEL_TILE, 4):
575 const float32x4_t vk21c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 12});
577 $for C in range(0, CHANNEL_TILE, 4):
579 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_low_f…
581 $for C in range(0, CHANNEL_TILE, 4):
582 const float32x4_t vk01c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 13});
584 $for C in range(0, CHANNEL_TILE, 4):
586 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_low_f…
588 $for C in range(0, CHANNEL_TILE, 4):
589 const float32x4_t vk11c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 14});
591 $for C in range(0, CHANNEL_TILE, 4):
593 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_low_f…
595 $for C in range(0, CHANNEL_TILE, 4):
596 const float32x4_t vk21c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 15});
598 $for C in range(0, CHANNEL_TILE, 4):
600 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_low_f…
602 $for C in range(0, CHANNEL_TILE, 4):
603 const float32x4_t vk01c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 16});
605 $for C in range(0, CHANNEL_TILE, 4):
607 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_high_…
609 $for C in range(0, CHANNEL_TILE, 4):
610 const float32x4_t vk11c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 17});
612 $for C in range(0, CHANNEL_TILE, 4):
614 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_high_…
616 $for C in range(0, CHANNEL_TILE, 4):
617 const float32x4_t vk21c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 18});
619 $for C in range(0, CHANNEL_TILE, 4):
621 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_high_…
623 $for C in range(0, CHANNEL_TILE, 4):
624 const float32x4_t vk02c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 19});
626 $for C in range(0, CHANNEL_TILE, 4):
628 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c0x${ABC[C:C+4]}, vget_high_…
630 $for C in range(0, CHANNEL_TILE, 4):
631 const float32x4_t vk12c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 20});
633 $for C in range(0, CHANNEL_TILE, 4):
635 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c0x${ABC[C:C+4]}, vget_high_…
637 $for C in range(0, CHANNEL_TILE, 4):
638 const float32x4_t vk22c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 21});
640 $for C in range(0, CHANNEL_TILE, 4):
642 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c0x${ABC[C:C+4]}, vget_high_…
648 $for C in range(0, CHANNEL_TILE, 4):
649 const float32x4_t vk02c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 22});
651 $for C in range(0, CHANNEL_TILE, 4):
653 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c1x${ABC[C:C+4]}, vi${Y*2}x2…
655 $for C in range(0, CHANNEL_TILE, 4):
656 const float32x4_t vk12c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 23});
658 $for C in range(0, CHANNEL_TILE, 4):
660 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c1x${ABC[C:C+4]}, vi${Y*2+1}…
662 $for C in range(0, CHANNEL_TILE, 4):
663 const float32x4_t vk22c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 24});
665 $for C in range(0, CHANNEL_TILE, 4):
667 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c1x${ABC[C:C+4]}, vi${Y*2+2}…
669 $for C in range(0, CHANNEL_TILE, 4):
670 const float32x4_t vk02c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 25});
672 $for C in range(0, CHANNEL_TILE, 4):
674 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk02c2x${ABC[C:C+4]}, vi${Y*2}x2…
676 $for C in range(0, CHANNEL_TILE, 4):
677 const float32x4_t vk12c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 26});
679 $for C in range(0, CHANNEL_TILE, 4):
681 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk12c2x${ABC[C:C+4]}, vi${Y*2+1}…
683 $for C in range(0, CHANNEL_TILE, 4):
684 const float32x4_t vk22c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 27});
686 $for C in range(0, CHANNEL_TILE, 4):
688 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk22c2x${ABC[C:C+4]}, vi${Y*2+2}…
697 $for C in range(0, CHANNEL_TILE, 4):
699 vo${Y}c${ABC[C:C+4]} = vmaxq_f32(vo${Y}c${ABC[C:C+4]}, vmin);
701 $for C in range(0, CHANNEL_TILE, 4):
703 vo${Y}c${ABC[C:C+4]} = vminq_f32(vo${Y}c${ABC[C:C+4]}, vmax);
705 if XNN_LIKELY(c >= ${CHANNEL_TILE}) {
707 vst1q_f32(o${Y}, vo${Y}c${ABC[0:4]});
708 $for C in range(4, CHANNEL_TILE, 4):
709 vst1q_f32(o${Y} + 4, vo${Y}c${ABC[C:C+4]});
718 float32x2_t vo${Y}c${ABC[0:2]} = vget_low_f32(vo${Y}c${ABC[0:4]});
719 if (c & ${1 << LOG2_CHANNEL_TILE}) {
721 $for C in range(0, 1 << (LOG2_CHANNEL_TILE - 1), 4):
723 vst1q_f32(o${Y}_tmp, vo${Y}c${ABC[C:C+4]}); o${Y}_tmp += 4;
724 … vo${Y}c${ABC[C:C+4]} = vo${Y}c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
727 vst1_f32(o${Y}_tmp, vo${Y}c${ABC[0:2]}); o${Y}_tmp += 2;
728 vo${Y}c${ABC[0:2]} = vget_high_f32(vo${Y}c${ABC[0:4]});
729 $elif LOG2_CHANNEL_TILE == 0:
731 vst1_lane_f32(o${Y}_tmp, vo${Y}c${ABC[0:2]}, 0);
739 float32x4_t vo0c${ABC[0:4]} = vld1q_f32(w);
740 $for C in range(4, CHANNEL_TILE, 4):
741 float32x4_t vo0c${ABC[C:C+4]} = vld1q_f32(w + ${C});
743 $for C in range(0, CHANNEL_TILE, 4):
744 float32x4_t vo${Y}c${ABC[C:C+4]} = vo0c${ABC[C:C+4]};
746 $for C in range(0, CHANNEL_TILE, 4):
747 const float32x4_t vk00c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE});
749 $for C in range(0, CHANNEL_TILE, 4):
751 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c0x${ABC[C:C+4]}, vget_low_f…
753 $for C in range(0, CHANNEL_TILE, 4):
754 const float32x4_t vk10c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 2});
756 $for C in range(0, CHANNEL_TILE, 4):
758 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c0x${ABC[C:C+4]}, vget_low_f…
760 $for C in range(0, CHANNEL_TILE, 4):
761 const float32x4_t vk20c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 3});
763 $for C in range(0, CHANNEL_TILE, 4):
765 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c0x${ABC[C:C+4]}, vget_low_f…
767 $for C in range(0, CHANNEL_TILE, 4):
768 const float32x4_t vk00c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 4});
770 $for C in range(0, CHANNEL_TILE, 4):
772 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c1x${ABC[C:C+4]}, vget_high_…
774 $for C in range(0, CHANNEL_TILE, 4):
775 const float32x4_t vk10c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 5});
777 $for C in range(0, CHANNEL_TILE, 4):
779 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c1x${ABC[C:C+4]}, vget_high_…
781 $for C in range(0, CHANNEL_TILE, 4):
782 const float32x4_t vk20c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 6});
784 $for C in range(0, CHANNEL_TILE, 4):
786 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c1x${ABC[C:C+4]}, vget_high_…
788 $for C in range(0, CHANNEL_TILE, 4):
789 const float32x4_t vk00c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 7});
791 $for C in range(0, CHANNEL_TILE, 4):
793 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk00c2x${ABC[C:C+4]}, vget_high_…
795 $for C in range(0, CHANNEL_TILE, 4):
796 const float32x4_t vk10c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 8});
798 $for C in range(0, CHANNEL_TILE, 4):
800 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk10c2x${ABC[C:C+4]}, vget_high_…
802 $for C in range(0, CHANNEL_TILE, 4):
803 const float32x4_t vk20c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 9});
805 $for C in range(0, CHANNEL_TILE, 4):
807 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk20c2x${ABC[C:C+4]}, vget_high_…
813 $for C in range(0, CHANNEL_TILE, 4):
814 const float32x4_t vk01c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 10});
816 $for C in range(0, CHANNEL_TILE, 4):
818 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c0x${ABC[C:C+4]}, vget_low_f…
820 $for C in range(0, CHANNEL_TILE, 4):
821 const float32x4_t vk11c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 11});
823 $for C in range(0, CHANNEL_TILE, 4):
825 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c0x${ABC[C:C+4]}, vget_low_f…
827 $for C in range(0, CHANNEL_TILE, 4):
828 const float32x4_t vk21c0x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 12});
830 $for C in range(0, CHANNEL_TILE, 4):
832 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c0x${ABC[C:C+4]}, vget_low_f…
834 $for C in range(0, CHANNEL_TILE, 4):
835 const float32x4_t vk01c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 13});
837 $for C in range(0, CHANNEL_TILE, 4):
839 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c1x${ABC[C:C+4]}, vget_low_f…
841 $for C in range(0, CHANNEL_TILE, 4):
842 const float32x4_t vk11c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 14});
844 $for C in range(0, CHANNEL_TILE, 4):
846 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c1x${ABC[C:C+4]}, vget_low_f…
848 $for C in range(0, CHANNEL_TILE, 4):
849 const float32x4_t vk21c1x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 15});
851 $for C in range(0, CHANNEL_TILE, 4):
853 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c1x${ABC[C:C+4]}, vget_low_f…
855 $for C in range(0, CHANNEL_TILE, 4):
856 const float32x4_t vk01c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 16});
858 $for C in range(0, CHANNEL_TILE, 4):
860 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk01c2x${ABC[C:C+4]}, vget_high_…
862 $for C in range(0, CHANNEL_TILE, 4):
863 const float32x4_t vk11c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 17});
865 $for C in range(0, CHANNEL_TILE, 4):
867 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk11c2x${ABC[C:C+4]}, vget_high_…
869 $for C in range(0, CHANNEL_TILE, 4):
870 const float32x4_t vk21c2x${ABC[C:C+4]} = vld1q_f32(w + ${C + CHANNEL_TILE * 18});
872 $for C in range(0, CHANNEL_TILE, 4):
874 …vo${Y}c${ABC[C:C+4]} = ${VMULADDQ_LANE_F32}(vo${Y}c${ABC[C:C+4]}, vk21c2x${ABC[C:C+4]}, vget_high_…
880 $for C in range(0, CHANNEL_TILE, 4):
882 vo${Y}c${ABC[C:C+4]} = vmaxq_f32(vo${Y}c${ABC[C:C+4]}, vmin);
884 $for C in range(0, CHANNEL_TILE, 4):
886 vo${Y}c${ABC[C:C+4]} = vminq_f32(vo${Y}c${ABC[C:C+4]}, vmax);
888 if XNN_LIKELY(c >= ${CHANNEL_TILE}) {
890 vst1q_f32(o${Y}, vo${Y}c${ABC[0:4]});
891 $for C in range(4, CHANNEL_TILE, 4):
892 vst1q_f32(o${Y} + 4, vo${Y}c${ABC[C:C+4]});
901 float32x2_t vo${Y}c${ABC[0:2]} = vget_low_f32(vo${Y}c${ABC[0:4]});
902 if (c & ${1 << LOG2_CHANNEL_TILE}) {
904 $for C in range(0, 1 << (LOG2_CHANNEL_TILE - 1), 4):
906 vst1q_f32(o${Y}_tmp, vo${Y}c${ABC[C:C+4]}); o${Y}_tmp += 4;
907 … vo${Y}c${ABC[C:C+4]} = vo${Y}c${ABC[C+(1<<LOG2_CHANNEL_TILE):C+(1<<LOG2_CHANNEL_TILE)+4]};
910 vst1_f32(o${Y}_tmp, vo${Y}c${ABC[0:2]}); o${Y}_tmp += 2;
911 vo${Y}c${ABC[0:2]} = vget_high_f32(vo${Y}c${ABC[0:4]});
912 $elif LOG2_CHANNEL_TILE == 0:
914 vst1_lane_f32(o${Y}_tmp, vo${Y}c${ABC[0:2]}, 0);
932 c = doz(c, ${CHANNEL_TILE});
933 } while (c != 0);