// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8
#
# Units as consumed by this code:
#   kc is counted in bytes of A per row (32 bytes = 8 floats per main-loop pass).
#   ks is counted in bytes of the indirection array (48 bytes = 6 pointers per pass).
#   nc is counted in output columns (8 per pass of the nc loop).
#
# x0 is recycled: it holds mr on entry, then the running k counter, and finally
# cn_stride once it is reloaded from the stack just before the stores.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
#
# Stack frame (96 bytes, allocated by the first STP with pre-decrement):
#   [sp +   0]  d8,  d9
#   [sp +  16]  d10, d11
#   [sp +  32]  d12, d13
#   [sp +  48]  d14, d15
#   [sp +  64]  x20, x21
#   [sp +  80]  x22, x23
#   [sp +  96]  cn_stride   (caller stack argument)
#   [sp + 104]  a_offset    (caller stack argument)
#   [sp + 112]  zero        (caller stack argument)
#   [sp + 120]  params      (caller stack argument)

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x10 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
#
# v6 and v7 do double duty: second A set for rows 0 and 1 inside the unrolled
# loop, then the two clamp bounds once the last FMLA reading them has issued.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57

        # Clamp C pointers / Save d8-d15 on stack
        # Rows at index >= mr alias the previous row pointer, so their stores
        # land on a row that is valid for this call.
        # Each CMP feeds two CSELs (LO, then LS); the STP and ADD in between
        # leave the condition flags untouched.
        STP  d8,  d9, [sp, -96]!
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x10, x17, x7         // c3 = c2 + cm_stride
        CSEL x10, x17, x10, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x13, x10, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x13, x10, x13, LS   //   c4 = c3

        # Save x20,x21,x22,x23 on stack
        STP x20, x21, [sp, 64]
        STP x22, x23, [sp, 80]

        CMP x0, 6                // if mr < 6
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load a_offset
        LDR x11, [sp, 104]

        # Load zero, params pointer
        LDP x12, x8, [sp, 112]

        # nc loop entry: one pass produces a 6 x 8 tile of C.
0:
        # Load initial bias from w into accumulators
        # The same 8 bias values seed all six rows.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        MOV x9, x3  // p = ks

        # ks loop entry: one pass consumes 6 A pointers and kc bytes from each.
1:
        # Load next 6 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        # A pointer equal to the zero buffer is used as is; any other pointer
        # is advanced by a_offset.
        CMP x14, x12            // if a0 == zero
        ADD x14, x14, x11       // a0 += a_offset
        CSEL x14, x12, x14, EQ  //   a0 = zero, else a0 += a_offset
        CMP x15, x12            // if a1 == zero
        ADD x15, x15, x11       // a1 += a_offset
        CSEL x15, x12, x15, EQ  //   a1 = zero, else a1 += a_offset
        CMP x20, x12            // if a2 == zero
        ADD x20, x20, x11       // a2 += a_offset
        CSEL x20, x12, x20, EQ  //   a2 = zero, else a2 += a_offset
        CMP x21, x12            // if a3 == zero
        ADD x21, x21, x11       // a3 += a_offset
        CSEL x21, x12, x21, EQ  //   a3 = zero, else a3 += a_offset
        CMP x22, x12            // if a4 == zero
        ADD x22, x22, x11       // a4 += a_offset
        CSEL x22, x12, x22, EQ  //   a4 = zero, else a4 += a_offset
        CMP x23, x12            // if a5 == zero
        ADD x23, x23, x11       // a5 += a_offset
        CSEL x23, x12, x23, EQ  //   a5 = zero, else a5 += a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 5f

        # Prologue - loads for main loop of 96 FMA
        LDR   q0, [x14], 16
        LDR   q1, [x15], 16
        LDR   q2, [x20], 16
        LDR   q3, [x21], 16
        LDR   q4, [x22], 16
        LDR   q5, [x23], 16
        LDP  q12, q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP  q14, q15, [x5], 32
        LDP  q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
        # Loads are interleaved with the FMLAs: each register is refilled only
        # after the last FMLA that reads its previous contents.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18, q19, [x5], 32        // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6, [x14], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7, [x15], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x20], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x21], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x22], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11, [x23], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12, q13, [x5], 32        // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14, q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16, q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        LDR   q0, [x14], 16            // Load next 6 A
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR   q1, [x15], 16
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        LDR   q2, [x20], 16
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR   q3, [x21], 16

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        LDR   q4, [x22], 16
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR   q5, [x23], 16
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        LDP  q12, q13, [x5], 32        // Load next 3 B (not last)
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP  q14, q15, [x5], 32

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP  q16,  q17, [x5], 32

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        SUBS x0, x0, 32                // k -= 32; flags stay live until B.HS
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]
        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 2b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18, q19, [x5], 32        // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6, [x14], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7, [x15], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x20], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x21], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x22], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11, [x23], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12, q13, [x5], 32        // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14, q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16, q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]

        # Load min/max values
        # Safe to overwrite v6/v7 here: the two FMLAs above were the last
        # readers of the A values they held.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 48  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        # v6 is the lower bound (FMAX), v7 is the upper bound (FMIN).
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR x0, [sp, 96]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8                 // nc -= 8; flags feed B.LO and B.HI below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Fewer than 8 columns left takes the partial-width path instead.
        B.LO 8f

        # Rows are written from c5 down to c0.
        STP q30, q31,  [x7]
        ADD x7, x7, x0
        STP q28, q29, [x13]
        ADD x13, x13, x0
        STP q26, q27, [x10]
        ADD x10, x10, x0
        STP q24, q25, [x17]
        ADD x17, x17, x0
        STP q22, q23, [x16]
        ADD x16, x16, x0
        STP q20, q21,  [x6]
        ADD  x6,  x6, x0

        SUB x4, x4, x3  // a -= ks

        # nc loop
        # Flags are still those of SUBS x1 above: continue while nc remains.
        B.HI 0b

        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 96
        RET

        # Remainder of kc (fewer than 8 floats left).  Reached either directly
        # when kc < 32 or from the epilogue when k has low bits set.  Bits 4, 3
        # and 2 of x0 select the 4-, 2- and 1-float steps below.
5:
        # Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0, [x14], 16
        LDR   q1, [x15], 16
        LDR   q2, [x20], 16
        LDR   q3, [x21], 16
        LDR   q4, [x22], 16
        LDR   q5, [x23], 16
        # Load B
        LDP  q12, q13, [x5], 32
        LDP  q14, q15, [x5], 32
        LDP  q16, q17, [x5], 32
        LDP  q18, q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0, [x14], 8
        LDR   d1, [x15], 8
        LDR   d2, [x20], 8
        LDR   d3, [x21], 8
        LDR   d4, [x22], 8
        LDR   d5, [x23], 8
        # Load B
        LDP  q12, q13, [x5], 32
        LDP  q14, q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0, [x14], 4
        LDR   s1, [x15], 4
        LDR   s2, [x20], 4
        LDR   s3, [x21], 4
        LDR   s4, [x22], 4
        LDR   s5, [x23], 4
        # Load B
        LDP  q12, q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        B 4b

        # Store odd width
        # x1 holds nc - 8 here; subtracting 8 leaves bits 2..0 unchanged, so
        # they still encode the number of columns left (4, 2 and 1).
        # After each partial store the remaining lanes are shifted down so the
        # next store again reads from lane 0.
8:
        TBZ x1, 2, 9f
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d30,  [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x10], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s30,  [x7]
        STR s28, [x13]
        STR s26, [x10]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
11:
        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 96
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack of the final image as executable because of this object.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
