// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Summary
#   For each group of 8 output columns the 6 row accumulators are seeded from
#   the next 8 floats at w (bias), updated with FMLA over kc bytes of every A
#   row against the B data that follows the bias in w, clamped with FMAX/FMIN
#   against the two floats at params, and stored through the 6 C pointers.
#   Row pointers at index >= mr alias the previous row, so those rows re-read
#   and re-store the same data as the last valid row.
#   The kc argument is a byte count (4 bytes per float).
#   Register x0 holds mr on entry, then the remaining-k byte counter, then
#   cn_stride while storing.
#   The function uses 64 bytes of stack to preserve d8-d15.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57

        # Load params pointer
        # (read before sp is moved, so the incoming offset of 8 still applies)
        LDR x8, [sp, 8]

        # Clamp A and C pointers / Save d8-d15 on stack
        # The STPs are interleaved with the pointer setup.  ADD does not write
        # the flags, so each CMP feeds two CSEL pairs: LO first, then LS.
        STP  d8,  d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # From here on x4 and x7 hold a5 and c5; the strides are consumed.
        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Outer loop: one pass per group of 8 output columns.
0:
        # Load initial bias from w into accumulators
        # All 6 rows start from the same 8 bias floats.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR   q0,  [x3], 16
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        LDP  q12,  q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32

        # Are there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6,  [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7,  [x9], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x10], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x11], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x12], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11,  [x4], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14,  q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        LDR   q0,  [x3], 16           // Load next 6 A
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR   q1,  [x9], 16
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        LDR   q2, [x10], 16
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR   q3, [x11], 16

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        LDR   q4, [x12], 16
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR   q5,  [x4], 16
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        LDP  q12,  q13, [x5], 32       // Load next 3 B (not last)
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP  q16,  q17, [x5], 32

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        SUBS x0, x0, 32                // k -= 32; FMLA leaves the flags intact for B.HS
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]
        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6,  [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7,  [x9], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x10], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x11], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x12], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11,  [x4], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14,  q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]

        # Load min/max values
        # v6 and v7 are free now: the two FMLAs above were their last use as A.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
        # v6 is the lower bound (FMAX) and v7 the upper bound (FMIN); they are
        # the first and second float at params, replicated by LD2R.
3:
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        # (first stack argument; it sits 64 bytes above sp after the d8-d15 push)
        LDR x0, [sp, 64]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8                 // nc -= 8; flags feed both B.LO 7f and B.HI 0b
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Fewer than 8 columns left: take the partial store path instead.
        B.LO 7f

        # Each row: store 8 floats, step c by cn_stride, rewind a by kc.
        STP q20, q21,  [x6]
        ADD  x6,  x6, x0
        SUB  x3,  x3, x2 // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0
        SUB  x9,  x9, x2 // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0
        SUB x10, x10, x2 // a2 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0
        SUB x11, x11, x2 // a3 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0
        SUB x12, x12, x2 // a4 -= kc
        STP q30, q31,  [x7]
        ADD x7, x7, x0
        SUB  x4,  x4, x2 // a5 -= kc

        # Loop while columns remain (flags still come from SUBS x1 above).
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

        # Remainder of k.  Reached with kc below 32 bytes (before the prologue)
        # or from the epilogue when the low 5 bits of x0 are not all zero.
        # Bits 4, 3 and 2 of x0 select the 4, 2 and 1 float steps below.
4:
        # Load min/max values
        # Loaded again here because the short-kc entry skips the earlier load.
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0,  [x3], 16
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32
        LDP  q18,  q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0,  [x3], 8
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0,  [x3], 4
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        # Load B
        LDP  q12,  q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        B 3b

        # Store odd width
        # Here x1 is the column count minus 8; its low 3 bits equal those of
        # the column count.  Bits 2, 1 and 0 select stores of 4, 2 and 1 floats
        # per row, and the remaining lanes are shifted down after each store.
7:
        TBZ x1, 2, 8f
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        STR d20,  [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30,  [x7], 8
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s20,  [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x14]
        STR s28, [x13]
        STR s30,  [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57

// On ELF targets emit an empty .note.GNU-stack section; GNU toolchains read
// it as a statement that this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
