// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_5x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# Summary (derived from the code below):
#   For each tile of 8 output columns, every row m of the 5-row tile gets
#     c[m][0..7] = min(max(acc[m][0..7] + sum over k of a[m][k] * w[k][0..7], v30), v31)
#   where v30 and v31 are the two consecutive floats at params, each
#   broadcast to all 4 lanes.
#   - kc is a byte count: the k loops step 32 bytes (8 floats) at a time and
#     the tail is decoded from bits 4, 3 and 2 of kc (16, 8 and 4 bytes).
#   - w is read sequentially, 8 floats (32 bytes) per k step.
#   - acc is read sequentially, 5 rows x 8 floats (160 bytes) per column tile.
#   - Rows at or above mr reuse the pointers of the previous row, so all 5 rows
#     are always computed and stored.  Stores run from row 4 down to row 0.
#   - x0 (mr) is only needed for the pointer setup and is reused as the
#     k counter afterwards.

# Registers that a 6-row kernel would use for a sixth row (a5 / c5) and that
# this 5-row kernel does not use for that purpose.
# NOTE(review): the template text said compared to 5x8 - presumably 6x8 was meant.
#  x4 a5        (holds a_stride in this kernel)
#  x7 c5        (holds cm_stride, then c4 in this kernel)
# A5  v10 v11   (not referenced in this kernel)
# C   v30 v31   (hold the clamp bounds in this kernel)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel writes v8, v9 and v12-v15, so d8, d9 and d12-d15 are saved.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_5x8__aarch64_neonfma_cortex_a57

        # Load cn_stride, acc (stack arguments, read before sp is moved)
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Clamp A and C pointers / Save d8-d15 on stack
        # The register saves are interleaved with the pointer setup.
        STP  d8,  d9, [sp, -48]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x13, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x13, x17, x13, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x7, x13, x7          // c4 = c3 + cm_stride (x7 stops being cm_stride here)
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x7, x13, x7, LS     //   c4 = c3

        # Load clamp values: v30 = first float at params, v31 = second float
        LD2R {v30.4s, v31.4s}, [x8]

        # Start of one 5 x 8 output tile.  Re-entered once per 8 columns of nc.
0:
        # Load initial accumulators (5 rows x 8 floats, acc advances 160 bytes)
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 80 FMA
        LDR   q0,  [x3], 16
        LDR   q2,  [x9], 16
        LDR   q4, [x10], 16
        LDR   q6, [x11], 16
        LDR   q8, [x12], 16
        LDP  q12,  q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32

        # Are there at least 8 more floats (32 bytes) for the main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A + 8 LDP B
1:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]
        LDR   q1,  [x3], 16            // Load next 5 A

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        LDR   q3,  [x9], 16
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        LDR   q5, [x10], 16
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        LDR   q7, [x11], 16
        FMLA v29.4s, v15.4s,  v8.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v2.s[2]
        LDR   q9, [x12], 16
        FMLA v24.4s, v16.4s,  v4.s[2]
        FMLA v26.4s, v16.4s,  v6.s[2]
        FMLA v28.4s, v16.4s,  v8.s[2]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v2.s[2]
        FMLA v25.4s, v17.4s,  v4.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v27.4s, v17.4s,  v6.s[2]
        FMLA v29.4s, v17.4s,  v8.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v22.4s, v18.4s,  v2.s[3]
        FMLA v24.4s, v18.4s,  v4.s[3]
        FMLA v26.4s, v18.4s,  v6.s[3]
        FMLA v28.4s, v18.4s,  v8.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v2.s[3]
        FMLA v25.4s, v19.4s,  v4.s[3]
        FMLA v27.4s, v19.4s,  v6.s[3]
        FMLA v29.4s, v19.4s,  v8.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v1.s[0]
        FMLA v22.4s, v12.4s,  v3.s[0]
        FMLA v24.4s, v12.4s,  v5.s[0]
        LDR   q0,  [x3], 16           // Load next 5 A
        FMLA v26.4s, v12.4s,  v7.s[0]
        FMLA v28.4s, v12.4s,  v9.s[0]
        FMLA v21.4s, v13.4s,  v1.s[0]
        LDR   q2,  [x9], 16
        FMLA v23.4s, v13.4s,  v3.s[0]
        FMLA v25.4s, v13.4s,  v5.s[0]
        FMLA v27.4s, v13.4s,  v7.s[0]
        LDR   q4, [x10], 16
        FMLA v29.4s, v13.4s,  v9.s[0]

        FMLA v20.4s, v14.4s,  v1.s[1]
        FMLA v22.4s, v14.4s,  v3.s[1]
        LDR   q6, [x11], 16
        FMLA v24.4s, v14.4s,  v5.s[1]
        FMLA v26.4s, v14.4s,  v7.s[1]
        FMLA v28.4s, v14.4s,  v9.s[1]
        LDR   q8, [x12], 16
        FMLA v21.4s, v15.4s,  v1.s[1]
        FMLA v23.4s, v15.4s,  v3.s[1]
        FMLA v25.4s, v15.4s,  v5.s[1]
        LDP  q12,  q13, [x5], 32       // Load next 3 B (not last)
        FMLA v27.4s, v15.4s,  v7.s[1]
        FMLA v29.4s, v15.4s,  v9.s[1]

        FMLA v20.4s, v16.4s,  v1.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v22.4s, v16.4s,  v3.s[2]
        FMLA v24.4s, v16.4s,  v5.s[2]
        FMLA v26.4s, v16.4s,  v7.s[2]
        FMLA v28.4s, v16.4s,  v9.s[2]
        FMLA v21.4s, v17.4s,  v1.s[2]
        FMLA v23.4s, v17.4s,  v3.s[2]
        FMLA v25.4s, v17.4s,  v5.s[2]
        FMLA v27.4s, v17.4s,  v7.s[2]
        FMLA v29.4s, v17.4s,  v9.s[2]
        LDP  q16,  q17, [x5], 32

        FMLA v20.4s, v18.4s,  v1.s[3]
        FMLA v22.4s, v18.4s,  v3.s[3]
        SUBS x0, x0, 32               // k -= 32; FMLA does not touch the flags
        FMLA v24.4s, v18.4s,  v5.s[3]
        FMLA v26.4s, v18.4s,  v7.s[3]
        FMLA v28.4s, v18.4s,  v9.s[3]
        FMLA v21.4s, v19.4s,  v1.s[3]
        FMLA v23.4s, v19.4s,  v3.s[3]
        FMLA v25.4s, v19.4s,  v5.s[3]
        FMLA v27.4s, v19.4s,  v7.s[3]
        FMLA v29.4s, v19.4s,  v9.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18,  q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]
        LDR   q1,  [x3], 16            // Load next 5 A

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        LDR   q3,  [x9], 16
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        LDR   q5, [x10], 16
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        LDR   q7, [x11], 16
        FMLA v29.4s, v15.4s,  v8.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v2.s[2]
        LDR   q9, [x12], 16
        FMLA v24.4s, v16.4s,  v4.s[2]
        FMLA v26.4s, v16.4s,  v6.s[2]
        FMLA v28.4s, v16.4s,  v8.s[2]
        LDP  q12,  q13, [x5], 32       // Load 4 B
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v2.s[2]
        FMLA v25.4s, v17.4s,  v4.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v27.4s, v17.4s,  v6.s[2]
        FMLA v29.4s, v17.4s,  v8.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        LDP  q16,  q17, [x5], 32
        FMLA v22.4s, v18.4s,  v2.s[3]
        FMLA v24.4s, v18.4s,  v4.s[3]
        FMLA v26.4s, v18.4s,  v6.s[3]
        FMLA v28.4s, v18.4s,  v8.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v2.s[3]
        FMLA v25.4s, v19.4s,  v4.s[3]
        FMLA v27.4s, v19.4s,  v6.s[3]
        FMLA v29.4s, v19.4s,  v8.s[3]
        LDP  q18,  q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s,  v1.s[0]
        FMLA v22.4s, v12.4s,  v3.s[0]
        FMLA v24.4s, v12.4s,  v5.s[0]
        FMLA v26.4s, v12.4s,  v7.s[0]
        FMLA v28.4s, v12.4s,  v9.s[0]
        FMLA v21.4s, v13.4s,  v1.s[0]
        FMLA v23.4s, v13.4s,  v3.s[0]
        FMLA v25.4s, v13.4s,  v5.s[0]
        FMLA v27.4s, v13.4s,  v7.s[0]
        FMLA v29.4s, v13.4s,  v9.s[0]

        FMLA v20.4s, v14.4s,  v1.s[1]
        FMLA v22.4s, v14.4s,  v3.s[1]
        FMLA v24.4s, v14.4s,  v5.s[1]
        FMLA v26.4s, v14.4s,  v7.s[1]
        FMLA v28.4s, v14.4s,  v9.s[1]
        FMLA v21.4s, v15.4s,  v1.s[1]
        FMLA v23.4s, v15.4s,  v3.s[1]
        FMLA v25.4s, v15.4s,  v5.s[1]
        FMLA v27.4s, v15.4s,  v7.s[1]
        FMLA v29.4s, v15.4s,  v9.s[1]

        FMLA v20.4s, v16.4s,  v1.s[2]
        FMLA v22.4s, v16.4s,  v3.s[2]
        FMLA v24.4s, v16.4s,  v5.s[2]
        FMLA v26.4s, v16.4s,  v7.s[2]
        FMLA v28.4s, v16.4s,  v9.s[2]
        FMLA v21.4s, v17.4s,  v1.s[2]
        FMLA v23.4s, v17.4s,  v3.s[2]
        FMLA v25.4s, v17.4s,  v5.s[2]
        FMLA v27.4s, v17.4s,  v7.s[2]
        FMLA v29.4s, v17.4s,  v9.s[2]
        TST x0, 31                    // any k remainder? (low 5 bits of x0 equal those of kc)

        FMLA v20.4s, v18.4s,  v1.s[3]
        FMLA v22.4s, v18.4s,  v3.s[3]
        FMLA v24.4s, v18.4s,  v5.s[3]
        FMLA v26.4s, v18.4s,  v7.s[3]
        FMLA v28.4s, v18.4s,  v9.s[3]
        FMLA v21.4s, v19.4s,  v1.s[3]
        FMLA v23.4s, v19.4s,  v3.s[3]
        FMLA v25.4s, v19.4s,  v5.s[3]
        FMLA v27.4s, v19.4s,  v7.s[3]
        FMLA v29.4s, v19.4s,  v9.s[3]
        B.NE 4f

        # Clamp
        # The flags set by SUBS x1 below stay live through the FMAX / FMIN
        # block and the stores, and drive both B.LO 7f and B.HI 0b.
3:
        FMAX v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 8                // nc -= 8
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMAX v23.4s, v23.4s, v30.4s
        FMAX v24.4s, v24.4s, v30.4s
        FMAX v25.4s, v25.4s, v30.4s
        FMAX v26.4s, v26.4s, v30.4s
        FMAX v27.4s, v27.4s, v30.4s
        FMAX v28.4s, v28.4s, v30.4s
        FMAX v29.4s, v29.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s
        FMIN v23.4s, v23.4s, v31.4s
        FMIN v24.4s, v24.4s, v31.4s
        FMIN v25.4s, v25.4s, v31.4s
        FMIN v26.4s, v26.4s, v31.4s
        FMIN v27.4s, v27.4s, v31.4s
        FMIN v28.4s, v28.4s, v31.4s
        FMIN v29.4s, v29.4s, v31.4s

        # Store full 5 x 8 (fewer than 8 columns left goes to the odd-width path)
        B.LO 7f

        # Rows are stored from c4 down to c0, so when row pointers alias
        # (mr < 5) the lowest row is written last.
        SUB  x3,  x3, x2 // a0 -= kc
        STP q28, q29, [x7]
        ADD x7, x7, x14  // c4 += cn_stride
        SUB  x9,  x9, x2 // a1 -= kc
        STP q26, q27, [x13]
        ADD x13, x13, x14 // c3 += cn_stride
        SUB x10, x10, x2 // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14 // c2 += cn_stride
        SUB x11, x11, x2 // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14 // c1 += cn_stride
        SUB x12, x12, x2 // a4 -= kc
        STP q20, q21,  [x6]
        ADD  x6,  x6, x14 // c0 += cn_stride

        B.HI 0b          // more columns remain (nc - 8 > 0)

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP  d8,  d9, [sp], 48
        RET

        # K remainder (1 to 7 floats).  Entered when kc < 32 bytes and from the
        # end of the epilogue when kc is not a multiple of 32 bytes.
        # x0 differs from kc by a multiple of 32, so bits 4, 3 and 2 of x0 are
        # the same as those of kc.
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0,  [x3], 16
        LDR   q2,  [x9], 16
        LDR   q4, [x10], 16
        LDR   q6, [x11], 16
        LDR   q8, [x12], 16
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32
        LDP  q18,  q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        FMLA v29.4s, v15.4s,  v8.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v2.s[2]
        FMLA v24.4s, v16.4s,  v4.s[2]
        FMLA v26.4s, v16.4s,  v6.s[2]
        FMLA v28.4s, v16.4s,  v8.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v2.s[2]
        FMLA v25.4s, v17.4s,  v4.s[2]
        FMLA v27.4s, v17.4s,  v6.s[2]
        FMLA v29.4s, v17.4s,  v8.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v2.s[3]
        FMLA v24.4s, v18.4s,  v4.s[3]
        FMLA v26.4s, v18.4s,  v6.s[3]
        FMLA v28.4s, v18.4s,  v8.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v2.s[3]
        FMLA v25.4s, v19.4s,  v4.s[3]
        FMLA v27.4s, v19.4s,  v6.s[3]
        FMLA v29.4s, v19.4s,  v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0,  [x3], 8
        LDR   d2,  [x9], 8
        LDR   d4, [x10], 8
        LDR   d6, [x11], 8
        LDR   d8, [x12], 8
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v2.s[1]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v6.s[1]
        FMLA v28.4s, v14.4s,  v8.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v2.s[1]
        FMLA v25.4s, v15.4s,  v4.s[1]
        FMLA v27.4s, v15.4s,  v6.s[1]
        FMLA v29.4s, v15.4s,  v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0,  [x3], 4
        LDR   s2,  [x9], 4
        LDR   s4, [x10], 4
        LDR   s6, [x11], 4
        LDR   s8, [x12], 4
        # Load B
        LDP  q12,  q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v2.s[0]
        FMLA v24.4s, v12.4s,  v4.s[0]
        FMLA v26.4s, v12.4s,  v6.s[0]
        FMLA v28.4s, v12.4s,  v8.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v2.s[0]
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v6.s[0]
        FMLA v29.4s, v13.4s,  v8.s[0]
        B 3b

        # Store odd width (fewer than 8 columns left)
        # x1 is now nc - 8; its low 3 bits equal those of the remaining nc.
        # After each partial store the unstored lanes are moved down to lane 0.
7:
        TBZ x1, 2, 8f
        STR q28, [x7], 16
        MOV v28.16b, v29.16b
        STR q26, [x13], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d28, [x7], 8
        DUP d28, v28.d[1]
        STR d26, [x13], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s28, [x7]
        STR s26, [x13]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP  d8,  d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_5x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
