1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> (x0)
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27#  x3 a0
28#  x9 a1
29# x10 a2
30# x11 a3
31# x12 a4
32#  x4 a5
33
34# C pointers
35#  x6 c0
36# x16 c1
37# x17 c2
38# x14 c3
39# x13 c4
40#  x7 c5
41
42# Vector register usage
43# A0   v0  v6
44# A1   v1  v7
45# A2   v2  v8
46# A3   v3  v9
47# A4   v4 v10
48# A5   v5 v11
49# B   v12 v13 v14 v15
50# B   v16 v17 v18 v19
51# C   v20 v21
52# C   v22 v23
53# C   v24 v25
54# C   v26 v27
55# C   v28 v29
56# C   v30 v31
57# Clamp v6 v7
58
59BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75
60
        # Entry block: fetch the params pointer, spill d8-d15, then derive the
        # A row pointers a1-a5 and C row pointers c1-c5 from a0/c0 and the
        # strides. Any row at index >= mr is aliased to the row before it, so
        # the code below can process 6 rows unconditionally.
61        # Load params pointer
        # Done before the pre-indexed STP below moves sp, so offset 8 is
        # relative to the sp value at function entry.
62        LDR x8, [sp, 8]
63
64        # Clamp A and C pointers / Save d8-d15 on stack
        # Pre-indexed store: sp is lowered by 64 bytes and d8/d9 go to the new sp.
65        STP  d8,  d9, [sp, -64]!
66        CMP x0, 2                // if mr < 2
67        ADD x9, x3, x4           // a1 = a0 + a_stride
68        ADD x16, x6, x7          // c1 = c0 + cm_stride
69        CSEL x9, x3, x9, LO      //   a1 = a0
70        CSEL x16, x6, x16, LO    //   c1 = c0
71
72        STP d10, d11, [sp, 16]
73        ADD x10, x9, x4          // a2 = a1 + a_stride
74        ADD x17, x16, x7         // c2 = c1 + cm_stride
        # ADD, CSEL and STP leave the flags alone, so LS below still tests the
        # CMP x0, 2 result from above.
75                                 // if mr <= 2
76        CSEL x10, x9, x10, LS    //   a2 = a1
77        CSEL x17, x16, x17, LS   //   c2 = c1
78
79        STP d12, d13, [sp, 32]
80        CMP x0, 4                // if mr < 4
81        ADD x11, x10, x4         // a3 = a2 + a_stride
82        ADD x14, x17, x7         // c3 = c2 + cm_stride
83        CSEL x11, x10, x11, LO   //   a3 = a2
84        CSEL x14, x17, x14, LO   //   c3 = c2
85
86        STP d14, d15, [sp, 48]
87        ADD x12, x11, x4         // a4 = a3 + a_stride
88        ADD x13, x14, x7         // c4 = c3 + cm_stride
        # Same pattern as for row 2: LS reuses the flags of CMP x0, 4.
89                                 // if mr <= 4
90        CSEL x12, x11, x12, LS   //   a4 = a3
91        CSEL x13, x14, x13, LS   //   c4 = c3
92
        # The last row overwrites the stride registers: from here on x4 holds
        # a5 and x7 holds c5, and the stride values are not read again.
93        CMP x0, 6                // if mr < 6
94        ADD x4, x12, x4          // a5 = a4 + a_stride
95        ADD x7, x13, x7          // c5 = c4 + cm_stride
96        CSEL x4, x12, x4, LO     //   a5 = a4
97        CSEL x7, x13, x7, LO     //   c5 = c4
98
990:
        # Head of the loop over output columns. The full-width store path
        # branches back here (B.HI 0b) while columns remain.
100        # Load initial bias from w into accumulators
        # q20/q21 receive 8 floats from w (post-incrementing x5 by 32 bytes);
        # the MOVs replicate them into the accumulator pairs of the other five
        # rows. Prefetch hints are interleaved with the MOVs.
101        LDP q20, q21, [x5], 32
102        MOV v22.16b, v20.16b
103        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
104        MOV v23.16b, v21.16b
105        PRFM PLDL1KEEP, [x5, 64]
106        MOV v24.16b, v20.16b
107        PRFM PLDL1KEEP, [x5, 128]
108        MOV v25.16b, v21.16b
109        PRFM PLDL1KEEP, [x5, 192]
110        MOV v26.16b, v20.16b
111        PRFM PLDL1KEEP,  [x3]    // Prefetch A
112        MOV v27.16b, v21.16b
113        PRFM PLDL1KEEP,  [x9]
114        MOV v28.16b, v20.16b
115        PRFM PLDL1KEEP, [x10]
116        MOV v29.16b, v21.16b
117        PRFM PLDL1KEEP, [x11]
118        MOV v30.16b, v20.16b
119        PRFM PLDL1KEEP, [x12]
120        MOV v31.16b, v21.16b
121        PRFM PLDL1KEEP,  [x4]
122
123        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # From here x0 is the k byte counter; mr in x0 was last read by the
        # pointer clamping at function entry. A borrow (kc < 32) skips
        # straight to the remainder code at label 4.
124        SUBS x0, x2, 32  // k = kc - 32
125        B.LO 4f
126
127        # Prologue - loads for main loop of 96 FMA
        # 4 floats of A per row go to v0-v5, each pointer advancing 16 bytes.
128        LDR   q0,  [x3], 16
129        LDR   q1,  [x9], 16
130        LDR   q2, [x10], 16
131        LDR   q3, [x11], 16
132        LDR   q4, [x12], 16
133        LDR   q5,  [x4], 16
        # Three of the four B register pairs are loaded here; the fourth
        # (q18/q19) is loaded by the first LDP inside label 1 or label 2.
134        LDP  q12,  q13, [x5], 32  // Fetch 3 B (4th deferred)
135        LDP  q14,  q15, [x5], 32
136        LDP  q16,  q17, [x5], 32
137
138        # Is there at least 8 floats (32 bytes) for main loop?
        # A borrow here means only one 32-byte step is available, which the
        # epilogue at label 2 handles without preloading further data.
139        SUBS x0, x0, 32
140        B.LO 2f
141
142        # Main loop - 8 floats of A (32 bytes)
143        # 96 FMA + 12 LDR A + 8 LDP B
1441:
        # On entry v0-v5 hold 4 floats of A per row and v12-v17 hold three
        # B register pairs. The first group multiplies by v0-v5 while loading
        # the next 4 floats of A into v6-v11; the second group multiplies by
        # v6-v11 while reloading v0-v5 and v12-v17 for the next pass.
        # Each FMLA multiplies 4 columns of B by one lane of A (by-element form).
145        # First group of 4 A.  48 FMA.
146        FMLA v20.4s, v12.4s,  v0.s[0]
147        LDP  q18,  q19, [x5], 32      // Load last B
148        FMLA v22.4s, v12.4s,  v1.s[0]
149        FMLA v24.4s, v12.4s,  v2.s[0]
150        FMLA v26.4s, v12.4s,  v3.s[0]
151        FMLA v28.4s, v12.4s,  v4.s[0]
152        FMLA v30.4s, v12.4s,  v5.s[0]
153        FMLA v21.4s, v13.4s,  v0.s[0]
154        FMLA v23.4s, v13.4s,  v1.s[0]
155        FMLA v25.4s, v13.4s,  v2.s[0]
156        FMLA v27.4s, v13.4s,  v3.s[0]
157        FMLA v29.4s, v13.4s,  v4.s[0]
158
159        FMLA v31.4s, v13.4s,  v5.s[0]
160        FMLA v20.4s, v14.4s,  v0.s[1]
161        PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
162        FMLA v22.4s, v14.4s,  v1.s[1]
163        FMLA v24.4s, v14.4s,  v2.s[1]
164        FMLA v26.4s, v14.4s,  v3.s[1]
165        FMLA v28.4s, v14.4s,  v4.s[1]
166        PRFM PLDL1KEEP, [x5, 256]
167        FMLA v30.4s, v14.4s,  v5.s[1]
168        FMLA v21.4s, v15.4s,  v0.s[1]
169        FMLA v23.4s, v15.4s,  v1.s[1]
170        FMLA v25.4s, v15.4s,  v2.s[1]
171        LDR   q6,  [x3], 16            // Load next 6 A
172        FMLA v27.4s, v15.4s,  v3.s[1]
173        FMLA v29.4s, v15.4s,  v4.s[1]
174        FMLA v31.4s, v15.4s,  v5.s[1]
175        LDR   q7,  [x9], 16
176
177        FMLA v20.4s, v16.4s,  v0.s[2]
178        FMLA v22.4s, v16.4s,  v1.s[2]
179        FMLA v24.4s, v16.4s,  v2.s[2]
180        LDR   q8, [x10], 16
181        FMLA v26.4s, v16.4s,  v3.s[2]
182        FMLA v28.4s, v16.4s,  v4.s[2]
183        FMLA v30.4s, v16.4s,  v5.s[2]
184        LDR   q9, [x11], 16
185        FMLA v21.4s, v17.4s,  v0.s[2]
186        FMLA v23.4s, v17.4s,  v1.s[2]
187        FMLA v25.4s, v17.4s,  v2.s[2]
188        LDR   q10, [x12], 16
189        FMLA v27.4s, v17.4s,  v3.s[2]
190        FMLA v29.4s, v17.4s,  v4.s[2]
191        FMLA v31.4s, v17.4s,  v5.s[2]
192        LDR  q11,  [x4], 16
193
        # v12-v17 were last read in the lane 0-2 steps above, so they are
        # overwritten with the B values for the second group while the
        # lane 3 step still reads v18/v19; v18/v19 are reloaded last.
194        FMLA v20.4s, v18.4s,  v0.s[3]
195        FMLA v22.4s, v18.4s,  v1.s[3]
196        FMLA v24.4s, v18.4s,  v2.s[3]
197        LDP  q12,  q13, [x5], 32       // Load 4 B
198        FMLA v26.4s, v18.4s,  v3.s[3]
199        FMLA v28.4s, v18.4s,  v4.s[3]
200        FMLA v30.4s, v18.4s,  v5.s[3]
201        LDP  q14,  q15, [x5], 32
202        FMLA v21.4s, v19.4s,  v0.s[3]
203        FMLA v23.4s, v19.4s,  v1.s[3]
204        FMLA v25.4s, v19.4s,  v2.s[3]
205        LDP  q16,  q17, [x5], 32
206        FMLA v27.4s, v19.4s,  v3.s[3]
207        FMLA v29.4s, v19.4s,  v4.s[3]
208        FMLA v31.4s, v19.4s,  v5.s[3]
209        LDP  q18,  q19, [x5], 32
210
211        # Second group of 4 A.  48 FMA.
212        FMLA v20.4s, v12.4s,  v6.s[0]
213        FMLA v22.4s, v12.4s,  v7.s[0]
214        FMLA v24.4s, v12.4s,  v8.s[0]
215        LDR   q0,  [x3], 16           // Load next 6 A
216        FMLA v26.4s, v12.4s,  v9.s[0]
217        FMLA v28.4s, v12.4s, v10.s[0]
218        FMLA v30.4s, v12.4s, v11.s[0]
219        LDR   q1,  [x9], 16
220        FMLA v21.4s, v13.4s,  v6.s[0]
221        FMLA v23.4s, v13.4s,  v7.s[0]
222        FMLA v25.4s, v13.4s,  v8.s[0]
223        LDR   q2, [x10], 16
224        FMLA v27.4s, v13.4s,  v9.s[0]
225        FMLA v29.4s, v13.4s, v10.s[0]
226        FMLA v31.4s, v13.4s, v11.s[0]
227        LDR   q3, [x11], 16
228
229        FMLA v20.4s, v14.4s,  v6.s[1]
230        FMLA v22.4s, v14.4s,  v7.s[1]
231        FMLA v24.4s, v14.4s,  v8.s[1]
232        LDR   q4, [x12], 16
233        FMLA v26.4s, v14.4s,  v9.s[1]
234        FMLA v28.4s, v14.4s, v10.s[1]
235        FMLA v30.4s, v14.4s, v11.s[1]
236        LDR   q5,  [x4], 16
237        FMLA v21.4s, v15.4s,  v6.s[1]
238        FMLA v23.4s, v15.4s,  v7.s[1]
239        FMLA v25.4s, v15.4s,  v8.s[1]
240        LDP  q12,  q13, [x5], 32       // Load next 3 B (not last)
241        FMLA v27.4s, v15.4s,  v9.s[1]
242        FMLA v29.4s, v15.4s, v10.s[1]
243        FMLA v31.4s, v15.4s, v11.s[1]
244        LDP  q14,  q15, [x5], 32
245
246        FMLA v20.4s, v16.4s,  v6.s[2]
247        FMLA v22.4s, v16.4s,  v7.s[2]
248        FMLA v24.4s, v16.4s,  v8.s[2]
249        FMLA v26.4s, v16.4s,  v9.s[2]
250        FMLA v28.4s, v16.4s, v10.s[2]
251        FMLA v30.4s, v16.4s, v11.s[2]
252        FMLA v21.4s, v17.4s,  v6.s[2]
253        FMLA v23.4s, v17.4s,  v7.s[2]
254        FMLA v25.4s, v17.4s,  v8.s[2]
255        FMLA v27.4s, v17.4s,  v9.s[2]
256        FMLA v29.4s, v17.4s, v10.s[2]
257        FMLA v31.4s, v17.4s, v11.s[2]
258        LDP  q16,  q17, [x5], 32
259
        # The counter update sits well ahead of the branch; the FMLAs in
        # between do not touch the flags. B.HS repeats the loop while the
        # subtraction did not borrow, i.e. another 32 bytes of k remained.
260        FMLA v20.4s, v18.4s,  v6.s[3]
261        FMLA v22.4s, v18.4s,  v7.s[3]
262        SUBS x0, x0, 32
263        FMLA v24.4s, v18.4s,  v8.s[3]
264        FMLA v26.4s, v18.4s,  v9.s[3]
265        FMLA v28.4s, v18.4s, v10.s[3]
266        FMLA v30.4s, v18.4s, v11.s[3]
267        FMLA v21.4s, v19.4s,  v6.s[3]
268        FMLA v23.4s, v19.4s,  v7.s[3]
269        FMLA v25.4s, v19.4s,  v8.s[3]
270        FMLA v27.4s, v19.4s,  v9.s[3]
271        FMLA v29.4s, v19.4s, v10.s[3]
272        FMLA v31.4s, v19.4s, v11.s[3]
273        B.HS 1b
274
275        # Epilogue - 8 floats of A (32 bytes)
276        # 96 FMA + 6 LDR A + 5 LDP B
277        # First block same as main loop.  Second block has no preloads.
2782:
        # Consumes the A (v0-v5) and B (v12-v17) values already loaded by the
        # prologue or by the last main-loop pass, plus one more 16-byte step
        # of A per row, without reading ahead for a further pass.
279        # First group of 4 A.  48 FMA.
280        FMLA v20.4s, v12.4s,  v0.s[0]
281        LDP  q18,  q19, [x5], 32      // Load last B
282        FMLA v22.4s, v12.4s,  v1.s[0]
283        FMLA v24.4s, v12.4s,  v2.s[0]
284        FMLA v26.4s, v12.4s,  v3.s[0]
285        FMLA v28.4s, v12.4s,  v4.s[0]
286        FMLA v30.4s, v12.4s,  v5.s[0]
287        FMLA v21.4s, v13.4s,  v0.s[0]
288        FMLA v23.4s, v13.4s,  v1.s[0]
289        FMLA v25.4s, v13.4s,  v2.s[0]
290        FMLA v27.4s, v13.4s,  v3.s[0]
291        FMLA v29.4s, v13.4s,  v4.s[0]
292
293        FMLA v31.4s, v13.4s,  v5.s[0]
294        FMLA v20.4s, v14.4s,  v0.s[1]
295        PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
296        FMLA v22.4s, v14.4s,  v1.s[1]
297        FMLA v24.4s, v14.4s,  v2.s[1]
298        FMLA v26.4s, v14.4s,  v3.s[1]
299        FMLA v28.4s, v14.4s,  v4.s[1]
300        PRFM PLDL1KEEP, [x5, 256]
301        FMLA v30.4s, v14.4s,  v5.s[1]
302        FMLA v21.4s, v15.4s,  v0.s[1]
303        FMLA v23.4s, v15.4s,  v1.s[1]
304        FMLA v25.4s, v15.4s,  v2.s[1]
305        LDR   q6,  [x3], 16            // Load next 6 A
306        FMLA v27.4s, v15.4s,  v3.s[1]
307        FMLA v29.4s, v15.4s,  v4.s[1]
308        FMLA v31.4s, v15.4s,  v5.s[1]
309        LDR   q7,  [x9], 16
310
311        FMLA v20.4s, v16.4s,  v0.s[2]
312        FMLA v22.4s, v16.4s,  v1.s[2]
313        FMLA v24.4s, v16.4s,  v2.s[2]
314        LDR   q8, [x10], 16
315        FMLA v26.4s, v16.4s,  v3.s[2]
316        FMLA v28.4s, v16.4s,  v4.s[2]
317        FMLA v30.4s, v16.4s,  v5.s[2]
318        LDR   q9, [x11], 16
319        FMLA v21.4s, v17.4s,  v0.s[2]
320        FMLA v23.4s, v17.4s,  v1.s[2]
321        FMLA v25.4s, v17.4s,  v2.s[2]
322        LDR   q10, [x12], 16
323        FMLA v27.4s, v17.4s,  v3.s[2]
324        FMLA v29.4s, v17.4s,  v4.s[2]
325        FMLA v31.4s, v17.4s,  v5.s[2]
326        LDR  q11,  [x4], 16
327
328        FMLA v20.4s, v18.4s,  v0.s[3]
329        FMLA v22.4s, v18.4s,  v1.s[3]
330        FMLA v24.4s, v18.4s,  v2.s[3]
331        LDP  q12,  q13, [x5], 32       // Load 4 B
332        FMLA v26.4s, v18.4s,  v3.s[3]
333        FMLA v28.4s, v18.4s,  v4.s[3]
334        FMLA v30.4s, v18.4s,  v5.s[3]
335        LDP  q14,  q15, [x5], 32
336        FMLA v21.4s, v19.4s,  v0.s[3]
337        FMLA v23.4s, v19.4s,  v1.s[3]
338        FMLA v25.4s, v19.4s,  v2.s[3]
339        LDP  q16,  q17, [x5], 32
340        FMLA v27.4s, v19.4s,  v3.s[3]
341        FMLA v29.4s, v19.4s,  v4.s[3]
342        FMLA v31.4s, v19.4s,  v5.s[3]
343        LDP  q18,  q19, [x5], 32
344
345        # Second group of 4 A.  48 FMA.
        # Pure arithmetic from here: no A or B loads are issued in this group.
346        FMLA v20.4s, v12.4s,  v6.s[0]
347        FMLA v22.4s, v12.4s,  v7.s[0]
348        FMLA v24.4s, v12.4s,  v8.s[0]
349        FMLA v26.4s, v12.4s,  v9.s[0]
350        FMLA v28.4s, v12.4s, v10.s[0]
351        FMLA v30.4s, v12.4s, v11.s[0]
352        FMLA v21.4s, v13.4s,  v6.s[0]
353        FMLA v23.4s, v13.4s,  v7.s[0]
354        FMLA v25.4s, v13.4s,  v8.s[0]
355        FMLA v27.4s, v13.4s,  v9.s[0]
356        FMLA v29.4s, v13.4s, v10.s[0]
357        FMLA v31.4s, v13.4s, v11.s[0]
358
359        FMLA v20.4s, v14.4s,  v6.s[1]
360        FMLA v22.4s, v14.4s,  v7.s[1]
361        FMLA v24.4s, v14.4s,  v8.s[1]
362        FMLA v26.4s, v14.4s,  v9.s[1]
363        FMLA v28.4s, v14.4s, v10.s[1]
364        FMLA v30.4s, v14.4s, v11.s[1]
365        FMLA v21.4s, v15.4s,  v6.s[1]
366        FMLA v23.4s, v15.4s,  v7.s[1]
367        FMLA v25.4s, v15.4s,  v8.s[1]
368        FMLA v27.4s, v15.4s,  v9.s[1]
369        FMLA v29.4s, v15.4s, v10.s[1]
370        FMLA v31.4s, v15.4s, v11.s[1]
371
372        FMLA v20.4s, v16.4s,  v6.s[2]
373        FMLA v22.4s, v16.4s,  v7.s[2]
374        FMLA v24.4s, v16.4s,  v8.s[2]
375        FMLA v26.4s, v16.4s,  v9.s[2]
376        FMLA v28.4s, v16.4s, v10.s[2]
377        FMLA v30.4s, v16.4s, v11.s[2]
378        FMLA v21.4s, v17.4s,  v6.s[2]
379        FMLA v23.4s, v17.4s,  v7.s[2]
380        FMLA v25.4s, v17.4s,  v8.s[2]
381        FMLA v27.4s, v17.4s,  v9.s[2]
382        FMLA v29.4s, v17.4s, v10.s[2]
383        FMLA v31.4s, v17.4s, v11.s[2]
384
385        FMLA v20.4s, v18.4s,  v6.s[3]
386        FMLA v22.4s, v18.4s,  v7.s[3]
387        FMLA v24.4s, v18.4s,  v8.s[3]
388        FMLA v26.4s, v18.4s,  v9.s[3]
389        FMLA v28.4s, v18.4s, v10.s[3]
390        FMLA v30.4s, v18.4s, v11.s[3]
391        FMLA v21.4s, v19.4s,  v6.s[3]
392        FMLA v23.4s, v19.4s,  v7.s[3]
393
394        # Load min/max values
        # The two FMLAs just above are the last reads of v6 and v7 as A
        # operands, so both can now be reused for the clamp bounds. LD2R reads
        # two consecutive floats at x8 and broadcasts the first across v6 and
        # the second across v7.
395        LD2R {v6.4s, v7.4s}, [x8]
396
397        FMLA v25.4s, v19.4s,  v8.s[3]
398        FMLA v27.4s, v19.4s,  v9.s[3]
399        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # x0 has only ever had multiples of 32 subtracted from kc, so its low
        # five bits still equal those of kc. The trailing FMLAs do not alter
        # the flags set by TST.
400        TST x0, 31
401        FMLA v29.4s, v19.4s, v10.s[3]
402        FMLA v31.4s, v19.4s, v11.s[3]
403        B.NE 4f
404
405        # Clamp
4063:
        # All twelve accumulators are bounded below with v6 (FMAX) and above
        # with v7 (FMIN). This label is also the join point for the k
        # remainder code, which branches back here with 3b.
407        FMAX v20.4s, v20.4s, v6.4s
408        # Load cn_stride
        # cn_stride was at [sp] on entry; the 64-byte save area pushed at
        # entry places it at sp + 64 now. x0 is free again because the k
        # counter is not read past this point.
409        LDR x0, [sp, 64]
410        FMAX v21.4s, v21.4s, v6.4s
411        FMAX v22.4s, v22.4s, v6.4s
412        FMAX v23.4s, v23.4s, v6.4s
413        FMAX v24.4s, v24.4s, v6.4s
414        FMAX v25.4s, v25.4s, v6.4s
415        FMAX v26.4s, v26.4s, v6.4s
416        FMAX v27.4s, v27.4s, v6.4s
417        FMAX v28.4s, v28.4s, v6.4s
418        FMAX v29.4s, v29.4s, v6.4s
419        FMAX v30.4s, v30.4s, v6.4s
420        FMAX v31.4s, v31.4s, v6.4s
        # nc -= 8. The flags from this SUBS feed both B.LO 7f and B.HI 0b
        # below; the FMIN, STP, ADD and SUB in between do not modify them.
421        SUBS x1, x1, 8
422        FMIN v20.4s, v20.4s, v7.4s
423        FMIN v21.4s, v21.4s, v7.4s
424        FMIN v22.4s, v22.4s, v7.4s
425        FMIN v23.4s, v23.4s, v7.4s
426        FMIN v24.4s, v24.4s, v7.4s
427        FMIN v25.4s, v25.4s, v7.4s
428        FMIN v26.4s, v26.4s, v7.4s
429        FMIN v27.4s, v27.4s, v7.4s
430        FMIN v28.4s, v28.4s, v7.4s
431        FMIN v29.4s, v29.4s, v7.4s
432        FMIN v30.4s, v30.4s, v7.4s
433        FMIN v31.4s, v31.4s, v7.4s
434
435        # Store full 6 x 8
        # Fewer than 8 columns were left: take the partial-width store path.
436        B.LO 7f
437
        # Per row: store 8 floats, advance the C pointer by cn_stride, and
        # rewind the A pointer by kc so the next column block rereads the same
        # A rows. NOTE(review): the rewind matches the bytes consumed only if
        # kc is a multiple of 4 bytes - confirm against the caller contract.
438        STP q20, q21,  [x6]
439        ADD  x6,  x6, x0
440        SUB  x3,  x3, x2 // a0 -= kc
441        STP q22, q23, [x16]
442        ADD x16, x16, x0
443        SUB  x9,  x9, x2 // a1 -= kc
444        STP q24, q25, [x17]
445        ADD x17, x17, x0
446        SUB x10, x10, x2 // a2 -= kc
447        STP q26, q27, [x14]
448        ADD x14, x14, x0
449        SUB x11, x11, x2 // a3 -= kc
450        STP q28, q29, [x13]
451        ADD x13, x13, x0
452        SUB x12, x12, x2 // a4 -= kc
453        STP q30, q31,  [x7]
454        ADD x7, x7, x0
455        SUB  x4,  x4, x2 // a5 -= kc
456
        # Columns remain (nc was greater than 8 before the SUBS): next block.
457        B.HI 0b
458
459        # Restore d8-d15 from stack
        # The final post-indexed LDP also releases the 64-byte save area.
460        LDP d14, d15, [sp, 48]
461        LDP d12, d13, [sp, 32]
462        LDP d10, d11, [sp, 16]
463        LDP  d8,  d9, [sp], 64
464        RET
465
4664:
        # k remainder handling. Reached from the column-loop head when kc is
        # below 32 bytes, or from the epilogue when kc has low bits set. In
        # both cases x0 differs from kc by a multiple of 32, so bits 4, 3 and
        # 2 of x0 select the 16, 8 and 4 byte tail steps below.
467        # Load min/max values
        # Needed on the direct path from the loop head, which has not loaded
        # the clamp bounds yet; on the path from the epilogue it reloads the
        # same values.
468        LD2R {v6.4s, v7.4s}, [x8]
469
470        # Is there a remainder?- 4 floats of A (16 bytes)
471        TBZ x0, 4, 5f
472
473        # Remainder- 4 floats of A (16 bytes)
474        # Load A
475        LDR   q0,  [x3], 16
476        LDR   q1,  [x9], 16
477        LDR   q2, [x10], 16
478        LDR   q3, [x11], 16
479        LDR   q4, [x12], 16
480        LDR   q5,  [x4], 16
481        # Load B
        # Four register pairs: 8 columns of B for each of the 4 k steps.
482        LDP  q12,  q13, [x5], 32
483        LDP  q14,  q15, [x5], 32
484        LDP  q16,  q17, [x5], 32
485        LDP  q18,  q19, [x5], 32
486
487        FMLA v20.4s, v12.4s,  v0.s[0]
488        FMLA v22.4s, v12.4s,  v1.s[0]
489        FMLA v24.4s, v12.4s,  v2.s[0]
490        FMLA v26.4s, v12.4s,  v3.s[0]
491        FMLA v28.4s, v12.4s,  v4.s[0]
492        FMLA v30.4s, v12.4s,  v5.s[0]
493        FMLA v21.4s, v13.4s,  v0.s[0]
494        FMLA v23.4s, v13.4s,  v1.s[0]
495        FMLA v25.4s, v13.4s,  v2.s[0]
496        FMLA v27.4s, v13.4s,  v3.s[0]
497        FMLA v29.4s, v13.4s,  v4.s[0]
498        FMLA v31.4s, v13.4s,  v5.s[0]
499
500        FMLA v20.4s, v14.4s,  v0.s[1]
501        FMLA v22.4s, v14.4s,  v1.s[1]
502        FMLA v24.4s, v14.4s,  v2.s[1]
503        FMLA v26.4s, v14.4s,  v3.s[1]
504        FMLA v28.4s, v14.4s,  v4.s[1]
505        FMLA v30.4s, v14.4s,  v5.s[1]
506        FMLA v21.4s, v15.4s,  v0.s[1]
507        FMLA v23.4s, v15.4s,  v1.s[1]
508        FMLA v25.4s, v15.4s,  v2.s[1]
509        FMLA v27.4s, v15.4s,  v3.s[1]
510        FMLA v29.4s, v15.4s,  v4.s[1]
511        FMLA v31.4s, v15.4s,  v5.s[1]
512
513        FMLA v20.4s, v16.4s,  v0.s[2]
514        FMLA v22.4s, v16.4s,  v1.s[2]
515        FMLA v24.4s, v16.4s,  v2.s[2]
516        FMLA v26.4s, v16.4s,  v3.s[2]
517        FMLA v28.4s, v16.4s,  v4.s[2]
518        FMLA v30.4s, v16.4s,  v5.s[2]
519        FMLA v21.4s, v17.4s,  v0.s[2]
520        FMLA v23.4s, v17.4s,  v1.s[2]
521        FMLA v25.4s, v17.4s,  v2.s[2]
522        FMLA v27.4s, v17.4s,  v3.s[2]
523        FMLA v29.4s, v17.4s,  v4.s[2]
524        FMLA v31.4s, v17.4s,  v5.s[2]
525
526        FMLA v20.4s, v18.4s,  v0.s[3]
527        FMLA v22.4s, v18.4s,  v1.s[3]
528        FMLA v24.4s, v18.4s,  v2.s[3]
529        FMLA v26.4s, v18.4s,  v3.s[3]
530        FMLA v28.4s, v18.4s,  v4.s[3]
531        FMLA v30.4s, v18.4s,  v5.s[3]
532        FMLA v21.4s, v19.4s,  v0.s[3]
533        FMLA v23.4s, v19.4s,  v1.s[3]
534        FMLA v25.4s, v19.4s,  v2.s[3]
535        FMLA v27.4s, v19.4s,  v3.s[3]
536        FMLA v29.4s, v19.4s,  v4.s[3]
537        FMLA v31.4s, v19.4s,  v5.s[3]
538
539        # Is there a remainder?- 2 floats of A (8 bytes)
5405:
        # Bit 3 of x0 clear means no 8-byte tail: skip to the 4-byte check.
541        TBZ x0, 3, 6f
542
543        # Remainder- 2 floats of A (8 bytes)
544        # Load A
        # 64-bit loads: only lanes 0 and 1 of v0-v5 are used below.
545        LDR   d0,  [x3], 8
546        LDR   d1,  [x9], 8
547        LDR   d2, [x10], 8
548        LDR   d3, [x11], 8
549        LDR   d4, [x12], 8
550        LDR   d5,  [x4], 8
551        # Load B
        # Two register pairs: 8 columns of B for each of the 2 k steps.
552        LDP  q12,  q13, [x5], 32
553        LDP  q14,  q15, [x5], 32
554
555        FMLA v20.4s, v12.4s,  v0.s[0]
556        FMLA v22.4s, v12.4s,  v1.s[0]
557        FMLA v24.4s, v12.4s,  v2.s[0]
558        FMLA v26.4s, v12.4s,  v3.s[0]
559        FMLA v28.4s, v12.4s,  v4.s[0]
560        FMLA v30.4s, v12.4s,  v5.s[0]
561        FMLA v21.4s, v13.4s,  v0.s[0]
562        FMLA v23.4s, v13.4s,  v1.s[0]
563        FMLA v25.4s, v13.4s,  v2.s[0]
564        FMLA v27.4s, v13.4s,  v3.s[0]
565        FMLA v29.4s, v13.4s,  v4.s[0]
566        FMLA v31.4s, v13.4s,  v5.s[0]
567
568        FMLA v20.4s, v14.4s,  v0.s[1]
569        FMLA v22.4s, v14.4s,  v1.s[1]
570        FMLA v24.4s, v14.4s,  v2.s[1]
571        FMLA v26.4s, v14.4s,  v3.s[1]
572        FMLA v28.4s, v14.4s,  v4.s[1]
573        FMLA v30.4s, v14.4s,  v5.s[1]
574        FMLA v21.4s, v15.4s,  v0.s[1]
575        FMLA v23.4s, v15.4s,  v1.s[1]
576        FMLA v25.4s, v15.4s,  v2.s[1]
577        FMLA v27.4s, v15.4s,  v3.s[1]
578        FMLA v29.4s, v15.4s,  v4.s[1]
579        FMLA v31.4s, v15.4s,  v5.s[1]
580
581        # Is there a remainder?- 1 float of A (4 bytes)
5826:
        # Bit 2 of x0 clear means k is fully consumed: go back to the clamp.
583        TBZ x0, 2, 3b
584
585        # Remainder- 1 float of A (4 bytes)
586        # Load A
        # 32-bit loads: only lane 0 of v0-v5 is used below.
587        LDR   s0,  [x3], 4
588        LDR   s1,  [x9], 4
589        LDR   s2, [x10], 4
590        LDR   s3, [x11], 4
591        LDR   s4, [x12], 4
592        LDR   s5,  [x4], 4
593        # Load B
        # One register pair: 8 columns of B for the single k step.
594        LDP  q12,  q13, [x5], 32
595
596        FMLA v20.4s, v12.4s,  v0.s[0]
597        FMLA v22.4s, v12.4s,  v1.s[0]
598        FMLA v24.4s, v12.4s,  v2.s[0]
599        FMLA v26.4s, v12.4s,  v3.s[0]
600        FMLA v28.4s, v12.4s,  v4.s[0]
601        FMLA v30.4s, v12.4s,  v5.s[0]
602        FMLA v21.4s, v13.4s,  v0.s[0]
603        FMLA v23.4s, v13.4s,  v1.s[0]
604        FMLA v25.4s, v13.4s,  v2.s[0]
605        FMLA v27.4s, v13.4s,  v3.s[0]
606        FMLA v29.4s, v13.4s,  v4.s[0]
607        FMLA v31.4s, v13.4s,  v5.s[0]
        # Remainder done: rejoin the common clamp and store code.
608        B 3b
609
610        # Store odd width
6117:
        # Reached when fewer than 8 columns were left. x1 holds nc - 8 at this
        # point, so its low three bits equal those of the remaining column
        # count; bits 2, 1 and 0 select stores of 4, 2 and 1 floats per row.
612        TBZ x1, 2, 8f
        # Store the first 4 columns of each row with post-increment, then move
        # columns 4-7 down into the even-numbered register for the next steps.
613        STR q20,  [x6], 16
614        MOV v20.16b, v21.16b
615        STR q22, [x16], 16
616        MOV v22.16b, v23.16b
617        STR q24, [x17], 16
618        MOV v24.16b, v25.16b
619        STR q26, [x14], 16
620        MOV v26.16b, v27.16b
621        STR q28, [x13], 16
622        MOV v28.16b, v29.16b
623        STR q30,  [x7], 16
624        MOV v30.16b, v31.16b
6258:
626        TBZ x1, 1, 9f
        # Store 2 columns from the low half, then bring the upper 64 bits down
        # into the low half so the last column sits in lane 0.
627        STR d20,  [x6], 8
628        DUP d20, v20.d[1]
629        STR d22, [x16], 8
630        DUP d22, v22.d[1]
631        STR d24, [x17], 8
632        DUP d24, v24.d[1]
633        STR d26, [x14], 8
634        DUP d26, v26.d[1]
635        STR d28, [x13], 8
636        DUP d28, v28.d[1]
637        STR d30,  [x7], 8
638        DUP d30, v30.d[1]
639
6409:
641        TBZ x1, 0, 10f
        # Final single column; no pointer update since nothing follows.
642        STR s20,  [x6]
643        STR s22, [x16]
644        STR s24, [x17]
645        STR s26, [x14]
646        STR s28, [x13]
647        STR s30,  [x7]
64810:
649        # Restore d8-d15 from stack
        # The final post-indexed LDP also releases the 64-byte save area.
650        LDP d14, d15, [sp, 48]
651        LDP d12, d13, [sp, 32]
652        LDP d10, d11, [sp, 16]
653        LDP  d8,  d9, [sp], 64
654        RET
655
656END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75
657
658#ifdef __ELF__
659.section ".note.GNU-stack","",%progbits
660#endif
661