// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
#include <xnnpack/assembly.h>
7
# Computes up to 6 rows by 8 columns of C per pass.  Accumulators start from
# the 8 bias floats at the head of w (or from acc in the INC variant), A * B
# products are accumulated with FMLA, the result is clamped with the two values
# loaded from params, then stored through the C row pointers.
#
# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers / Save d8-d15 on stack
        # Rows at or beyond mr reuse the pointers of the previous row.
        # Each CMP feeds both its LO selects and the LS selects that follow;
        # the ADD and STP instructions in between leave the flags untouched.
        STP  d8,  d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Outer loop: one pass per 8 columns of nc
        .p2align 3
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          PRFM PLDL1KEEP,  [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP,  [x4]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP,  [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP,  [x4]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        # load A0 to A4 but not A5
        LDP  q0,  q6,  [x3], 32
        LDP  q1,  q7,  [x9], 32
        LDP  q2,  q8, [x10], 32
        LDP  q3,  q9, [x11], 32
        LDP  q4,  q10, [x12], 32
        # load first set of B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        .p2align 3
1:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP  q5, q11, [x4], 32
        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        LDP  q16,  q17, [x5], 32
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        LDP  q12,  q13, [x5], 32
        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Second group of 4 A.  48 FMA.  Loads A0 - A4

        LDP  q16,  q17, [x5], 32
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Each A register pair is reloaded right after its last use in this
        # iteration, which is why this part is ordered row by row.
        LDP  q12,  q13, [x5], 32
        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v20.4s, v18.4s,  v6.s[3]
        LDP  q14,  q15, [x5], 32
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v21.4s, v19.4s,  v6.s[3]
        LDP  q0,  q6, [x3], 32
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v23.4s, v19.4s,  v7.s[3]
        LDP  q1,  q7, [x9], 32
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v25.4s, v19.4s,  v8.s[3]
        LDP  q2,  q8, [x10], 32
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v27.4s, v19.4s,  v9.s[3]
        LDP  q3,  q9, [x11], 32
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v29.4s, v19.4s, v10.s[3]
        LDP  q4,  q10, [x12], 32
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v30.4s, v18.4s, v11.s[3]
        SUBS x0, x0, 32                // k -= 32; flags consumed by B.HS below
        FMLA v31.4s, v17.4s, v11.s[2]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 1 LDP A + 6 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP  q5, q11, [x4], 32
        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        LDP  q16,  q17, [x5], 32
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        LDP  q12,  q13, [x5], 32
        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        LDP  q14,  q15, [x5], 32
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Second group of 4 A.  48 FMA. No A Loads, No last B load

        LDP  q16,  q17, [x5], 32
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Last part of epilogue has loads removed.

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]

        # Load min/max values
        # v6 and v7 are no longer read as A operands past this point.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31                     // flags consumed by B.NE below
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        .p2align 3

        # Clamp
3:
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR x0, [sp, 64]               // was [sp] on entry; 64 bytes were pushed since
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8                 // nc -= 8; flags used by B.LO 7f and B.HI 0b
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f

        $if INC:
          STP q30, q31,  [x7]
          ADD x7, x7, x0
          SUB  x3,  x3, x2 // a0 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x0
          SUB  x9,  x9, x2 // a1 -= kc
          STP q26, q27, [x14]
          ADD x14, x14, x0
          SUB x10, x10, x2 // a2 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x0
          SUB x11, x11, x2 // a3 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x0
          SUB x12, x12, x2 // a4 -= kc
          STP q20, q21,  [x6]
          ADD  x6,  x6, x0
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          STP q20, q21,  [x6]
          ADD  x6,  x6, x0
          SUB  x3,  x3, x2 // a0 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x0
          SUB  x9,  x9, x2 // a1 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x0
          SUB x10, x10, x2 // a2 -= kc
          STP q26, q27, [x14]
          ADD x14, x14, x0
          SUB x11, x11, x2 // a3 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x0
          SUB x12, x12, x2 // a4 -= kc
          STP q30, q31,  [x7]
          ADD x7, x7, x0
          SUB  x4,  x4, x2 // a5 -= kc

        NOP
        B.HI 0b                        // loop while columns remain (nc > 0)

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

        # Remainder handling.  Entered when kc < 32, or when k has leftover
        # bytes after the epilogue.  Bits 4, 3 and 2 of x0 select the
        # 4, 2 and 1 float steps below.
        .p2align 3
4:
        # Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0,  [x3], 16
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32
        LDP  q16,  q17, [x5], 32
        LDP  q18,  q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0,  [x3], 8
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        # Load B
        LDP  q12,  q13, [x5], 32
        LDP  q14,  q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0,  [x3], 4
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        # Load B
        LDP  q12,  q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        B 3b

        .p2align 3

        # Store odd width
        # Here x1 holds nc - 8.  Bits 2, 1 and 0 select stores of 4, 2 and 1
        # columns; after each partial store the remaining lanes are shifted
        # down into the low part of the even accumulators.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
          STR s28, [x13]
          STR s30,  [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73
737
// On ELF targets, emit an empty .note.GNU-stack section so that this object
// does not request an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
741