// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8
#
# Stack offsets above are relative to sp at function entry, i.e. before the
# kernel pushes its own 96-byte save area.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x10 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

        # Stack frame: 96 bytes are pushed to hold the callee-saved registers
        # this kernel uses (d8-d15 at [sp, 0..63], x20-x23 at [sp, 64..95]).
        # After the push the stack arguments are found at:
        #   cn_stride [sp, 96], a_offset [sp, 104], zero [sp, 112],
        #   params [sp, 120].
        #
        # The C row pointers are computed in between the saves.  A row whose
        # index is >= mr is given the same address as the previous row, so all
        # six pointers are always valid to store through.

        # Clamp C pointers / Save d8-d15 on stack
        STP  d8,  d9, [sp, -96]!
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x10, x17, x7         // c3 = c2 + cm_stride
        CSEL x10, x17, x10, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x13, x10, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x13, x10, x13, LS   //   c4 = c3

        # Save x20,x21,x22,x23 on stack
        STP x20, x21, [sp, 64]
        STP x22, x23, [sp, 80]

        CMP x0, 6                // if mr < 6
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load a_offset
        LDR x11, [sp, 104]

        # Load zero, params pointer
        LDP x12, x8, [sp, 112]

        # nc loop: each pass produces one tile of 6 rows x 8 columns.
0:
        # Load initial bias from w into accumulators
        # (8 floats are read from w and replicated to all six rows)
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        MOV x9, x3  // p = ks

        # ks loop: each pass consumes 6 indirection pointers (48 bytes of a)
        # and accumulates kc bytes of K from each of them.
1:
        # Load next 6 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        # A pointer equal to `zero` is used unchanged; every other pointer has
        # a_offset added to it.
        CMP x14, x12            // if a0 == zero
        ADD x14, x14, x11       // a0 += a_offset
        CSEL x14, x12, x14, EQ  //   a0 = zero, else a0 + a_offset
        CMP x15, x12            // if a1 == zero
        ADD x15, x15, x11       // a1 += a_offset
        CSEL x15, x12, x15, EQ  //   a1 = zero, else a1 + a_offset
        CMP x20, x12            // if a2 == zero
        ADD x20, x20, x11       // a2 += a_offset
        CSEL x20, x12, x20, EQ  //   a2 = zero, else a2 + a_offset
        CMP x21, x12            // if a3 == zero
        ADD x21, x21, x11       // a3 += a_offset
        CSEL x21, x12, x21, EQ  //   a3 = zero, else a3 + a_offset
        CMP x22, x12            // if a4 == zero
        ADD x22, x22, x11       // a4 += a_offset
        CSEL x22, x12, x22, EQ  //   a4 = zero, else a4 + a_offset
        CMP x23, x12            // if a5 == zero
        ADD x23, x23, x11       // a5 += a_offset
        CSEL x23, x12, x23, EQ  //   a5 = zero, else a5 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # If not, everything is handled by the remainder code at 5.
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 5f

        # Prologue - loads for main loop of 96 FMA
        LDR   q0, [x14], 16
        LDR   q1, [x15], 16
        LDR   q2, [x20], 16
        LDR   q3, [x21], 16
        LDR   q4, [x22], 16
        LDR   q5, [x23], 16
        LDP  q12, q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP  q14, q15, [x5], 32
        LDP  q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        # If not, go straight to the epilogue, which drains the prologue loads.
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
        # Per iteration: 96 FMA + 12 LDR A (6 per group) + 8 LDP B
        # Loads for the next step are interleaved with the FMAs of the current
        # one; every register is reloaded only after its last use.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18, q19, [x5], 32        // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6, [x14], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7, [x15], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x20], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x21], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x22], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11, [x23], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12, q13, [x5], 32        // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14, q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16, q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        LDR   q0, [x14], 16            // Load next 6 A
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR   q1, [x15], 16
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        LDR   q2, [x20], 16
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR   q3, [x21], 16

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        LDR   q4, [x22], 16
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR   q5, [x23], 16
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        LDP  q12, q13, [x5], 32        // Load next 3 B (not last)
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP  q14, q15, [x5], 32

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP  q16,  q17, [x5], 32

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        SUBS x0, x0, 32                // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]
        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 2b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP  q18, q19, [x5], 32        // Load last B
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR   q6, [x14], 16            // Load next 6 A
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR   q7, [x15], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR   q8, [x20], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR   q9, [x21], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR   q10, [x22], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR  q11, [x23], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP  q12, q13, [x5], 32        // Load 4 B
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP  q14, q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP  q16, q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP  q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]

        # Load min/max values
        # v6 and v7 were last read as A registers by the two FMLAs just above,
        # so they can now be overwritten with the clamp bounds.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # Only multiples of 32 were subtracted from x0, so its low 5 bits are
        # still those of kc.
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 48  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        # v6 is applied with FMAX (lower bound), v7 with FMIN (upper bound).
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR x0, [sp, 96]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        # The flags set here are consumed twice: by B.LO 8f and by B.HI 0b.
        # No instruction in between modifies the flags.
        SUBS x1, x1, 8
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # (fewer than 8 columns left: take the partial-width path at 8)
        B.LO 8f

        # Rows are written from c5 down to c0.  When rows alias because of the
        # pointer clamping at entry, the lowest row sharing an address is the
        # one written last.
        STP q30, q31,  [x7]
        ADD x7, x7, x0
        STP q28, q29, [x13]
        ADD x13, x13, x0
        STP q26, q27, [x10]
        ADD x10, x10, x0
        STP q24, q25, [x17]
        ADD x17, x17, x0
        STP q22, q23, [x16]
        ADD x16, x16, x0
        STP q20, q21,  [x6]
        ADD  x6,  x6, x0

        SUB x4, x4, x3  // a -= ks

        # nc loop
        B.HI 0b

        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 96
        RET

        # Remainder handling.  Reached directly when kc < 32 bytes, or from the
        # epilogue when kc is not a multiple of 32 bytes.  Bits 4, 3 and 2 of
        # x0 select the 4-, 2- and 1-float steps.
5:
        # Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR   q0, [x14], 16
        LDR   q1, [x15], 16
        LDR   q2, [x20], 16
        LDR   q3, [x21], 16
        LDR   q4, [x22], 16
        LDR   q5, [x23], 16
        # Load B
        LDP  q12, q13, [x5], 32
        LDP  q14, q15, [x5], 32
        LDP  q16, q17, [x5], 32
        LDP  q18, q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR   d0, [x14], 8
        LDR   d1, [x15], 8
        LDR   d2, [x20], 8
        LDR   d3, [x21], 8
        LDR   d4, [x22], 8
        LDR   d5, [x23], 8
        # Load B
        LDP  q12, q13, [x5], 32
        LDP  q14, q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR   s0, [x14], 4
        LDR   s1, [x15], 4
        LDR   s2, [x20], 4
        LDR   s3, [x21], 4
        LDR   s4, [x22], 4
        LDR   s5, [x23], 4
        # Load B
        LDP  q12, q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        B 4b

        # Store odd width
        # x1 holds (remaining nc - 8); its low 3 bits equal those of the
        # remaining nc.  Bits 2, 1 and 0 select stores of 4, 2 and 1 columns.
        # After each partial store the not-yet-stored lanes are moved down to
        # the bottom of the even-numbered accumulator of that row.
8:
        TBZ x1, 2, 9f
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d30,  [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x10], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s30,  [x7]
        STR s28, [x13]
        STR s26, [x10]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
11:
        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 96
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

// On ELF targets, emit an empty .note.GNU-stack section.  By GNU toolchain
// convention this marks the object as not requiring an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
