// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8
#
# Indirect GEMM micro-kernel: accumulates a 4 row x 8 column tile of f32
# outputs per nc step, clamps it and stores it to c.
#
# Argument usage, as implemented below:
#   mr        - row count. Rows at or beyond mr get a C pointer that aliases
#               the previous row, so all 4 rows are always computed.
#   nc        - columns still to be produced. A full tile stores 8 columns;
#               a final tile of fewer than 8 columns is stored in 4/2/1 pieces.
#   kc        - bytes of A consumed through each A pointer (32 bytes = 8 floats).
#   ks        - bytes of the A pointer array consumed per output tile
#               (32 bytes = 4 pointers per step).
#   a         - array of A row pointers, read 4 at a time and rewound by ks
#               after each full tile.
#   w         - packed weights, read sequentially: 8 bias floats, then B.
#   cm_stride - byte distance between consecutive C rows.
#   cn_stride - byte increment applied to every C pointer after a full store.
#   a_offset  - byte offset added to each A pointer that differs from zero.
#   zero      - A pointers equal to this value are used without a_offset.
#   params    - two consecutive floats: the first is the FMAX operand (lower
#               bound, v4), the second is the FMIN operand (upper bound, v5).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel saves and restores x20 and d8-d15 (v8-v15 hold B values).

# A pointers
# x20 a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0  v0  v4
# A1  v1  v5
# A2  v2  v6
# A3  v3  v7
# B   v8  v9 v10 v11
# B  v12 v13 v14 v15
# B  v20 v21 v22 v23
# B  v24 v25 v26 v27
# C  v16 v17
# C  v18 v19
# C  v28 v29
# C  v30 v31
# Clamp v4 v5
#
# v4/v5 are shared between the second block of A values and the clamp
# bounds. The main loop and the epilogue overwrite them, so the epilogue
# reloads the bounds from params before the clamp.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, a_offset
        # Stack arguments are read before sp is moved by the x20 save below.
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        # Needed up front for the kc < 32 bytes path, which skips the epilogue
        # reload and never overwrites v4/v5.
        LD2R {v4.4s, v5.4s}, [x8]

        # Save x20 on stack
        # Allocates 80 bytes: x20 at [sp], d8-d15 at [sp, 16] .. [sp, 79].
        STR x20, [sp, -80]!

        # Save d8-d15 on stack
        STP  d8,  d9, [sp, 16]
        STP d10, d11, [sp, 32]
        STP d12, d13, [sp, 48]
        STP d14, d15, [sp, 64]

        # Clamp C pointers
        # The flags from CMP x0, 2 feed both the LO and the LS select below.
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO     //   c3 = c2
        # From here on x0 no longer holds mr (reused as the k counter) and
        # x7 no longer holds cm_stride (it is c3).

0:
        # Load initial bias from w into accumulators
        # The same 8 bias floats seed all 4 rows.
        LDP q16, q17, [x5], 32
        MOV v18.16b, v16.16b
        MOV v19.16b, v17.16b
        MOV v28.16b, v16.16b
        MOV v29.16b, v17.16b
        MOV v30.16b, v16.16b
        MOV v31.16b, v17.16b

        MOV x9, x3  // p = ks

1:
        # Load next 4 A pointers
        LDP x20, x13, [x4], 16
        LDP x14, x15, [x4], 16

        # Apply a_offset to every pointer except those equal to zero.
        CMP x20, x12            // if a0 == zero
        ADD x20, x20, x11       // a0 += a_offset
        CSEL x20, x12, x20, EQ  //   a0 = zero, else a0 += a_offset
        CMP x13, x12            // if a1 == zero
        ADD x13, x13, x11       // a1 += a_offset
        CSEL x13, x12, x13, EQ  //   a1 = zero, else a1 += a_offset
        CMP x14, x12            // if a2 == zero
        ADD x14, x14, x11       // a2 += a_offset
        CSEL x14, x12, x14, EQ  //   a2 = zero, else a2 += a_offset
        CMP x15, x12            // if a3 == zero
        ADD x15, x15, x11       // a3 += a_offset
        CSEL x15, x12, x15, EQ  //   a3 = zero, else a3 += a_offset

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - read the first 4 floats (16 bytes) of each A row and
        # the matching 4 x 8 floats of B.
        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Are there at least 8 more floats (32 bytes) beyond the 8 handled by
        # the epilogue?  Yes - do the main loop.
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A
        # Each iteration consumes 8 floats per A row and 256 bytes of B.
        # Loads run one block of 4 ahead of the FMAs that use them.
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16        // overwrites the clamp value held in v4
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16        // overwrites the clamp value held in v5
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        PRFM PLDL1KEEP, [x5, 128]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        PRFM PLDL1KEEP, [x5, 192]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x20], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x13], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x14], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x15], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32          // k -= 32; flags consumed by B.HS below
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

        B.HS 2b

3:
        # Epilogue
        # Same as one main-loop iteration, but without loading a further
        # first block.
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        # Last uses of v4/v5 as A values come first, so the clamp reload
        # can be issued before the remaining v6/v7 FMAs.
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load min/max values
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

4:
        # Remainder- 4 floats of A
        # On both paths into this label the low 5 bits of x0 equal kc mod 32,
        # so bits 4, 3 and 2 select leftovers of 16, 8 and 4 bytes.
        TBZ x0, 4, 5f

        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

5:
        # Remainder- 2 floats of A
        TBZ x0, 3, 6f

        LDR d0, [x20], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x13], 8
        LDR d2, [x14], 8
        LDR d3, [x15], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

6:
        # Remainder- 1 float of A
        TBZ x0, 2, 7f

        LDR s0, [x20], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x13], 4
        LDR s2, [x14], 4
        LDR s3, [x15], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

7:
        # ks loop
        SUBS x9, x9, 32  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        # v4 is the lower bound (FMAX), v5 the upper bound (FMIN).
        FMAX v16.4s, v16.4s, v4.4s
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        # The flags set here are also consumed by the B.HI at the end of the
        # store sequence; nothing in between modifies them.
        SUBS x1, x1, 8
        B.LO 8f

        STP q30, q31,  [x7]
        ADD  x7,  x7, x10
        STP q28, q29, [x17]
        ADD x17, x17, x10
        STP q18, q19, [x16]
        ADD x16, x16, x10
        STP q16, q17,  [x6]
        ADD  x6,  x6, x10

        # Rewind the A pointer array so the next tile re-reads the same pointers.
        SUB x4, x4, x3  // a -= ks

        # nc loop
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP  d8,  d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

        # Store odd width
        # x1 is nc - 8 here; its low 3 bits still equal the number of
        # columns left, so bits 2, 1 and 0 select stores of 4, 2 and 1 floats.
8:
        TBZ x1, 2, 9f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x17], 16
        MOV v28.16b, v29.16b
        STR q18, [x16], 16
        MOV v18.16b, v19.16b
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

9:
        TBZ x1, 1, 10f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x17], 8
        DUP d28, v28.d[1]
        STR d18, [x16], 8
        DUP d18, v18.d[1]
        STR d16, [x6], 8
        DUP d16, v16.d[1]

10:
        TBZ x1, 0, 11f
        STR s30,  [x7]
        STR s28, [x17]
        STR s18, [x16]
        STR s16,  [x6]
11:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP  d8,  d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75

// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable on account of this object.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif