// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8
# )

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x20 a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0  v0  v4
# A1  v1  v5
# A2  v2  v6
# A3  v3  v7
# B   v8  v9 v10 v11
# B  v12 v13 v14 v15
# B  v20 v21 v22 v23
# B  v24 v25 v26 v27
# C  v16 v17
# C  v18 v19
# C  v28 v29
# C  v30 v31
# Clamp v4 v5

# Indirect GEMM micro-kernel. For each group of 8 output columns it seeds
# 4 rows x 8 columns of accumulators from w, accumulates A * B with FMLA over
# every group of 4 A pointers, clamps with the two floats read from params and
# stores the tile. A trailing group narrower than 8 columns is stored piecewise.
#
# Scalar register roles inside the body:
#   x0   k, byte counter for the current A rows (holds mr only on entry)
#   x9   p, bytes of A pointers still to consume for this tile (starts at ks)
#   x20, x13, x14, x15   a0, a1, a2, a3
#   x6,  x16, x17, x7    c0, c1, c2, c3
#   v4, v5               clamp bounds, except while they carry A0/A1 data
BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a57

        # Load cn_stride, a_offset (stack arguments are read before sp is moved)
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values: v4 is the bound used with FMAX, v5 the bound used with FMIN
        LD2R {v4.4s, v5.4s}, [x8]

        # Save x20 on stack; the 80-byte frame holds x20 at [sp] and d8-d15 from [sp + 16]
        STR x20, [sp, -80]!

        # Save d8-d15 on stack
        STP  d8,  d9, [sp, 16]
        STP d10, d11, [sp, 32]
        STP d12, d13, [sp, 48]
        STP d14, d15, [sp, 64]

        # Clamp C pointers so that rows at or beyond mr alias the previous row
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO     //   c3 = c2

0:
        # Load initial bias from w into accumulators; all 4 rows start from the same 8 values
        LDP q16, q17, [x5], 32
        MOV v18.16b, v16.16b
        MOV v19.16b, v17.16b
        MOV v28.16b, v16.16b
        MOV v29.16b, v17.16b
        MOV v30.16b, v16.16b
        MOV v31.16b, v17.16b

        MOV x9, x3  // p = ks

1:
        # Load next 4 A pointers
        LDP x20, x13, [x4], 16
        LDP x14, x15, [x4], 16

        # A pointer equal to zero is used as is; any other pointer gets a_offset added
        CMP x20, x12            // if a0 == zero
        ADD x20, x20, x11       // a0 += a_offset
        CSEL x20, x12, x20, EQ  //   a0 = zero, else a0 += a_offset
        CMP x13, x12            // if a1 == zero
        ADD x13, x13, x11       // a1 += a_offset
        CSEL x13, x12, x13, EQ  //   a1 = zero, else a1 += a_offset
        CMP x14, x12            // if a2 == zero
        ADD x14, x14, x11       // a2 += a_offset
        CSEL x14, x12, x14, EQ  //   a2 = zero, else a2 += a_offset
        CMP x15, x12            // if a3 == zero
        ADD x15, x15, x11       // a3 += a_offset
        CSEL x15, x12, x15, EQ  //   a3 = zero, else a3 += a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue
        # Read first block of 4 A and B.
        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Are there at least 32 more bytes?  Then run the main loop, otherwise go to the epilogue
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        # The loads into v4/v5 overwrite the clamp bounds with A0/A1 data.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x20], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x13], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x14], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x15], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32          // k -= 32; FMLA does not touch the flags read by B.HS below
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

        B.HS 2b

3:
        # Epilogue
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        # Last FMAs that read v4/v5 as A0/A1 data
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load min/max values again now that v4/v5 are no longer needed for A
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

4:
        # Remainder- 4 floats of A
        # Only multiples of 32 were subtracted from kc, so bits 4, 3 and 2 of k match those of kc
        TBZ x0, 4, 5f

        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

5:
        # Remainder- 2 floats of A
        TBZ x0, 3, 6f

        LDR d0, [x20], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x13], 8
        LDR d2, [x14], 8
        LDR d3, [x15], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

6:
        # Remainder- 1 float of A
        TBZ x0, 2, 7f

        LDR s0, [x20], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x13], 4
        LDR s2, [x14], 4
        LDR s3, [x15], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

7:
        # ks loop
        SUBS x9, x9, 32  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v16.4s, v16.4s, v4.4s
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        SUBS x1, x1, 8           // nc -= 8; fewer than 8 columns left takes the partial path
        B.LO 8f

        # Rows go out from c3 down to c0, so where clamped pointers alias the lowest row is written last
        STP q30, q31,  [x7]
        ADD  x7,  x7, x10
        STP q28, q29, [x17]
        ADD x17, x17, x10
        STP q18, q19, [x16]
        ADD x16, x16, x10
        STP q16, q17,  [x6]
        ADD  x6,  x6, x10

        SUB x4, x4, x3  // a -= ks

        # nc loop (flags still come from SUBS x1 above; STP, ADD and SUB leave them alone)
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP  d8,  d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

        # Store odd width
        # x1 holds nc - 8 here, so its low three bits match those of the remaining column count
8:
        TBZ x1, 2, 9f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b     // shift the upper 4 columns down for the next piece
        STR q28, [x17], 16
        MOV v28.16b, v29.16b
        STR q18, [x16], 16
        MOV v18.16b, v19.16b
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

9:
        TBZ x1, 1, 10f
        STR d30, [x7], 8
        DUP d30, v30.d[1]        // move the upper 2 columns into the low half
        STR d28, [x17], 8
        DUP d28, v28.d[1]
        STR d18, [x16], 8
        DUP d18, v18.d[1]
        STR d16, [x6], 8
        DUP d16, v16.d[1]

10:
        TBZ x1, 0, 11f
        STR s30,  [x7]
        STR s28, [x17]
        STR s18, [x16]
        STR s16,  [x6]
11:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP  d8,  d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a57

// On ELF targets emit an empty .note.GNU-stack section, which tells the
// linker that this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
