// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Clamp v2 v3

# A53 based on A57/A75 but with LD64

# 1x12 f32 GEMMINC micro-kernel with min/max clamping.
#
# For each group of 12 output columns:
#   - v16-v18 start from the 12 caller-supplied accumulators at x15 (acc),
#   - v5-v7 start at zero and take the odd k steps of the unrolled loop,
#   - both sets are summed at label 4, clamped with v2/v3 and stored to x6.
#
# Registers touched: x0-x3, x5, x6, x8, x14, x15, v0-v3, v5-v7, v16-v18,
# v20-v31.  sp is only read; no stack frame is created.
#
# Local labels:
#   0  start of one group of 12 columns      5  remainder of 4 floats of A
#   1  main loop (8 floats of A per pass)    6  remainder of 2 floats of A
#   2  epilogue of the software pipeline     7  test for the last single float
#   3  remainder dispatch                    8  remainder of 1 float of A
#   4  merge, clamp and store                9-13  partial store (nc below 12)
BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride (x14) and the acc pointer (x15) from the stack arguments
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Load min/max values: first float replicated into v2, second into v3
        LD2R {v2.4s, v3.4s}, [x8]
0:
        # Load initial accumulators (12 floats, 48 bytes) and advance acc
        LD1 {v16.16b, v17.16b, v18.16b}, [x15], 48

        MOVI v5.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v6.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        MOVI v7.4s, 0
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32

        B.LO 3f

        # Prologue
        # Read first block of 1 A and B: 4 floats of A (16 bytes) in v0 and
        # the matching 4 x 12 floats of B (192 bytes) in v20-v31.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDP q28, q29, [x5], 32
        LDP q30, q31, [x5], 32
        LDR q0, [x3], 16

        # Is there at least 32.  yes do main loop
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Each B register is reloaded only after the FMLA that consumes its
        # previous contents, so the instruction order below must be kept.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR  q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[0]
        LDR q21, [x5], 16
        FMLA  v5.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA  v6.4s, v24.4s, v0.s[1]
        LDR q23, [x5], 16
        FMLA  v7.4s, v25.4s, v0.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v0.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v0.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v0.s[2]
        LDR q27, [x5], 16
        FMLA  v5.4s, v29.4s, v0.s[3]
        LDR q28, [x5], 16
        FMLA  v6.4s, v30.4s, v0.s[3]
        LDR q29, [x5], 16
        FMLA  v7.4s, v31.4s, v0.s[3]
        LDR q30, [x5], 16
        LDR q31, [x5], 16

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR  q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[0]
        LDR q21, [x5], 16
        FMLA  v5.4s, v23.4s, v1.s[1]
        LDR q22, [x5], 16
        FMLA  v6.4s, v24.4s, v1.s[1]
        LDR q23, [x5], 16
        FMLA  v7.4s, v25.4s, v1.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v1.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v1.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v1.s[2]
        LDR q27, [x5], 16
        FMLA  v5.4s, v29.4s, v1.s[3]
        LDR q28, [x5], 16
        FMLA  v6.4s, v30.4s, v1.s[3]
        LDR q29, [x5], 16
        FMLA  v7.4s, v31.4s, v1.s[3]
        LDR q30, [x5], 16
        SUBS x0, x0, 32
        LDR q31, [x5], 16
        B.HS 1b

2:
        # Epilogue
        # Consumes the block already held in v0/v20-v31 plus one more block
        # of 4 floats of A, without preloading anything past it.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR  q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[0]
        LDR q21, [x5], 16
        FMLA  v5.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA  v6.4s, v24.4s, v0.s[1]
        LDR q23, [x5], 16
        FMLA  v7.4s, v25.4s, v0.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v0.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v0.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v0.s[2]
        LDR q27, [x5], 16
        FMLA  v5.4s, v29.4s, v0.s[3]
        LDR q28, [x5], 16
        FMLA  v6.4s, v30.4s, v0.s[3]
        LDR q29, [x5], 16
        FMLA  v7.4s, v31.4s, v0.s[3]
        LDR q30, [x5], 16

        # Second block of 4.  FMA for second 4.  The only load left is the
        # deferred q31 of this block; nothing is read for a following block.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q31, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[0]
        FMLA  v5.4s, v23.4s, v1.s[1]
        FMLA  v6.4s, v24.4s, v1.s[1]
        FMLA  v7.4s, v25.4s, v1.s[1]
        FMLA v16.4s, v26.4s, v1.s[2]
        FMLA v17.4s, v27.4s, v1.s[2]
        FMLA v18.4s, v28.4s, v1.s[2]
        FMLA  v5.4s, v29.4s, v1.s[3]
        FMLA  v6.4s, v30.4s, v1.s[3]
        FMLA  v7.4s, v31.4s, v1.s[3]

3:
        # x0 differs from kc only by multiples of 32, so bits 4, 3 and 2 of x0
        # still give the 16, 8 and 4 byte parts of the remainder.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 floats of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Merge the second accumulator set into the first
        FADD v16.4s, v16.4s, v5.4s
        FADD v17.4s, v17.4s, v6.4s
        FADD v18.4s, v18.4s, v7.4s
        SUBS x1, x1, 12  // nc -= 12; flags are consumed by B.LO and B.HI below

        # Clamp: v2 is the lower bound (FMAX), v3 is the upper bound (FMIN)
        FMAX v16.4s, v16.4s, v2.4s
        FMAX v17.4s, v17.4s, v2.4s
        FMAX v18.4s, v18.4s, v2.4s
        FMIN v16.4s, v16.4s, v3.4s
        FMIN v17.4s, v17.4s, v3.4s
        FMIN v18.4s, v18.4s, v3.4s

        # Store full 1 x 12, unless fewer than 12 columns were left
        B.LO 9f

        ST1 {v16.16b, v17.16b, v18.16b}, [x6], x14  // c0 += cn_stride
        SUB  x3,  x3, x2 // a0 -= kc

        # More columns remain: start the next group of 12
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        # Remainder blocks accumulate into v16-v18 only.
        LDR q0, [x3], 16
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[1]
        FMLA v17.4s, v21.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v0.s[1]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[2]
        FMLA v17.4s, v21.4s, v0.s[2]
        FMLA v18.4s, v22.4s, v0.s[2]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[3]
        FMLA v17.4s, v21.4s, v0.s[3]
        FMLA v18.4s, v22.4s, v0.s[3]

        # Skip the 2-float block when bit 3 of x0 is clear
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[1]
        FMLA v17.4s, v21.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v0.s[1]
7:
        # No single float left: go merge, clamp and store
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDR s0, [x3], 4
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]
        B 4b

        # Store odd channels
        # Bits 3..0 of the restored count select stores of 8, 4, 2 and 1
        # floats; after each store the next unstored data is moved into v16.
9:
        ADD x1, x1, 12  // undo the SUBS at label 4: x1 = columns left (below 12)
        TBZ x1, 3, 10f
        STP q16, q17, [x6], 32
        MOV v16.16b, v18.16b

10:
        TBZ x1, 2, 11f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

11:
        TBZ x1, 1, 12f
        STR d16, [x6], 8
        DUP d16, v16.d[1]  // bring the upper two floats down for the last store

12:
        TBZ x1, 0, 13f
        STR s16, [x6]
13:
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

// Emit an empty .note.GNU-stack section on ELF targets; by GNU toolchain
// convention this records that the object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
