// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Produces one row of C, 12 columns per pass.  The accumulators start from
# the bias packed at the front of w (or from acc in the INC variant), every
# float of A is multiplied against 12 packed floats of w, and the sums are
# clamped to [min, max] before being stored.
# nc is counted in floats (12 per pass) and kc in bytes (32 bytes = 8 floats).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers are used below, so nothing is saved or restored.

# A pointer
# x3  a0

# B (packed weights) pointer
# x5  w

# C pointer
# x6  c0

# Counters
# x0  k   kc minus the bytes claimed by the pipelined blocks.  Goes negative,
#         but bits 4, 3 and 2 still equal those of kc and select the remainder.
# x1  nc  columns still to be produced

# Vector registers
# v0  v1       A, 4 floats each
# v20 - v31    B, 4 k-steps of 12 columns
# v16 v17 v18  accumulators for the 12 columns
# v5  v6  v7   second set of accumulators, added to v16-v18 before clamping

# Clamp v2 v3
# v2 holds the lower bound (used by FMAX), v3 the upper bound (used by FMIN).

# A53 based on A57/A75 but with LD64

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load min/max values
        # LD2R broadcasts the first float at x8 to v2 and the second to v3.
        LD2R {v2.4s, v3.4s}, [x8]

        # Outer loop - one pass per group of up to 12 output columns.
0:
        $if INC:
          # Load initial accumulators
          LD1 {v16.16b, v17.16b, v18.16b}, [x15], 48
        $else:
          # Load initial bias from w into accumulators
          LD1 {v16.16b, v17.16b, v18.16b}, [x5], 48

        MOVI v5.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v6.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        MOVI v7.4s, 0
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32

        # Fewer than 8 floats of A: only the remainder code runs.
        B.LO 3f

        # Prologue
        # Read first block of 1 A and B:
        # 12 vectors of B (4 k-steps x 12 columns) and 4 floats (16 bytes) of A.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDP q28, q29, [x5], 32
        LDP q30, q31, [x5], 32
        LDR q0, [x3], 16

        # Is there at least 32.  yes do main loop
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Software pipelined: each block multiplies the data loaded by the
        # previous block while loading the data for the next one.  Every LDR
        # that refills a B register is placed after the FMLA that last read it.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR  q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[0]
        LDR q21, [x5], 16
        FMLA  v5.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA  v6.4s, v24.4s, v0.s[1]
        LDR q23, [x5], 16
        FMLA  v7.4s, v25.4s, v0.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v0.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v0.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v0.s[2]
        LDR q27, [x5], 16
        FMLA  v5.4s, v29.4s, v0.s[3]
        LDR q28, [x5], 16
        FMLA  v6.4s, v30.4s, v0.s[3]
        LDR q29, [x5], 16
        FMLA  v7.4s, v31.4s, v0.s[3]
        LDR q30, [x5], 16
        LDR q31, [x5], 16

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR  q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[0]
        LDR q21, [x5], 16
        FMLA  v5.4s, v23.4s, v1.s[1]
        LDR q22, [x5], 16
        FMLA  v6.4s, v24.4s, v1.s[1]
        LDR q23, [x5], 16
        FMLA  v7.4s, v25.4s, v1.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v1.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v1.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v1.s[2]
        LDR q27, [x5], 16
        FMLA  v5.4s, v29.4s, v1.s[3]
        LDR q28, [x5], 16
        FMLA  v6.4s, v30.4s, v1.s[3]
        LDR q29, [x5], 16
        FMLA  v7.4s, v31.4s, v1.s[3]
        LDR q30, [x5], 16
        SUBS x0, x0, 32
        LDR q31, [x5], 16
        B.HS 1b

2:
        # Epilogue
        # Drains the pipeline: same two blocks as the main loop, but nothing
        # is loaded for a following iteration.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR  q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[0]
        LDR q21, [x5], 16
        FMLA  v5.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA  v6.4s, v24.4s, v0.s[1]
        LDR q23, [x5], 16
        FMLA  v7.4s, v25.4s, v0.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v0.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v0.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v0.s[2]
        LDR q27, [x5], 16
        FMLA  v5.4s, v29.4s, v0.s[3]
        LDR q28, [x5], 16
        FMLA  v6.4s, v30.4s, v0.s[3]
        LDR q29, [x5], 16
        FMLA  v7.4s, v31.4s, v0.s[3]
        LDR q30, [x5], 16

        # Second block of 4.  FMA for second 4.  The only load is q31, the
        # last B vector of this block, issued after the first FMLA.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q31, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[0]
        FMLA  v5.4s, v23.4s, v1.s[1]
        FMLA  v6.4s, v24.4s, v1.s[1]
        FMLA  v7.4s, v25.4s, v1.s[1]
        FMLA v16.4s, v26.4s, v1.s[2]
        FMLA v17.4s, v27.4s, v1.s[2]
        FMLA v18.4s, v28.4s, v1.s[2]
        FMLA  v5.4s, v29.4s, v1.s[3]
        FMLA  v6.4s, v30.4s, v1.s[3]
        FMLA  v7.4s, v31.4s, v1.s[3]

3:
        # Only multiples of 32 were subtracted from kc, so bits 4, 3 and 2
        # of x0 are the same as those of kc.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Fold the second set of accumulators into the first.
        FADD v16.4s, v16.4s, v5.4s
        FADD v17.4s, v17.4s, v6.4s
        FADD v18.4s, v18.4s, v7.4s
        # nc -= 12.  The flags set here feed both B.LO and B.HI below;
        # no instruction in between modifies them.
        SUBS x1, x1, 12

        # Clamp
        FMAX v16.4s, v16.4s, v2.4s
        FMAX v17.4s, v17.4s, v2.4s
        FMAX v18.4s, v18.4s, v2.4s
        FMIN v16.4s, v16.4s, v3.4s
        FMIN v17.4s, v17.4s, v3.4s
        FMIN v18.4s, v18.4s, v3.4s

        # Store full 1 x 12
        # Fewer than 12 columns were left: take the partial store path.
        B.LO 9f

        ST1 {v16.16b, v17.16b, v18.16b}, [x6], x14
        SUB  x3,  x3, x2 // a0 -= kc

        # More columns remain: next pass over the same row of A.
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        # The remainder code accumulates into v16-v18 only.
        LDR q0, [x3], 16
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[1]
        FMLA v17.4s, v21.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v0.s[1]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[2]
        FMLA v17.4s, v21.4s, v0.s[2]
        FMLA v18.4s, v22.4s, v0.s[2]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[3]
        FMLA v17.4s, v21.4s, v0.s[3]
        FMLA v18.4s, v22.4s, v0.s[3]

        # Skip the 2 float step when bit 3 of k is clear.
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[1]
        FMLA v17.4s, v21.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v0.s[1]
7:
        # No single float left: go clamp and store.
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDR s0, [x3], 4
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]
        B 4b

        # Store odd channels
        # x1 is restored to the number of columns left, which is below 12,
        # and is written out as 8 + 4 + 2 + 1 according to its bits.  After
        # each partial store the next unwritten floats are moved down to v16.
9:
        ADD x1, x1, 12
        TBZ x1, 3, 10f
        STP q16, q17, [x6], 32
        MOV v16.16b, v18.16b

10:
        # Bits 3 and 2 cannot both be set (that would be 12 or more), so
        # when this store runs v16 and v17 still hold columns 0-7.
        TBZ x1, 2, 11f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

11:
        TBZ x1, 1, 12f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

12:
        TBZ x1, 0, 13f
        STR s16, [x6]
13:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

// On ELF targets emit an empty .note.GNU-stack section, the marker GNU
// toolchains read to learn that this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
