// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const void*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
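
# Usage sketch (illustrative, not part of the original source): one call
# produces an mr x nc output tile (mr <= 6; nc is handled 16 half-float
# columns at a time). kc and the strides are byte counts; nc counts
# half-float elements. For the non-INC variant:
#
#   xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32(
#       mr, nc, kc, a, a_stride, w, c, cm_stride, cn_stride, &params);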

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Scale v6
# Clamp (v4), (v5) - reused from A4, A5 after the main loop
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15
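
# Each row of the 6x16 output tile is held in a pair of q registers
# (16 half floats): row 0 in v20/v21 through row 5 in v30/v31.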


BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4
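
        # Note: each CMP above serves two consecutive rows: the LO condition
        # tests mr < n and the LS condition reuses the same flags for
        # mr <= n, so rows beyond mr alias the previous A and C pointers.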

        # Load params scale value
        LD1R {v6.8h}, [x8]
        ADD x8, x8, 2
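
        # params holds three consecutive fp16 values: scale (just loaded),
        # then min and max, which LD2R loads into v4/v5 in the clamp section.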

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          MOV v23.16b, v21.16b
          MOV v24.16b, v20.16b
          MOV v25.16b, v21.16b
          MOV v26.16b, v20.16b
          MOV v27.16b, v21.16b
          MOV v28.16b, v20.16b
          MOV v29.16b, v21.16b
          MOV v30.16b, v20.16b
          MOV v31.16b, v21.16b
        # Are there at least 2 halffloats (4 bytes)?
        SUBS x0, x2, 4  // k = kc - 4
        B.LO 3f

        # Main loop - 2 halffloats of A (4 bytes)
        # 24 FMA + 6 ld32 A + 4 LDR B
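        # Each iteration consumes 4 bytes (2 halffloats) per A row and 64
        # bytes of B; the second pair of B loads is interleaved with the
        # first FMA group.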
1:
        LDR   s0,  [x3], 4
        LDR  q16, [x5], 16
        LDR  q17, [x5], 16
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        SUBS x0, x0, 4
        FMLA v20.8h, v16.8h,  v0.h[0]
        FMLA v22.8h, v16.8h,  v1.h[0]
        FMLA v24.8h, v16.8h,  v2.h[0]
        FMLA v26.8h, v16.8h,  v3.h[0]
        LDR  q18, [x5], 16
        LDR  q19, [x5], 16
        FMLA v28.8h, v16.8h,  v4.h[0]
        FMLA v30.8h, v16.8h,  v5.h[0]
        FMLA v21.8h, v17.8h,  v0.h[0]
        FMLA v23.8h, v17.8h,  v1.h[0]
        FMLA v25.8h, v17.8h,  v2.h[0]
        FMLA v27.8h, v17.8h,  v3.h[0]
        FMLA v29.8h, v17.8h,  v4.h[0]
        FMLA v31.8h, v17.8h,  v5.h[0]

        FMLA v20.8h, v18.8h,  v0.h[1]
        FMLA v22.8h, v18.8h,  v1.h[1]
        FMLA v24.8h, v18.8h,  v2.h[1]
        FMLA v26.8h, v18.8h,  v3.h[1]
        FMLA v28.8h, v18.8h,  v4.h[1]
        FMLA v30.8h, v18.8h,  v5.h[1]
        FMLA v21.8h, v19.8h,  v0.h[1]
        FMLA v23.8h, v19.8h,  v1.h[1]
        FMLA v25.8h, v19.8h,  v2.h[1]
        FMLA v27.8h, v19.8h,  v3.h[1]
        FMLA v29.8h, v19.8h,  v4.h[1]
        FMLA v31.8h, v19.8h,  v5.h[1]
        B.HS 1b
        # Is there a remainder? - 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 3f
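
        # No remainder: fall through to scale and clamp. Bit 1 of the
        # remaining byte count flags a trailing 2-byte (1 halffloat) column.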
2:
        # Scale and Clamp
        FMUL v20.8h, v20.8h, v6.8h
        # Load params min and max values
        LD2R {v4.8h, v5.8h}, [x8]
        FMUL v21.8h, v21.8h, v6.8h
        FMUL v22.8h, v22.8h, v6.8h
        FMUL v23.8h, v23.8h, v6.8h
        FMUL v24.8h, v24.8h, v6.8h
        FMUL v25.8h, v25.8h, v6.8h
        FMUL v26.8h, v26.8h, v6.8h
        FMUL v27.8h, v27.8h, v6.8h
        FMUL v28.8h, v28.8h, v6.8h
        FMUL v29.8h, v29.8h, v6.8h
        FMUL v30.8h, v30.8h, v6.8h
        FMUL v31.8h, v31.8h, v6.8h
        # Load cn_stride
        LDR x0, [sp, 0]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v21.8h, v21.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v23.8h, v23.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v25.8h, v25.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v27.8h, v27.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v29.8h, v29.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        FMAX v31.8h, v31.8h, v4.8h
        SUBS x1, x1, 16
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v21.8h, v21.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v23.8h, v23.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v25.8h, v25.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v27.8h, v27.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v29.8h, v29.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h
        FMIN v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO 4f
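        # Full store: each ST1 advances its C pointer by cn_stride (x0), and
        # each SUB rewinds an A pointer by kc for the next 16-column tile.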

        $if INC:
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b
        RET

3:
        # Remainder - 1 halffloat of A (2 bytes)
        LDR   h0,  [x3], 2
        LDR  q16, [x5], 16
        LDR  q17, [x5], 16
        LDR   h1,  [x9], 2
        LDR   h2, [x10], 2
        LDR   h3, [x11], 2
        LDR   h4, [x12], 2
        LDR   h5,  [x4], 2
        FMLA v20.8h, v16.8h,  v0.h[0]
        FMLA v22.8h, v16.8h,  v1.h[0]
        FMLA v24.8h, v16.8h,  v2.h[0]
        FMLA v26.8h, v16.8h,  v3.h[0]
        FMLA v28.8h, v16.8h,  v4.h[0]
        FMLA v30.8h, v16.8h,  v5.h[0]
        FMLA v21.8h, v17.8h,  v0.h[0]
        FMLA v23.8h, v17.8h,  v1.h[0]
        FMLA v25.8h, v17.8h,  v2.h[0]
        FMLA v27.8h, v17.8h,  v3.h[0]
        FMLA v29.8h, v17.8h,  v4.h[0]
        FMLA v31.8h, v17.8h,  v5.h[0]
        B 2b

        # Store odd width
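        # The remaining nc (< 16) is stored by binary decomposition of x1:
        # bit 3 -> 8 halves (STR q), bit 2 -> 4 (STR d), bit 1 -> 2 (STR s),
        # bit 0 -> 1 (STR h), shifting the surviving lanes down after each.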
4:
        TBZ x1, 3, 5f
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b

5:
        TBZ x1, 2, 6f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

6:
        TBZ x1, 1, 7f
        $if INC:
          STR s30,  [x7], 4
          DUP s30, v30.s[1]
          STR s28, [x13], 4
          DUP s28, v28.s[1]
          STR s26, [x14], 4
          DUP s26, v26.s[1]
          STR s24, [x17], 4
          DUP s24, v24.s[1]
          STR s22, [x16], 4
          DUP s22, v22.s[1]
          STR s20,  [x6], 4
          DUP s20, v20.s[1]
        $else:
          STR s20,  [x6], 4
          DUP s20, v20.s[1]
          STR s22, [x16], 4
          DUP s22, v22.s[1]
          STR s24, [x17], 4
          DUP s24, v24.s[1]
          STR s26, [x14], 4
          DUP s26, v26.s[1]
          STR s28, [x13], 4
          DUP s28, v28.s[1]
          STR s30,  [x7], 4
          DUP s30, v30.s[1]

7:
        TBZ x1, 0, 8f
        $if INC:
          STR h30,  [x7]
          STR h28, [x13]
          STR h26, [x14]
          STR h24, [x17]
          STR h22, [x16]
          STR h20,  [x6]
        $else:
          STR h20,  [x6]
          STR h22, [x16]
          STR h24, [x17]
          STR h26, [x14]
          STR h28, [x13]
          STR h30,  [x7]
8:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif