// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const void*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C0   v20
# C1   v22
# C2   v24
# C3   v26
# C4   v28
# C5   v30
# Scale v6, Clamp (v4), (v5)
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15
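# No callee-saved registers (d8-d15, x19-x30) are used, so the kernel needs no
# prologue/epilogue spills.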


BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
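        # For rows beyond mr, the CSELs below alias that row's A and C pointers
        # to the previous row, so the extra rows are computed redundantly but
        # never read or write out of bounds.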
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load params scale value
        LD1R {v6.8h}, [x8]
        ADD x8, x8, 2
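        # x8 now points at the min/max members of params; they are loaded into
        # v4/v5 with LD2R just before clamping.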
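        # Outer loop over nc: each iteration computes a 6x8 tile of C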
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q22, [x15], 32
          LDP q24, q26, [x15], 32
          LDP q28, q30, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDR q20, [x5], 16
          MOV v22.16b, v20.16b
          MOV v24.16b, v20.16b
          MOV v26.16b, v20.16b
          MOV v28.16b, v20.16b
          MOV v30.16b, v20.16b

        # Is there at least 4 halffloats (8 bytes)?
        SUBS x0, x2, 8  // k = kc - 8
        B.LO 3f

        # Main loop - 4 halffloats of A (8 bytes)
        # 24 FMA + 6 ld64 A + 4 LDR B
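        # B is loaded one k-slice at a time (v16-v19, 8 halffloats each); each
        # slice is multiplied by the matching A lane (h[0]-h[3]) of all 6 rows.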
1:
        LDR   d0,  [x3], 8
        LDR  q16, [x5], 16
        LDR  q17, [x5], 16
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        SUBS x0, x0, 8  // k -= 8
        FMLA v20.8h, v16.8h,  v0.h[0]
        FMLA v22.8h, v16.8h,  v1.h[0]
        FMLA v24.8h, v16.8h,  v2.h[0]
        FMLA v26.8h, v16.8h,  v3.h[0]
        FMLA v28.8h, v16.8h,  v4.h[0]
        FMLA v30.8h, v16.8h,  v5.h[0]
        LDR  q18, [x5], 16
        LDR  q19, [x5], 16

        FMLA v20.8h, v17.8h,  v0.h[1]
        FMLA v22.8h, v17.8h,  v1.h[1]
        FMLA v24.8h, v17.8h,  v2.h[1]
        FMLA v26.8h, v17.8h,  v3.h[1]
        FMLA v28.8h, v17.8h,  v4.h[1]
        FMLA v30.8h, v17.8h,  v5.h[1]

        FMLA v20.8h, v18.8h,  v0.h[2]
        FMLA v22.8h, v18.8h,  v1.h[2]
        FMLA v24.8h, v18.8h,  v2.h[2]
        FMLA v26.8h, v18.8h,  v3.h[2]
        FMLA v28.8h, v18.8h,  v4.h[2]
        FMLA v30.8h, v18.8h,  v5.h[2]

        FMLA v20.8h, v19.8h,  v0.h[3]
        FMLA v22.8h, v19.8h,  v1.h[3]
        FMLA v24.8h, v19.8h,  v2.h[3]
        FMLA v26.8h, v19.8h,  v3.h[3]
        FMLA v28.8h, v19.8h,  v4.h[3]
        FMLA v30.8h, v19.8h,  v5.h[3]
        B.HS 1b

        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 4f
        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 5f
2:
        # Scale and Clamp
        FMUL v20.8h, v20.8h, v6.8h
        # Load params min/max values (v4 = min, v5 = max)
        LD2R {v4.8h, v5.8h}, [x8]
        FMUL v22.8h, v22.8h, v6.8h
        FMUL v24.8h, v24.8h, v6.8h
        FMUL v26.8h, v26.8h, v6.8h
        FMUL v28.8h, v28.8h, v6.8h
        FMUL v30.8h, v30.8h, v6.8h
        # Load cn_stride
        LDR x0, [sp, 0]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        SUBS x1, x1, 8  // nc -= 8
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h

        # Store full 6 x 8
        B.LO 6f  // if nc < 8, store the partial tile

        $if INC:
          ST1 {v30.16b},  [x7], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b}, [x13], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v26.16b}, [x14], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v24.16b}, [x17], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v22.16b}, [x16], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v20.16b},  [x6], x0
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          ST1 {v20.16b},  [x6], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v22.16b}, [x16], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v24.16b}, [x17], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b}, [x14], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v28.16b}, [x13], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v30.16b},  [x7], x0
          SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b  // if nc > 0, compute the next 8 channels
        RET

3:
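        # kc was less than 8 bytes: process the remaining (at most 3) halffloats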
        TBZ x0, 2, 5f
4:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR   s0,  [x3], 4
        LDR  q16, [x5], 16
        LDR  q17, [x5], 16
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4

        FMLA v20.8h, v16.8h,  v0.h[0]
        FMLA v22.8h, v16.8h,  v1.h[0]
        FMLA v24.8h, v16.8h,  v2.h[0]
        FMLA v26.8h, v16.8h,  v3.h[0]
        FMLA v28.8h, v16.8h,  v4.h[0]
        FMLA v30.8h, v16.8h,  v5.h[0]

        FMLA v20.8h, v17.8h,  v0.h[1]
        FMLA v22.8h, v17.8h,  v1.h[1]
        FMLA v24.8h, v17.8h,  v2.h[1]
        FMLA v26.8h, v17.8h,  v3.h[1]
        FMLA v28.8h, v17.8h,  v4.h[1]
        FMLA v30.8h, v17.8h,  v5.h[1]

        TBZ x0, 1, 2b

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR   h0,  [x3], 2
        LDR  q16,  [x5], 16
        LDR   h1,  [x9], 2
        LDR   h2, [x10], 2
        LDR   h3, [x11], 2
        LDR   h4, [x12], 2
        LDR   h5,  [x4], 2
        FMLA v20.8h, v16.8h,  v0.h[0]
        FMLA v22.8h, v16.8h,  v1.h[0]
        FMLA v24.8h, v16.8h,  v2.h[0]
        FMLA v26.8h, v16.8h,  v3.h[0]
        FMLA v28.8h, v16.8h,  v4.h[0]
        FMLA v30.8h, v16.8h,  v5.h[0]
        B 2b

        # Store odd width
6:
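        # Store the nc remainder: 4, then 2, then 1 channels, selected by the low bits of nc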
        TBZ x1, 2, 7f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

7:
        TBZ x1, 1, 8f
        $if INC:
          STR s30,  [x7], 4
          DUP s30, v30.s[1]
          STR s28, [x13], 4
          DUP s28, v28.s[1]
          STR s26, [x14], 4
          DUP s26, v26.s[1]
          STR s24, [x17], 4
          DUP s24, v24.s[1]
          STR s22, [x16], 4
          DUP s22, v22.s[1]
          STR s20,  [x6], 4
          DUP s20, v20.s[1]
        $else:
          STR s20,  [x6], 4
          DUP s20, v20.s[1]
          STR s22, [x16], 4
          DUP s22, v22.s[1]
          STR s24, [x17], 4
          DUP s24, v24.s[1]
          STR s26, [x14], 4
          DUP s26, v26.s[1]
          STR s28, [x13], 4
          DUP s28, v28.s[1]
          STR s30,  [x7], 4
          DUP s30, v30.s[1]

8:
        TBZ x1, 0, 9f
        $if INC:
          STR h30,  [x7]
          STR h28, [x13]
          STR h26, [x14]
          STR h24, [x17]
          STR h22, [x16]
          STR h20,  [x6]
        $else:
          STR h20,  [x6]
          STR h22, [x16]
          STR h24, [x17]
          STR h26, [x14]
          STR h28, [x13]
          STR h30,  [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
