// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Scale v4, Clamp v5 (min), v6 (max)

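# A vectors v0 v1 v2 v3
# B vectors v20 v21 v22 v23

# Accumulators
# v16 v17  c0
# v18 v19  c1
# v28 v29  c2
# v30 v31  c3
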
BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load params values: scale (v4), min (v5), max (v6)
        LD3R {v4.8h, v5.8h, v6.8h}, [x8]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

0:
        $if INC:
          # Load initial accumulators
          LDP q16, q17, [x15], 32
          LDP q18, q19, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP q16, q17, [x5], 32
          MOV v18.16b, v16.16b
          MOV v19.16b, v17.16b
          MOV v28.16b, v16.16b
          MOV v29.16b, v17.16b
          MOV v30.16b, v16.16b
          MOV v31.16b, v17.16b

        # Are there at least 2 halffloats (4 bytes)?
        SUBS x0, x2, 4  // k = kc - 4
        B.LO 3f

        # Main loop - 2 halffloats of A (4 bytes)
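        # Each iteration consumes 2 halffloats (32 bits) from every A row and
        # 4 B vectors (2 k-steps x 16 channels), issuing 16 FMLAs.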
1:
        LDR  s0,  [x3], 4
        LDR q20,  [x5], 16
        LDR q21,  [x5], 16
        LDR  s1, [x11], 4
        LDR  s2, [x12], 4
        LDR  s3,  [x4], 4
        SUBS x0, x0, 4
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v17.8h, v21.8h, v0.h[0]
        FMLA v18.8h, v20.8h, v1.h[0]
        FMLA v19.8h, v21.8h, v1.h[0]
        LDR q22,  [x5], 16
        LDR q23,  [x5], 16
        FMLA v28.8h, v20.8h, v2.h[0]
        FMLA v29.8h, v21.8h, v2.h[0]
        FMLA v30.8h, v20.8h, v3.h[0]
        FMLA v31.8h, v21.8h, v3.h[0]
        FMLA v16.8h, v22.8h, v0.h[1]
        FMLA v17.8h, v23.8h, v0.h[1]
        FMLA v18.8h, v22.8h, v1.h[1]
        FMLA v19.8h, v23.8h, v1.h[1]
        FMLA v28.8h, v22.8h, v2.h[1]
        FMLA v29.8h, v23.8h, v2.h[1]
        FMLA v30.8h, v22.8h, v3.h[1]
        FMLA v31.8h, v23.8h, v3.h[1]
        B.HS 1b

        # Is there a remainder? - 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 3f

2:
        # Scale and Clamp
        FMUL v16.8h, v16.8h, v4.8h
        SUBS x1, x1, 16          // nc -= 16
        FMUL v17.8h, v17.8h, v4.8h
        FMUL v18.8h, v18.8h, v4.8h
        FMUL v19.8h, v19.8h, v4.8h
        FMUL v28.8h, v28.8h, v4.8h
        FMUL v29.8h, v29.8h, v4.8h
        FMUL v30.8h, v30.8h, v4.8h
        FMUL v31.8h, v31.8h, v4.8h
        FMAX v16.8h, v16.8h, v5.8h
        FMAX v17.8h, v17.8h, v5.8h
        FMAX v18.8h, v18.8h, v5.8h
        FMAX v19.8h, v19.8h, v5.8h
        FMAX v28.8h, v28.8h, v5.8h
        FMAX v29.8h, v29.8h, v5.8h
        FMAX v30.8h, v30.8h, v5.8h
        FMAX v31.8h, v31.8h, v5.8h
        FMIN v16.8h, v16.8h, v6.8h
        FMIN v17.8h, v17.8h, v6.8h
        FMIN v18.8h, v18.8h, v6.8h
        FMIN v19.8h, v19.8h, v6.8h
        FMIN v28.8h, v28.8h, v6.8h
        FMIN v29.8h, v29.8h, v6.8h
        FMIN v30.8h, v30.8h, v6.8h
        FMIN v31.8h, v31.8h, v6.8h

        # Store full 4 x 16
        B.LO 4f                  // if nc < 16, jump to partial store

        $if INC:
          ST1 {v30.16b, v31.16b},  [x7], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x10], x14
          SUB x11, x11, x2 // a1 -= kc
          ST1 {v18.16b, v19.16b},  [x9], x14
          SUB x12, x12, x2 // a2 -= kc
          ST1 {v16.16b, v17.16b},  [x6], x14
          SUB  x4,  x4, x2 // a3 -= kc
        $else:
          ST1 {v16.16b, v17.16b},  [x6], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v18.16b, v19.16b},  [x9], x14
          SUB x11, x11, x2 // a1 -= kc
          ST1 {v28.16b, v29.16b}, [x10], x14
          SUB x12, x12, x2 // a2 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x14
          SUB  x4,  x4, x2 // a3 -= kc

        B.HI 0b                  // loop if nc is still greater than 0

        RET

        # Remainder - 1 halffloat of A (2 bytes)
3:
        LDR  h0,  [x3], 2
        LDR q20,  [x5], 16
        LDR q21,  [x5], 16
        LDR  h1, [x11], 2
        LDR  h2, [x12], 2
        LDR  h3,  [x4], 2
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v17.8h, v21.8h, v0.h[0]
        FMLA v18.8h, v20.8h, v1.h[0]
        FMLA v19.8h, v21.8h, v1.h[0]
        FMLA v28.8h, v20.8h, v2.h[0]
        FMLA v29.8h, v21.8h, v2.h[0]
        FMLA v30.8h, v20.8h, v3.h[0]
        FMLA v31.8h, v21.8h, v3.h[0]
        B 2b

        # Store odd width
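        # nc bits 3..0 select the remaining 8, 4, 2 and 1 columns; after each
        # partial store the surviving channels are shifted down into the low
        # lanes of the vectors.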
4:
        TBZ x1, 3, 5f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
        $else:
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

5:
        TBZ x1, 2, 6f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d16, [x6], 8
          DUP d16, v16.d[1]
        $else:
          STR d16, [x6], 8
          DUP d16, v16.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

6:
        TBZ x1, 1, 7f
        $if INC:
          STR s30,  [x7], 4
          DUP s30, v30.s[1]
          STR s28, [x10], 4
          DUP s28, v28.s[1]
          STR s18,  [x9], 4
          DUP s18, v18.s[1]
          STR s16,  [x6], 4
          DUP s16, v16.s[1]
        $else:
          STR s16,  [x6], 4
          DUP s16, v16.s[1]
          STR s18,  [x9], 4
          DUP s18, v18.s[1]
          STR s28, [x10], 4
          DUP s28, v28.s[1]
          STR s30,  [x7], 4
          DUP s30, v30.s[1]

7:
        TBZ x1, 0, 8f
        $if INC:
          STR h30,  [x7]
          STR h28, [x10]
          STR h18,  [x9]
          STR h16,  [x6]
        $else:
          STR h16,  [x6]
          STR h18,  [x9]
          STR h28, [x10]
          STR h30,  [x7]
8:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
