1// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64(
9#     size_t mr,                (x0) - unused.  mr = 1
10#     size_t nc,                x1
11#     size_t kc,                x2 / x0
12#     const uint8_t*restrict a, x3
13#     size_t a_stride,          (x4) - unused
14#     const void*restrict w,    x5
15#     uint8_t*restrict c,       x6
16#     size_t cm_stride,         (x7) - unused
17#     size_t cn_stride,         [sp] -> x14
18$if INC:
19  #     const float*restrict acc,  [sp + 8] -> x15
20  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
21$else:
22  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointer
27# x3  a0
28
29# C pointer
30# x6  c0
31
32# Vector register usage
33# A0   v0
34# B   v20 v21 v22 v23
35# C   v16 v18
36# Scale v4, Clamp v5 (min), v6 (max)
37
38
# GEMM microkernel body: produces a single output row, up to 8 half-precision
# columns per pass of the outer loop, then scales and clamps before storing.
# Only caller-saved registers are touched, so nothing is spilled to the stack.
39BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64
40
        # Fetch the stack-passed arguments: cn_stride into x14 and the params
        # pointer into x8; the INC variant also fetches the acc pointer into x15.
41        $if INC:
42          # Load cn_stride, acc
43          LDP x14, x15, [sp]
44          # Load params pointer
45          LDR x8, [sp, 16]
46        $else:
47          # Load cn_stride, params pointer
48          LDP x14, x8, [sp]
49
50        # Load params values
        # LD3R replicates three consecutive halffloats at x8 across v4, v5 and v6.
        # They are used below as scale (FMUL), lower bound (FMAX), upper bound (FMIN).
51        LD3R {v4.8h, v5.8h, v6.8h}, [x8]
        # Outer loop (label 0): one pass per group of up to 8 output columns.
520:
        # Seed the primary accumulator v16 with 8 halffloats (16 bytes).
53        $if INC:
54          # Load initial accumulators
55          LDR q16, [x15], 16
56        $else:
57          # Load initial bias from w into accumulators
58          LDR q16, [x5], 16
59
        # Accumulators v16 and v18 take alternate k steps so consecutive FMLAs do
        # not depend on each other; they are summed at label 2.
60        MOVI v18.8h, 0  // second set of C for pipelining FMLA
61
62        # Are there at least 4 halffloats (8 bytes) of A?
63        SUBS x0, x2, 8  // k = kc - 8
64
        # Branch taken when kc is below 8 bytes: skip the main loop entirely.
65        B.LO 3f
66
67        # Main loop - 4 halffloats of A (8 bytes)
        # Each pass consumes 8 bytes of A and 64 bytes of w (four rows of 8 halffloats).
681:
69        LDR  d0, [x3], 8
70        LDR q20, [x5], 16
71        LDR q21, [x5], 16
72        LDR q22, [x5], 16
73        LDR q23, [x5], 16
74        SUBS x0, x0, 8
75        FMLA v16.8h, v20.8h, v0.h[0]
76        FMLA v18.8h, v21.8h, v0.h[1]
77        FMLA v16.8h, v22.8h, v0.h[2]
78        FMLA v18.8h, v23.8h, v0.h[3]
        # Repeat while the subtraction did not borrow, i.e. 8 or more bytes of A remain.
79        B.HS 1b
80
        # Here k is the leftover byte count minus 8. Subtracting 8 does not change
        # bits 1 and 2, so they still encode a leftover of 4 and/or 2 bytes.
        # NOTE(review): presumably kc is a nonzero multiple of 2 bytes - confirm with caller.
81        # Is there a remainder?- 2 halffloats of A (4 bytes)
82        TBNZ x0, 2, 4f
83        # Is there a remainder?- 1 halffloat of A (2 bytes)
84        TBNZ x0, 1, 5f
85
        # Epilogue (label 2): merge the two partial sums, then scale, clamp and store.
862:
87        FADD v16.8h, v16.8h, v18.8h
        # Decrement nc by 8. The flags set here are consumed by B.LO and B.HI below;
        # none of the instructions in between write the condition flags.
88        SUBS x1, x1, 8
89
90        # Scale and Clamp
91        FMUL v16.8h, v16.8h, v4.8h
92        FMAX v16.8h, v16.8h, v5.8h
93        FMIN v16.8h, v16.8h, v6.8h
94
95        # Store full 1 x 8
        # Fewer than 8 columns were left: take the partial store path at label 6.
96        B.LO 6f
97
        # Store 16 bytes and advance c0 by cn_stride (post-index register x14).
98        ST1 {v16.16b},  [x6], x14
99        SUB  x3,  x3, x2 // a0 -= kc
100
        # More columns remain (nc was above 8): start the next group.
101        B.HI 0b
102        RET
103
        # Short-kc entry (label 3): reached only when kc is below 8 bytes.
        # With bit 2 of k clear the code goes straight to the 2-byte remainder,
        # which presumes kc is not zero - TODO confirm against the caller contract.
1043:
105        TBZ x0, 2, 5f
1064:
107        # Remainder- 2 halffloats of A (4 bytes)
108        LDR  s0, [x3], 4
109        LDR q20, [x5], 16
110        LDR q21, [x5], 16
111        FMLA v16.8h, v20.8h, v0.h[0]
112        FMLA v18.8h, v21.8h, v0.h[1]
        # Bit 1 clear means no final single halffloat: go to the epilogue.
113        TBZ x0, 1, 2b
114
1155:
116        # Remainder- 1 halffloat of A (2 bytes)
117        LDR h0,  [x3], 2
118        LDR q20, [x5], 16
119        FMLA v16.8h, v20.8h, v0.h[0]
120        B 2b
121
122        # Store odd channels
        # Here x1 holds nc minus 8, whose low three bits equal those of the remaining
        # column count. Bits 2, 1, 0 select stores of 4, 2 and 1 halffloats in turn;
        # after each store DUP shifts the not yet stored lanes down to lane 0.
1236:
124        TBZ x1, 2, 7f
125        STR d16, [x6], 8
126        DUP d16, v16.d[1]
127
1287:
129        TBZ x1, 1, 8f
130        STR s16, [x6], 4
131        DUP s16, v16.s[1]
132
1338:
134        TBZ x1, 0, 9f
        # Final store, so c0 is not advanced.
135        STR h16, [x6]
1369:
137        RET
138
139END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64
140
// On ELF targets, emit an empty .note.GNU-stack section. By GNU toolchain
// convention this marks the object as not requiring an executable stack.
141#ifdef __ELF__
142.section ".note.GNU-stack","",%progbits
143#endif
144