// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfma_ld64(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Computes one row of C, 8 columns per pass:
#   c0[0..7] = clamp(initial + sum over k of a0[k] * w[k][0..7])
# where the initial value is read from w (bias) or, in the INC variant, from acc.
#
# NOTE(review): the code assumes kc is a non-zero multiple of 4 bytes (one
# float) and nc is non-zero; neither is checked here - confirm against callers.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel touches none of them and does not use the stack for storage.

# Register usage
# x0   k  - bytes of A still to process for the current tile, biased by -8
# x1   nc - output columns still to produce
# x2   kc - bytes of A per pass; used to rewind a0 after a full tile
# x5   w  - packed weights; advances 32 bytes (8 floats) per float of A
# x8   params pointer
# x14  cn_stride
# x15  acc pointer (INC variant only)

# A pointer
# x3  a0

# C pointer
# x6  c0

# Vector registers
# v0        A values (1 or 2 floats)
# v16 v17   accumulators for columns 0-3 and 4-7
# v18 v19   second accumulator set, summed into v16 v17 before clamping
# v20-v23   B values loaded from w

# Clamp v4 v5
# v4 is the lower bound (applied with FMAX), v5 the upper bound (applied with FMIN).

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfma_ld64

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load min/max values
        # LD2R reads two consecutive floats and broadcasts the first to every
        # lane of v4 and the second to every lane of v5.
        LD2R {v4.4s, v5.4s}, [x8]

        # Start of one tile of up to 8 output columns.
0:
        $if INC:
          # Load initial accumulators
          LDP q16, q17, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP q16, q17, [x5], 32

        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        MOVI v19.4s, 0

        # Are there at least 2 floats (8 bytes) of A?
        SUBS x0, x2, 8  // k = kc - 8

        # Fewer than 8 bytes: go straight to the single-float remainder.
        B.LO 3f

        # Main loop - 2 floats of A (8 bytes)
        # Even k steps accumulate into v16 v17 and odd k steps into v18 v19, so
        # consecutive FMLAs do not depend on each other.
1:
        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        SUBS x0, x0, 8
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        B.HS 1b

        # Is there a remainder? - 1 float of A (4 bytes)
        # x0 is negative here; with kc a multiple of 4 it is -4 (bit 2 set, one
        # float left) or -8 (bit 2 clear, nothing left).
        TBNZ x0, 2, 3f

2:
        # Fold the second accumulator set into the first.
        FADD v16.4s, v16.4s, v18.4s
        FADD v17.4s, v17.4s, v19.4s
        # nc -= 8.  The flags set here are consumed by B.LO and B.HI below; no
        # instruction in between writes the flags.
        SUBS x1, x1, 8

        # Clamp
        FMAX v16.4s, v16.4s, v4.4s
        FMAX v17.4s, v17.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s

        # Store full 1 x 8
        # Fewer than 8 columns were left: take the partial store path.
        B.LO 4f

        STP q16, q17, [x6]
        ADD x6, x6, x14

        SUB  x3,  x3, x2 // a0 -= kc

        # Loop while columns remain (nc was greater than 8 before the subtract).
        B.HI 0b

        RET

3:
        # Remainder- 1 float of A (4 bytes)
        LDP q20, q21, [x5], 32
        LDR s0, [x3], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 2b

        # Store odd channels
        # x1 holds nc - 8 here; subtracting 8 leaves the low 3 bits unchanged,
        # so bits 2, 1 and 0 select stores of 4, 2 and 1 columns respectively.
4:
        TBZ x1, 2, 5f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b  // remaining columns move down into v16

5:
        TBZ x1, 1, 6f
        STR d16, [x6], 8
        DUP d16, v16.d[1]  // upper pair of floats moves into the low half

6:
        TBZ x1, 0, 7f
        STR s16, [x6]
7:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfma_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
