// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/1x16-aarch64-neonfp16arith-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemminc_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# Half-precision GEMM micro-kernel for a single row of A and column tiles of
# 16 outputs.  For every tile the accumulators start from 16 halffloats read
# through acc, the products of A with the packed weights are added, and the
# result is multiplied by the scale and clamped before it is stored.
#
# Units, as used by the code below:
#   nc         counted in halffloat elements (16 are consumed per full tile)
#   kc         counted in bytes (4 bytes = 2 halffloats per main loop pass)
#   cn_stride  counted in bytes (added to the C pointer after a full tile)
# NOTE(review): the code assumes kc is a non-zero multiple of 2 bytes; the
# remainder path always consumes one halffloat.  Confirm against the callers.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers is touched here, so nothing is saved or restored.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Other general purpose registers
# x0   bytes of K still to process in the current tile
# x1   output columns still to produce
# x2   kc, kept to rewind a0 after each tile
# x5   w, advances monotonically through the packed weights
# x8   params pointer
# x14  cn_stride
# x15  acc, advances by 32 bytes per tile

# Vector registers
# A        v0
# B        v20 v21 v22 v23
# C        v16 v17
# C (2nd)  v18 v19   second accumulator set, added into v16 v17 before scaling

# Scale v4, clamp (min, max) v5, v6

BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

        # Load cn_stride, acc
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Load params values: three consecutive halffloats, each replicated
        # across all lanes of v4 (scale), v5 (lower bound), v6 (upper bound)
        LD3R {v4.8h, v5.8h, v6.8h}, [x8]
0:
        # Load initial accumulators (16 halffloats) and advance acc
        LDP q16, q17, [x15], 32

        MOVI v18.8h, 0  // second set of C for pipelining FMLA
        MOVI v19.8h, 0

        # Is there at least 2 halffloats (4 bytes)
        SUBS x0, x2, 4  // k = kc - 4

        # Fewer than 4 bytes of K: only the single halffloat remainder runs
        B.LO 3f

        # Main loop - 2 halffloats of A (4 bytes)
        # Each pass reads 64 bytes of weights: v20 v21 belong to the first
        # halffloat of A, v22 v23 to the second one.
1:
        LDR  s0, [x3], 4
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        SUBS x0, x0, 4
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v17.8h, v21.8h, v0.h[0]
        FMLA v18.8h, v22.8h, v0.h[1]
        FMLA v19.8h, v23.8h, v0.h[1]
        B.HS 1b

        # Is there a remainder?- 1 halffloats of A (2 bytes)
        # Here x0 has gone negative; bit 1 is set only when 2 bytes are left
        TBNZ x0, 1, 3f

2:
        # Fold the second accumulator set into the first
        FADD v16.8h, v16.8h, v18.8h
        FADD v17.8h, v17.8h, v19.8h
        # The flags set here are consumed by B.LO and B.HI further down; no
        # instruction in between writes the condition flags.
        SUBS x1, x1, 16

        # Scale and Clamp

        FMUL v16.8h, v16.8h, v4.8h
        FMUL v17.8h, v17.8h, v4.8h
        FMAX v16.8h, v16.8h, v5.8h
        FMAX v17.8h, v17.8h, v5.8h
        FMIN v16.8h, v16.8h, v6.8h
        FMIN v17.8h, v17.8h, v6.8h

        # Store full 1 x 16
        # Fewer than 16 columns were left: take the partial store path
        B.LO 4f

        STP q16, q17, [x6]
        ADD x6, x6, x14

        SUB  x3,  x3, x2 // a0 -= kc

        # More columns remain: start the next tile
        B.HI 0b

        RET

3:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR h0, [x3], 2
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v17.8h, v21.8h, v0.h[0]
        B 2b

        # Store odd channels
        # x1 holds (columns left - 16); subtracting 16 leaves the low four
        # bits unchanged, so bits 3..0 select stores of 8, 4, 2 and 1
        # halffloats.  After each store the unstored lanes move down to the
        # bottom of v16.
4:
        TBZ x1, 3, 5f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

5:
        TBZ x1, 2, 6f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

6:
        TBZ x1, 1, 7f
        STR s16, [x6], 4
        DUP s16, v16.s[1]

7:
        TBZ x1, 0, 8f
        STR h16, [x6]
8:
        RET

END_FUNCTION xnn_f16_gemminc_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

// On ELF targets, emit an empty .note.GNU-stack section so that GNU linkers
// do not treat this object as requiring an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
135