1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemminc_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75(
13#     size_t mr,                (x0) - unused.  mr = 1
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          (x4) - unused
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         (x7) - unused
21#     size_t cn_stride,         [sp] -> x14
22#     const float*restrict acc,  [sp + 8] -> x15
23#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
24
25# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
26
27# A pointer
28# x3  a0
29
30# C pointer
31# x6  c0
32
33# Clamp v4 v5
34
35BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75
        // Overview: produces one row of output, 8 columns per pass:
        //   c[0..7] = clamp(acc[0..7] + sum over k of a[k] * w[k][0..7])
        // and loops back to label 0 while columns remain in nc.
        //
        // Register use inside this function:
        //   x0        k: kc minus the bytes of A already scheduled (may wrap)
        //   x1        nc, columns still to be written
        //   x2        kc in bytes (a0 is rewound by x2 after each full pass)
        //   x3        a0, post-incremented as A is read
        //   x5        w, post-incremented by 32 bytes (8 floats) per k step
        //   x6        c0
        //   x8        params pointer, x14 cn_stride, x15 acc pointer
        //   v0, v1    A values, 4 floats each
        //   v4, v5    clamp bounds: v4 is applied with FMAX, v5 with FMIN
        //   v16, v17  accumulators for columns 0-3 and 4-7
        //   v18, v19  second accumulator pair, added into v16/v17 at label 4
        //   v20-v27   B values (w) for four k steps
        // None of d8-d15 or x19-x30 is used, so nothing is saved on the stack.
        //
        // NOTE(review): the interleaving of loads and FMLAs looks deliberate
        // (see the "pipelining" remark below); keep instruction order as is.
36
37        # Load cn_stride, acc
38        LDP x14, x15, [sp]
39        # Load params pointer
40        LDR x8, [sp, 16]
41
42        # Load min/max values
        // LD2R reads two consecutive floats at [x8] and broadcasts the first
        // to every lane of v4 and the second to every lane of v5.
43        LD2R {v4.4s, v5.4s}, [x8]
        // Label 0: start of one pass over 8 output columns.
440:
45        # Load initial accumulators
        // 8 floats from acc; x15 advances by 32 bytes for the next pass.
46        LDP q16, q17, [x15], 32
47
48        MOVI v18.4s, 0  // second set of C for pipelining FMLA
49        PRFM PLDL1KEEP, [x5]
50        MOVI v19.4s, 0
51        PRFM PLDL1KEEP, [x5, 64]
52        PRFM PLDL1KEEP, [x5, 128]
53        PRFM PLDL1KEEP, [x5, 192]
54
55        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
56        SUBS x0, x2, 32  // k = kc - 32
57
        // kc < 32 bytes: skip the pipelined path, go straight to the
        // remainder handling at label 3.
58        B.LO 3f
59
60        # 16 prologue
61        # Read first block of 1 A and B.
        // Preloads w for four k steps (4 x 32 bytes) into v20-v27 and the
        // first 4 floats of A into v0.
62        LDP q20, q21, [x5], 32
63        LDP q22, q23, [x5], 32
64        LDP q24, q25, [x5], 32
65        LDP q26, q27, [x5], 32
66        LDR q0, [x3], 16
67
68        # Is there at least 32.  yes do main loop
        // Otherwise fall to the epilogue (label 2), which consumes the data
        // preloaded above plus one more block of 4.
69        SUBS x0, x0, 32
70        B.LO 2f
71
72        # Main loop - 8 floats of A (32 bytes)
        // Each iteration consumes v0 then v1 (8 k steps, 256 bytes of w) and
        // leaves v0 and v20-v27 reloaded for the next iteration or epilogue.
731:
74        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
75        FMLA v16.4s, v20.4s, v0.s[0]
76        LDR q1, [x3], 16
77        FMLA v17.4s, v21.4s, v0.s[0]
78        LDP q20, q21, [x5], 32
79        FMLA v18.4s, v22.4s, v0.s[1]
80        PRFM PLDL1KEEP, [x5, 96]
81        FMLA v19.4s, v23.4s, v0.s[1]
82        LDP q22, q23, [x5], 32
83        FMLA v16.4s, v24.4s, v0.s[2]
84        FMLA v17.4s, v25.4s, v0.s[2]
85        LDP q24, q25, [x5], 32
86        FMLA v18.4s, v26.4s, v0.s[3]
87        FMLA v19.4s, v27.4s, v0.s[3]
88        LDP q26, q27, [x5], 32
89
90        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
91        FMLA v16.4s, v20.4s, v1.s[0]
92        LDR q0, [x3], 16
93        FMLA v17.4s, v21.4s, v1.s[0]
94        LDP q20, q21, [x5], 32
95        FMLA v18.4s, v22.4s, v1.s[1]
96        FMLA v19.4s, v23.4s, v1.s[1]
97        LDP q22, q23, [x5], 32
98        FMLA v16.4s, v24.4s, v1.s[2]
99        FMLA v17.4s, v25.4s, v1.s[2]
100        LDP q24, q25, [x5], 32
101        FMLA v18.4s, v26.4s, v1.s[3]
102        FMLA v19.4s, v27.4s, v1.s[3]
        // The LDP below does not alter flags, so B.HS still sees this SUBS.
103        SUBS x0, x0, 32
104        LDP q26, q27, [x5], 32
105        B.HS 1b
106
1072:
108        # Epilogue
        // Same as one main-loop iteration, minus the loads for a following
        // iteration: the second block of 4 issues no loads.
109
110        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
111        FMLA v16.4s, v20.4s, v0.s[0]
112        LDR q1, [x3], 16
113        FMLA v17.4s, v21.4s, v0.s[0]
114        LDP q20, q21, [x5], 32
115        FMLA v18.4s, v22.4s, v0.s[1]
116        FMLA v19.4s, v23.4s, v0.s[1]
117        LDP q22, q23, [x5], 32
118        FMLA v16.4s, v24.4s, v0.s[2]
119        FMLA v17.4s, v25.4s, v0.s[2]
120        LDP q24, q25, [x5], 32
121        FMLA v18.4s, v26.4s, v0.s[3]
122        FMLA v19.4s, v27.4s, v0.s[3]
123        LDP q26, q27, [x5], 32
124
125        # Second block of 4.  no loads
126        FMLA v16.4s, v20.4s, v1.s[0]
127        FMLA v17.4s, v21.4s, v1.s[0]
128        FMLA v18.4s, v22.4s, v1.s[1]
129        FMLA v19.4s, v23.4s, v1.s[1]
130        FMLA v16.4s, v24.4s, v1.s[2]
131        FMLA v17.4s, v25.4s, v1.s[2]
132        FMLA v18.4s, v26.4s, v1.s[3]
133        FMLA v19.4s, v27.4s, v1.s[3]
134
        // Label 3: remainder dispatch.  x0 was only ever changed by
        // subtracting 32, so its bits 4, 3 and 2 still equal those of kc,
        // even when x0 has wrapped below zero.
1353:
136        # Is there a remainder?- 4 floats of A (16 bytes)
137        TBNZ x0, 4, 5f
138        # Is there a remainder?- 2 floats of A (8 bytes)
139        TBNZ x0, 3, 6f
140        # Is there a remainder?- 1 floats of A (4 bytes)
141        TBNZ x0, 2, 8f
142
        // Label 4: fold the two accumulator pairs, clamp, and store.
        // The SUBS here (nc -= 8) sets the flags tested by both B.LO and
        // B.HI below; the vector ops, STP, ADD and SUB in between leave the
        // flags untouched.
1434:
144        FADD v16.4s, v16.4s, v18.4s
145        SUBS x1, x1, 8
146        FADD v17.4s, v17.4s, v19.4s
147
148        # Clamp
149        FMAX v16.4s, v16.4s, v4.4s
150        FMAX v17.4s, v17.4s, v4.4s
151        FMIN v16.4s, v16.4s, v5.4s
152        FMIN v17.4s, v17.4s, v5.4s
153
154        # Store full 1 x 8
        // Fewer than 8 columns were left: take the partial store at label 9.
155        B.LO 9f
156
157        STP q16, q17, [x6]
158        ADD x6, x6, x14
159
160        SUB  x3,  x3, x2 // a0 -= kc
161
        // More columns remain (nc > 0 after the subtract): next pass.
162        B.HI 0b
163
164        RET
165
1665:
167        # Remainder- 4 floats of A (16 bytes)
168        LDP q20, q21, [x5], 32
169        LDR q0, [x3], 16
170        FMLA v16.4s, v20.4s, v0.s[0]
171        FMLA v17.4s, v21.4s, v0.s[0]
172        LDP q22, q23, [x5], 32
173        LDP q24, q25, [x5], 32
174        LDP q26, q27, [x5], 32
175        FMLA v18.4s, v22.4s, v0.s[1]
176        FMLA v19.4s, v23.4s, v0.s[1]
177        FMLA v16.4s, v24.4s, v0.s[2]
178        FMLA v17.4s, v25.4s, v0.s[2]
179        FMLA v18.4s, v26.4s, v0.s[3]
180        FMLA v19.4s, v27.4s, v0.s[3]
181
        // No 2-float remainder: skip to the 1-float test at label 7.
182        TBZ x0, 3, 7f
1836:
184        # Remainder- 2 floats of A (8 bytes)
185        LDP q20, q21, [x5], 32
186        LDR d0, [x3], 8
187        FMLA v16.4s, v20.4s, v0.s[0]
188        FMLA v17.4s, v21.4s, v0.s[0]
189        LDP q22, q23, [x5], 32
190        FMLA v18.4s, v22.4s, v0.s[1]
191        FMLA v19.4s, v23.4s, v0.s[1]
1927:
        // No 1-float remainder: all of k is consumed, go store.
193        TBZ x0, 2, 4b
1948:
195        # Remainder- 1 float of A (4 bytes)
196        LDP q20, q21, [x5], 32
197        LDR s0, [x3], 4
198        FMLA v16.4s, v20.4s, v0.s[0]
199        FMLA v17.4s, v21.4s, v0.s[0]
200        B 4b
201
202        # Store odd channels
        // x1 holds nc - 8 here; subtracting 8 preserved bits 0-2 of nc, so
        // they select stores of 4, 2 and 1 floats in turn.
2039:
204        TBZ x1, 2, 10f
205        STR q16, [x6], 16
206        MOV v16.16b, v17.16b  // columns 4-7 become the next data to store
207
20810:
209        TBZ x1, 1, 11f
210        STR d16, [x6], 8
211        DUP d16, v16.d[1]  // move the upper two floats into the low half
212
21311:
214        TBZ x1, 0, 12f
215        STR s16, [x6]
21612:
217        RET
218
219END_FUNCTION xnn_f32_gemminc_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75
220
// On ELF targets, emit an empty .note.GNU-stack section.  By GNU toolchain
// convention this records that the object does not need an executable stack.
221#ifdef __ELF__
222.section ".note.GNU-stack","",%progbits
223#endif
224