// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Computes a tile of up to 4 rows by 8 columns of float outputs per pass over
# kc, clamps the result between two bounds loaded through the params pointer,
# and repeats for the next 8 columns while nc has columns left.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel uses none of them (only x0-x12, x14, x15, v0-v5 and v16-v31),
# so nothing is saved to or restored from the stack.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Other general purpose registers
# x0  mr on entry, then reused as k (bytes of kc still to be processed)
# x1  nc, decremented by 8 for each tile of columns
# x2  kc in bytes, kept to rewind the A pointers after each tile
# x5  w, advanced as weights are consumed
# x8  params pointer
# x14 cn_stride

# Vector registers
# v0-v3     A values for rows 0-3
# v4        lower clamping bound (operand of FMAX)
# v5        upper clamping bound (operand of FMIN)
# v16 v17   accumulators for row 0 (stored through c0)
# v18 v19   accumulators for row 1 (stored through c1)
# v28 v29   accumulators for row 2 (stored through c2)
# v30 v31   accumulators for row 3 (stored through c3)
# v20-v27   weights loaded from w

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_ld128

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load min/max values
        # LD2R broadcasts the first float at [x8] to every lane of v4 and the
        # second float to every lane of v5.
        LD2R {v4.4s, v5.4s}, [x8]

        # Clamp A and C pointers
        # Rows at index mr and above alias the previous row, so every load and
        # store below stays inside the rows that really exist.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        # ADD does not update the flags, so the LS selects below still test
        # the result of CMP x0, 2 above.
        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        # From here on x4 and x7 hold the row 3 pointers, not the strides.
        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

        # Outer loop - one tile of up to 8 output columns per iteration
0:
        $if INC:
          # Load initial accumulators
          # 32 floats are read from acc: 8 per row, rows 0 to 3 in order.
          LDP q16, q17, [x15], 32
          LDP q18, q19, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          # The same 8 bias floats seed all four rows.
          LDP q16, q17, [x5], 32
          MOV v18.16b, v16.16b
          MOV v19.16b, v17.16b
          MOV v28.16b, v16.16b
          MOV v29.16b, v17.16b
          MOV v30.16b, v16.16b
          MOV v31.16b, v17.16b

        # Are there at least 4 floats (16 bytes)?
        # x0 is repurposed here: mr is no longer needed once the pointers are set.
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # Each iteration consumes 4 floats from every A row and 32 floats of
        # weights (4 groups of 8), one group per A lane.
1:
        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        LDP q24, q25, [x5], 32
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q26, q27, [x5], 32
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        SUBS x0, x0, 16          // k -= 16; FMLA leaves the flags for B.HS intact
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]
        B.HS 1b

        # x0 has wrapped below zero by now, but its low 4 bits still equal the
        # number of kc bytes left over (0 to 15). Non-zero means a remainder.
        TST x0, 15
        B.NE 3f

2:
        # Clamp
        # The SUBS on nc is issued here, among the clamps; its flags are read
        # by B.LO 5f and B.HI 0b below. FMAX, FMIN, ST1 and SUB do not touch
        # the flags.
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        # Fewer than 8 columns left: take the partial store path instead.
        B.LO 5f

        # Each store advances its C pointer by cn_stride, and each A pointer
        # is rewound by kc so the next tile rereads the same A rows.
        $if INC:
          ST1 {v30.16b, v31.16b},  [x7], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x10], x14
          SUB x11, x11, x2 // a1 -= kc
          ST1 {v18.16b, v19.16b},  [x9], x14
          SUB x12, x12, x2 // a2 -= kc
          ST1 {v16.16b, v17.16b},  [x6], x14
          SUB  x4,  x4, x2 // a3 -= kc
        $else:
          ST1 {v16.16b, v17.16b},  [x6], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v18.16b, v19.16b},  [x9], x14
          SUB x11, x11, x2 // a1 -= kc
          ST1 {v28.16b, v29.16b}, [x10], x14
          SUB x12, x12, x2 // a2 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x14
          SUB  x4,  x4, x2 // a3 -= kc

        # Loop again while columns remain (nc was more than 8 before the SUBS).
        B.HI 0b
        RET

        # Remainder- 2 floats of A (8 bytes)
        # NOTE(review): when bit 3 of k is clear, control reaches label 4 and
        # always processes one float, so this path presumably relies on kc
        # being a non-zero multiple of 4 bytes - confirm against the caller.
3:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 4f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0,  [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3,  [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 2b

        # Remainder- 1 float of A (4 bytes)
4:
        LDR s0,  [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3,  [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        B 2b


        # Store odd width
        # x1 has already had 8 subtracted and wrapped, but its low 3 bits still
        # equal the number of columns left (1 to 7). Bit 2 selects a store of
        # 4 floats, bit 1 of 2 floats and bit 0 of 1 float per row. After each
        # partial store the next unwritten lanes are moved down to lane 0.
5:
        TBZ x1, 2, 6f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
        $else:
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

6:
        TBZ x1, 1, 7f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d16, [x6], 8
          DUP d16, v16.d[1]
        $else:
          STR d16, [x6], 8
          DUP d16, v16.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

7:
        TBZ x1, 0, 8f
        $if INC:
          STR s30,  [x7]
          STR s28, [x10]
          STR s18,  [x9]
          STR s16,  [x6]
        $else:
          STR s16,  [x6]
          STR s18,  [x9]
          STR s28, [x10]
          STR s30,  [x7]

8:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
