// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Vector register usage
# A0  v0  v4
# A1  v1  v5
# A2  v2  v6
# A3  v3  v7
# B   v8  v9 v10 v11
# B  v12 v13 v14 v15
# B  v20 v21 v22 v23
# B  v24 v25 v26 v27
# C  v16 v17
# C  v18 v19
# C  v28 v29
# C  v30 v31
# Clamp v4 v5

# Kernel body.  Each pass over labels 0 to 6 accumulates one tile of C that is
# 4 rows by 8 columns (two q registers per row) across the whole kc extent,
# clamps it and stores it.  The pass repeats while nc still has columns left.
# The stray line-number prefixes that had been fused onto every line (turning
# the local labels 0 to 10 into unrelated numbers) have been removed so that
# the numeric branch targets resolve again.
BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Stack arguments are read first, while sp still points at them
        # (sp is moved by the d8-d15 spill below).
        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load min/max values
        # LD2R reads two consecutive floats at x8 and broadcasts the first to
        # every lane of v4 and the second to every lane of v5.
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d15 on stack
        # v8-v15 are used as B registers below and their low halves are callee-saved.
        STP  d8,  d9, [sp, -64]!
        STP d10, d11, [sp, 16]
        STP d12, d13, [sp, 32]
        STP d14, d15, [sp, 48]

        # Clamp A and C pointers
        # Rows at index mr and above alias the previous row, so the loads and
        # stores issued for them stay inside the rows that really exist.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2  (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride  (x4 no longer holds a_stride)
        ADD x7, x10, x7          // c3 = c2 + cm_stride (x7 no longer holds cm_stride)
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

        # Start of one pass: one tile of 8 output columns for all 4 rows.
0:
        $if INC:
          # Load initial accumulators
          # 4 rows x 8 floats are read from acc (x15), which advances by 128 bytes.
          LDP q16, q17, [x15], 32
          LDP q18, q19, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          # The same 8 bias floats seed all four rows.
          LDP q16, q17, [x5], 32
          MOV v18.16b, v16.16b
          MOV v19.16b, v17.16b
          MOV v28.16b, v16.16b
          MOV v29.16b, v17.16b
          MOV v30.16b, v16.16b
          MOV v31.16b, v17.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 3f

        # Prologue - preload the first block: 4 floats (16 bytes) of A per row
        # and the matching 4 x 8 floats of B.
        LDR q0,  [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3,  [x4], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Are there at least 32 more bytes of A per row?  Yes: run the main loop.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Software pipelined: the loads for the next block of 4 are interleaved
        # with the FMAs of the current block.  v4-v7 hold A here, so the clamp
        # values loaded on entry are overwritten until the epilogue reloads them.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 192]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x3], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x11], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x12], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x4], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32          // k -= 32; the FMLAs below leave the flags intact
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]
        B.HS 1b

2:
        # Epilogue
        # Consumes the block that was preloaded by the prologue or by the last
        # loop iteration, loads one final block of 4 and finishes both without
        # preloading anything further.
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x3], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x11], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x12], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x4], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]

        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]

        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load min/max values
        # The FMLAs above were the last readers of v4/v5 as A rows 0 and 1, so
        # the clamp values can be restored here; the remaining FMLAs only read
        # v6/v7.
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

        # Remainder handling.  x0 is kc minus a multiple of 32 at this point,
        # so bits 4, 3 and 2 of x0 equal those of kc and select the leftover
        # 16, 8 and 4 bytes of A per row.
3:
        # Remainder- 4 floats of A (16 bytes)
        TBZ x0, 4, 4f

        LDR q0,  [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3,  [x4], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

4:
        # Remainder- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        LDR d0,  [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3,  [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

5:
        # Remainder- 1 float of A (4 bytes)
        TBZ x0, 2, 6f

        LDR s0,  [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3,  [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

6:
        # Clamp
        # v4 is applied with FMAX (lower bound) and v5 with FMIN (upper bound).
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8           // nc -= 8; these flags feed both B.LO 7f and B.HI 0b
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        # Fewer than 8 columns left: take the partial store path instead.
        B.LO 7f

        # After each row is stored, its A pointer is rewound by kc and its C
        # pointer is advanced by cn_stride, ready for the next pass.  None of
        # these instructions alter the flags set by SUBS above.
        $if INC:
          # This variant stores the rows from c3 down to c0.
          STP q30, q31,  [x7]
          SUB  x3,  x3, x2 // a0 -= kc
          ADD  x7,  x7, x14
          STP q28, q29, [x10]
          SUB x11, x11, x2 // a1 -= kc
          ADD x10, x10, x14
          STP q18, q19,  [x9]
          SUB x12, x12, x2 // a2 -= kc
          ADD  x9,  x9, x14
          STP q16, q17,  [x6]
          SUB  x4,  x4, x2 // a3 -= kc
          ADD  x6,  x6, x14
        $else:
          # This variant stores the rows from c0 up to c3.
          STP q16, q17,  [x6]
          SUB  x3,  x3, x2 // a0 -= kc
          ADD  x6,  x6, x14
          STP q18, q19,  [x9]
          SUB x11, x11, x2 // a1 -= kc
          ADD  x9,  x9, x14
          STP q28, q29, [x10]
          SUB x12, x12, x2 // a2 -= kc
          ADD x10, x10, x14
          STP q30, q31,  [x7]
          SUB  x4,  x4, x2 // a3 -= kc
          ADD  x7,  x7, x14

        # Columns remain (nc was above 8): run another pass.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

        # Store odd width
        # x1 now holds nc - 8, whose low three bits equal those of nc.  Bits 2,
        # 1 and 0 select stores of 4, 2 and 1 floats per row; after each store
        # the not yet stored lanes are moved down to the bottom of the register.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
        $else:
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

8:
        TBZ x1, 1, 9f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d16, [x6], 8
          DUP d16, v16.d[1]
        $else:
          STR d16, [x6], 8
          DUP d16, v16.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s30,  [x7]
          STR s28, [x10]
          STR s18,  [x9]
          STR s16,  [x6]
        $else:
          STR s16,  [x6]
          STR s18,  [x9]
          STR s28, [x10]
          STR s30,  [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET


END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

// On ELF targets, emit an empty .note.GNU-stack section so the linker does not
// mark the stack as executable because of this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif