// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// Project assembly helpers; BEGIN_FUNCTION / END_FUNCTION are presumably
// defined by this header (they are not defined in this file).
#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_gemm_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3 v0
# A1 x15 v1
# A2 x13 v2
# A3  x4 v3
# B   x5 v4  v5  v6  v7
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64

        # Computes int8 output tiles of up to 4 rows by 16 columns. Each pass
        # of the outer loop (label 0) accumulates one tile in v16-v31 as
        # int32, requantizes it to int8 and stores it.
        # Only caller-saved registers are used (x0-x9, x11-x13, x15, v0-v7,
        # v16-v31) and the stack is only read, so nothing is saved or restored.

        # Clamp A and C pointers
        # A row index at or beyond mr reuses the pointers of the row before
        # it, so the extra rows recompute and rewrite an existing row rather
        # than touching memory outside the mr valid rows.
        CMP      x0, 2             // if mr < 2
        ADD      x2, x2, 3         // kc = (kc + 3) & ~3
        ADD     x15, x3, x4        // a1 = a0 + a_stride
        ADD      x8, x6, x7        // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO   //   a1 = a0
        CSEL     x8, x6,  x8, LO   //   c1 = c0
        BIC      x2, x2, 3         // finishes rounding kc up to a multiple of 4

        ADD     x13, x15, x4       // a2 = a1 + a_stride
        ADD      x9,  x8, x7       // c2 = c1 + cm_stride
                                   // flags are still those of CMP x0, 2:
                                   // ADD, CSEL and BIC do not modify them
                                   // if mr <= 2
        CSEL    x13, x15, x13, LS  //   a2 = a1
        CSEL     x9,  x8,  x9, LS  //   c2 = c1

        CMP      x0, 4             // if mr < 4
        ADD      x4, x13, x4       // a3 = a2 + a_stride
        ADD      x7,  x9, x7       // c3 = c2 + cm_stride
        CSEL     x4, x13, x4, LO   //   a3 = a2
        CSEL     x7,  x9, x7, LO   //   c3 = c2

        # Outer loop: one pass per tile of up to 16 output columns.
        .p2align 3
0:
        # Load initial bias from w into accumulators
        # The first 64 bytes at w are 16 int32 values, one per output column;
        # every row accumulator starts from a copy of them.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        LDR     x11, [sp, 8]       // params (reloaded per tile: x11 is post-incremented at label 2)
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 8          // k = kc - 8 (mr in x0 is no longer needed)
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 8 bytes?
        B.LO    3f

        # Main loop - 8 bytes of A
        # Per iteration: 8 bytes from each A row and 128 bytes of w.
        # Lane [0] of v0-v3 holds k bytes 0-3 and lane [1] holds k bytes 4-7;
        # each group of four k bytes is multiplied against 64 bytes of w
        # (16 columns x 4 k). Loads are interleaved with the SDOTs that
        # consume the previously loaded registers.
        .p2align 3
1:
        LDR     d0,  [x3], 8
        LDR     q4,  [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3,  [x4], 8
        LDR     q5,  [x5], 16
        SDOT    v16.4s, v4.16b,  v0.4b[0]
        SDOT    v17.4s, v4.16b,  v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b,  v2.4b[0]
        SDOT    v19.4s, v4.16b,  v3.4b[0]
        SDOT    v20.4s, v5.16b,  v0.4b[0]
        SDOT    v21.4s, v5.16b,  v1.4b[0]
        SDOT    v22.4s, v5.16b,  v2.4b[0]
        SDOT    v23.4s, v5.16b,  v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32   // weights for k bytes 4-7; v4/v5 are free here
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        SDOT    v16.4s, v4.16b,  v0.4b[1]
        SDOT    v17.4s, v4.16b,  v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b,  v2.4b[1]
        SDOT    v19.4s, v4.16b,  v3.4b[1]
        SDOT    v20.4s, v5.16b,  v0.4b[1]
        SDOT    v21.4s, v5.16b,  v1.4b[1]
        SDOT    v22.4s, v5.16b,  v2.4b[1]
        SDOT    v23.4s, v5.16b,  v3.4b[1]
        SDOT    v24.4s, v6.16b,  v0.4b[1]
        SDOT    v25.4s, v6.16b,  v1.4b[1]
        SDOT    v26.4s, v6.16b,  v2.4b[1]
        SDOT    v27.4s, v6.16b,  v3.4b[1]
        SDOT    v28.4s, v7.16b,  v0.4b[1]
        SDOT    v29.4s, v7.16b,  v1.4b[1]
        SDOT    v30.4s, v7.16b,  v2.4b[1]
        SUBS    x0, x0, 8          // k -= 8; SDOT does not touch the flags
        SDOT    v31.4s, v7.16b,  v3.4b[1]
        B.HS    1b

        # Is there a remainder?- 4 bytes of A
        # x0 is now -8 (kc was a multiple of 8) or -4 (one group of four
        # bytes is left); only -4 has bit 2 set.
        TBNZ    x0, 2, 3f

2:
        # Apply params - scale, shift, bias and clamp
        # LD2R broadcasts two consecutive 32-bit params: v0 is used as the
        # SQRDMULH multiplier and v1 as the SRSHL shift amount.
        # v2 is all-ones in lanes where the shift is zero.
        LD2R    {v0.4s, v1.4s}, [x11], 8
        CMEQ    v2.4s, v1.4s, 0

        # For each group of four accumulators: keep a copy of the raw
        # accumulator (zeroed where shift == 0), multiply, then add the sign
        # of that copy (-1 for negative inputs, 0 otherwise).
        # NOTE(review): this looks like the usual pre-adjustment that makes
        # the rounding shift below round ties away from zero - confirm
        # against the scalar reference.
        BIC     v4.16b, v16.16b, v2.16b
        BIC     v5.16b, v17.16b, v2.16b
        BIC     v6.16b, v18.16b, v2.16b
        BIC     v7.16b, v19.16b, v2.16b

        SQRDMULH  v16.4s, v16.4s, v0.4s
        SQRDMULH  v17.4s, v17.4s, v0.4s
        SQRDMULH  v18.4s, v18.4s, v0.4s
        SQRDMULH  v19.4s, v19.4s, v0.4s

        SSRA    v16.4s, v4.4s, 31  // signed shift right accumulate
        SSRA    v17.4s, v5.4s, 31
        SSRA    v18.4s, v6.4s, 31
        SSRA    v19.4s, v7.4s, 31

        BIC     v4.16b, v20.16b, v2.16b
        BIC     v5.16b, v21.16b, v2.16b
        BIC     v6.16b, v22.16b, v2.16b
        BIC     v7.16b, v23.16b, v2.16b

        SQRDMULH  v20.4s, v20.4s, v0.4s
        SQRDMULH  v21.4s, v21.4s, v0.4s
        SQRDMULH  v22.4s, v22.4s, v0.4s
        SQRDMULH  v23.4s, v23.4s, v0.4s

        SSRA    v20.4s, v4.4s, 31
        SSRA    v21.4s, v5.4s, 31
        SSRA    v22.4s, v6.4s, 31
        SSRA    v23.4s, v7.4s, 31

        BIC     v4.16b, v24.16b, v2.16b
        BIC     v5.16b, v25.16b, v2.16b
        BIC     v6.16b, v26.16b, v2.16b
        BIC     v7.16b, v27.16b, v2.16b

        SQRDMULH  v24.4s, v24.4s, v0.4s
        SQRDMULH  v25.4s, v25.4s, v0.4s
        SQRDMULH  v26.4s, v26.4s, v0.4s
        SQRDMULH  v27.4s, v27.4s, v0.4s

        SSRA    v24.4s, v4.4s, 31
        SSRA    v25.4s, v5.4s, 31
        SSRA    v26.4s, v6.4s, 31
        SSRA    v27.4s, v7.4s, 31

        BIC     v4.16b, v28.16b, v2.16b
        BIC     v5.16b, v29.16b, v2.16b
        BIC     v6.16b, v30.16b, v2.16b
        BIC     v7.16b, v31.16b, v2.16b

        SQRDMULH  v28.4s, v28.4s, v0.4s
        SQRDMULH  v29.4s, v29.4s, v0.4s
        SQRDMULH  v30.4s, v30.4s, v0.4s
        SQRDMULH  v31.4s, v31.4s, v0.4s

        SSRA    v28.4s, v4.4s, 31
        SSRA    v29.4s, v5.4s, 31
        SSRA    v30.4s, v6.4s, 31
        SSRA    v31.4s, v7.4s, 31

        # SRSHL shifts left for positive and right (with rounding) for
        # negative per-lane amounts; the shift param is presumably stored
        # negated so that this performs a right shift - verify against the
        # params initialization.
        SRSHL   v16.4s, v16.4s, v1.4s  // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v1.4s
        SRSHL   v18.4s, v18.4s, v1.4s
        SRSHL   v19.4s, v19.4s, v1.4s
        SRSHL   v20.4s, v20.4s, v1.4s
        SRSHL   v21.4s, v21.4s, v1.4s
        SRSHL   v22.4s, v22.4s, v1.4s
        SRSHL   v23.4s, v23.4s, v1.4s
        SRSHL   v24.4s, v24.4s, v1.4s
        SRSHL   v25.4s, v25.4s, v1.4s
        SRSHL   v26.4s, v26.4s, v1.4s
        SRSHL   v27.4s, v27.4s, v1.4s
        SRSHL   v28.4s, v28.4s, v1.4s
        SRSHL   v29.4s, v29.4s, v1.4s
        SRSHL   v30.4s, v30.4s, v1.4s
        SRSHL   v31.4s, v31.4s, v1.4s

        # Saturating narrow int32 -> int16. After the SQXTN/SQXTN2 pairs
        # v16-v19 hold columns 0-7 and v24-v27 hold columns 8-15 of rows 0-3.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v2.8h}, [x11], 2   // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v2.8h
        SQADD   v17.8h, v17.8h, v2.8h
        SQADD   v18.8h, v18.8h, v2.8h
        SQADD   v19.8h, v19.8h, v2.8h
        SQADD   v24.8h, v24.8h, v2.8h
        SQADD   v25.8h, v25.8h, v2.8h
        SQADD   v26.8h, v26.8h, v2.8h
        SQADD   v27.8h, v27.8h, v2.8h
        LD1R    {v0.16b}, [x11], 1  // clamp min value

        # Saturating narrow int16 -> int8: v4-v7 each hold the 16 output
        # bytes of one row.
        SQXTN    v4.8b, v16.8h
        SQXTN    v5.8b, v17.8h
        SQXTN    v6.8b, v18.8h
        SQXTN    v7.8b, v19.8h
        LD1R    {v1.16b}, [x11]     // clamp max value
        SQXTN2   v4.16b, v24.8h
        SQXTN2   v5.16b, v25.8h
        SQXTN2   v6.16b, v26.8h
        SQXTN2   v7.16b, v27.8h
        LDR     x12, [sp]   // cn_stride

        SMAX    v4.16b, v4.16b, v0.16b
        SMAX    v5.16b, v5.16b, v0.16b
        SMAX    v6.16b, v6.16b, v0.16b
        SMAX    v7.16b, v7.16b, v0.16b
        SUBS    x1, x1, 16          // nc -= 16; flags feed both B.LO and B.NE below
        SMIN    v4.16b, v4.16b, v1.16b
        SMIN    v5.16b, v5.16b, v1.16b
        SMIN    v6.16b, v6.16b, v1.16b
        SMIN    v7.16b, v7.16b, v1.16b
        B.LO    4f                  // fewer than 16 columns were left

        # Store full 4 x 16
        # Each C pointer advances by cn_stride and each A pointer is rewound
        # by the rounded kc so the next tile rereads the same A rows.
        ST1     {v4.16b}, [x6], x12
        SUB      x3,  x3, x2         // a0 -= kc
        ST1     {v5.16b}, [x8], x12
        SUB     x15, x15, x2         // a1 -= kc
        ST1     {v6.16b}, [x9], x12
        SUB     x13, x13, x2         // a2 -= kc
        ST1     {v7.16b}, [x7], x12
        SUB      x4,  x4, x2         // a3 -= kc
        B.NE    0b                   // more columns remain (nc != 0)
        RET

        # Remainder- 4 bytes of A
        # Reached when kc < 8 on entry or when four bytes are left after the
        # main loop. Consumes four bytes per A row and 64 bytes of w, then
        # rejoins the requantization code at label 2.
        .p2align 3
3:
        LDR     s0,  [x3], 4
        LDR     q4, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3,  [x4], 4
        SDOT    v16.4s, v4.16b,  v0.4b[0]
        LDR     q5, [x5], 16
        SDOT    v17.4s, v4.16b,  v1.4b[0]
        SDOT    v18.4s, v4.16b,  v2.4b[0]
        SDOT    v19.4s, v4.16b,  v3.4b[0]
        SDOT    v20.4s, v5.16b,  v0.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v21.4s, v5.16b,  v1.4b[0]
        SDOT    v22.4s, v5.16b,  v2.4b[0]
        SDOT    v23.4s, v5.16b,  v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        B       2b

        # Store odd width
        # x1 holds nc - 16 here; its low four bits equal those of the
        # remaining column count. Bits 3, 2, 1 and 0 select stores of 8, 4, 2
        # and 1 bytes per row. After each partial store DUP moves the next
        # unstored bytes down to the bottom of the register.
        .p2align 3
4:
        TBZ     x1, 3, 5f
        STR     d4, [x6], 8
        DUP     d4, v4.d[1]
        STR     d5, [x8], 8
        DUP     d5, v5.d[1]
        STR     d6, [x9], 8
        DUP     d6, v6.d[1]
        STR     d7, [x7], 8
        DUP     d7, v7.d[1]
5:
        TBZ     x1, 2, 6f
        STR     s4, [x6], 4
        DUP     s4, v4.s[1]
        STR     s5, [x8], 4
        DUP     s5, v5.s[1]
        STR     s6, [x9], 4
        DUP     s6, v6.s[1]
        STR     s7, [x7], 4
        DUP     s7, v7.s[1]
6:
        TBZ     x1, 1, 7f
        ST1     {v4.h}[0], [x6], 2
        DUP     h4, v4.h[1]
        ST1     {v5.h}[0], [x8], 2
        DUP     h5, v5.h[1]
        ST1     {v6.h}[0], [x9], 2
        DUP     h6, v6.h[1]
        ST1     {v7.h}[0], [x7], 2
        DUP     h7, v7.h[1]
7:
        TBZ     x1, 0, 8f
        ST1     {v4.b}[0], [x6]
        ST1     {v5.b}[0], [x8]
        ST1     {v6.b}[0], [x9]
        ST1     {v7.b}[0], [x7]
8:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64

#ifdef __ELF__
// Empty flags on .note.GNU-stack: this object does not need an executable stack.
.section ".note.GNU-stack","",%progbits
#endif
