// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

.syntax unified

// void xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64(
//     size_t mr,                            r0
//     size_t nc,                            r1
//     size_t kc,                            r2 -> r5
//     const uint8_t*restrict a,             r3
//     size_t a_stride,          sp + 96  -> (r11)
//     const void*restrict w,    sp + 100 -> r9
//     uint8_t*restrict c,       sp + 104 -> r6
//     size_t cm_stride,         sp + 108 -> (r7)
//     size_t cn_stride,         sp + 112 -> r11
//     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])  sp + 116 -> (r11)
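//
// Computes, in effect, an mr x nc block of the output (mr <= 4, columns
// handled in groups of 4):
//
//   c[m][n] = bias[n] + sum over k < kc/sizeof(float) of a[m][k] * b[k][n]
//
// A rows are a_stride bytes apart, C rows are cm_stride bytes apart, and each
// C pointer advances by cn_stride bytes after every group of 4 columns.
// The params argument is not read by this (non-minmax) microkernel.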


// inner loop registers

// A0   r3  s0-s1  d0
// A1  r12  s2-s3  d1
// A2  r10  s4-s5  d2
// A3   r0  s6-s7  d3

// B    r9   s8,  s9, s10, s11 d4-d5
// B        s12, s13, s14, s15 d6-d7

// C0   r6 s16-s17  d8  s18-s19  d9
// C1   r4 s20-s21 d10  s22-s23 d11
// C2   r8 s24-s25 d12  s26-s27 d13
// C3   r7 s28-s29 d14  s30-s31 d15
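//
// The 4x4 accumulator tile lives entirely in s16-s31 (d8-d15), which are
// callee-saved under the AAPCS, hence the VPUSH/VPOP of d8-d15 below.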

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64
        .arm
#ifndef __APPLE__
        .arch armv6
        .fpu vfp
#endif
        // Push 96 bytes
        PUSH   {r4, r5, r6, r7, r8, r9, r10, r11}  // 32
        VPUSH  {d8-d15}                            // +64 = 96

        LDR     r11, [sp, 96]         // Load a_stride
        LDRD    r6, r7, [sp, 104]     // Load c and cm_stride

        // Clamp A and C pointers
        CMP    r0, 2                 // if mr >= 2
        ADD    r12, r3, r11          //   a1 = a0 + a_stride
        ADD    r4, r6, r7            //   c1 = c0 + cm_stride
        MOVLO  r12, r3               // a1
        MOVLO  r4, r6                // c1

        LDR     r9, [sp, 100]        // Load w

                                     // if mr > 2
        ADD    r10, r12, r11         //   a2 = a1 + a_stride
        ADD    r8, r4, r7            //   c2 = c1 + cm_stride
        MOVLS  r10, r12              // a2
        MOVLS  r8, r4                // c2

        CMP    r0, 4                 // if mr >= 4
        ADD    r0, r10, r11          //   a3 = a2 + a_stride
        ADD    r7, r8, r7            //   c3 = c2 + cm_stride
        LDR    r11, [sp, 112]        // Load cn_stride
        MOVLO  r0, r10               // a3
        MOVLO  r7, r8                // c3
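        // When mr < 4, the unused rows alias the previous row's A and C
        // pointers, so they redo that row's work but never touch memory
        // outside the valid rows.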

0:
        // Load initial bias from w into accumulators
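        // d8-d9 hold the 4 bias floats for this column group; the VMOVs
        // below replicate them into the accumulators for rows 1-3 (d10-d15).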
        VLDM        r9!, {d8-d9}   // Bias
        SUBS        r5, r2, 8
        VMOV.F64    d10, d8
        VMOV.F64    d12, d8
        VMOV.F64    d14, d8
        VMOV.F64    d11, d9
        VMOV.F64    d13, d9
        VMOV.F64    d15, d9
        BLO         3f               // less than 2 floats (8 bytes) of K?

        // Main loop - 2 floats of A (8 bytes)
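        // Each iteration consumes 2 floats from each of the four A rows
        // (d0-d3) and 2 rows of 4 B values (d4-d5, then d6-d7), issuing
        // 32 VMLA.F32 to accumulate both K steps into the 4x4 tile.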
1:
        VLDM        r3!, {d0}        // A0
        VLDM        r9!, {d4-d5}     // B0
        VLDM       r12!, {d1}        // A1
        VLDM       r10!, {d2}        // A2
        VLDM        r0!, {d3}        // A3

        VMLA.F32    s16, s8, s0
        VMLA.F32    s17, s9, s0
        VMLA.F32    s20, s8, s2
        VMLA.F32    s21, s9, s2
        VMLA.F32    s24, s8, s4
        VMLA.F32    s25, s9, s4
        VMLA.F32    s28, s8, s6
        VMLA.F32    s29, s9, s6

        VLDM        r9!, {d6-d7}     // B1

        VMLA.F32    s18, s10, s0
        VMLA.F32    s19, s11, s0
        VMLA.F32    s22, s10, s2
        VMLA.F32    s23, s11, s2
        VMLA.F32    s26, s10, s4
        VMLA.F32    s27, s11, s4
        VMLA.F32    s30, s10, s6
        VMLA.F32    s31, s11, s6

        VMLA.F32    s16, s12, s1
        VMLA.F32    s17, s13, s1
        VMLA.F32    s20, s12, s3
        VMLA.F32    s21, s13, s3
        VMLA.F32    s24, s12, s5
        VMLA.F32    s25, s13, s5
        VMLA.F32    s28, s12, s7
        VMLA.F32    s29, s13, s7

        SUBS        r5, r5, 8

        VMLA.F32    s18, s14, s1
        VMLA.F32    s19, s15, s1
        VMLA.F32    s22, s14, s3
        VMLA.F32    s23, s15, s3
        VMLA.F32    s26, s14, s5
        VMLA.F32    s27, s15, s5
        VMLA.F32    s30, s14, s7
        VMLA.F32    s31, s15, s7

        BHS         1b

        // Is there a remainder? - 1 float of A (4 bytes)
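        // r5 is now negative; since kc is assumed to be a multiple of 4 bytes,
        // bit 2 of r5 is set exactly when one unprocessed float of K remains.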
        TST         r5, 4
        BNE         3f

2:

        SUBS        r1, r1, 4
        BLO         4f

        // Store full 4 x 4
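        // Write the 4x4 tile, rewind each A pointer by kc bytes so the same
        // rows of A are reused for the next column group, advance each C
        // pointer by cn_stride, and loop back to 0b while columns remain.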
        VSTM        r6, {d8-d9}
        SUB         r0, r0, r2
        ADD         r6, r11
        VSTM        r4, {d10-d11}
        SUB         r10, r10, r2
        ADD         r4, r11
        VSTM        r8, {d12-d13}
        SUB         r12, r12, r2
        ADD         r8, r11
        VSTM        r7, {d14-d15}
        SUB         r3, r3, r2
        ADD         r7, r11
        BHI         0b

        VPOP        {d8-d15}
        POP         {r4, r5, r6, r7, r8, r9, r10, r11}
        BX          lr

3:
        // Remainder - 1 float of A (4 bytes)
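        // Loads the final float from each A row and 4 floats of B (d6-d7),
        // accumulates 16 VMLA.F32, then rejoins the store code at 2:.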
        VLDM         r3!, {s0}       // A0
        VLDM         r9!, {d6-d7}    // B
        VLDM        r12!, {s1}       // A1
        VLDM        r10!, {s2}       // A2
        VLDM         r0!, {s3}       // A3

        VMLA.F32    s16, s12, s0
        VMLA.F32    s17, s13, s0
        VMLA.F32    s18, s14, s0
        VMLA.F32    s19, s15, s0

        VMLA.F32    s20, s12, s1
        VMLA.F32    s21, s13, s1
        VMLA.F32    s22, s14, s1
        VMLA.F32    s23, s15, s1

        VMLA.F32    s24, s12, s2
        VMLA.F32    s25, s13, s2
        VMLA.F32    s26, s14, s2
        VMLA.F32    s27, s15, s2

        VMLA.F32    s28, s12, s3
        VMLA.F32    s29, s13, s3
        VMLA.F32    s30, s14, s3
        VMLA.F32    s31, s15, s3

        B           2b

        // Store odd width
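        // nc < 4 here: store 2 floats per row when bit 1 of nc is set (moving
        // the next accumulator value down for the tail), then 1 float when
        // bit 0 of nc is set.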
4:
        TST        r1, 2
        BEQ        5f
        VSTM       r6!, {d8}
        VMOV.F32   s16, s18
        VSTM       r4!, {d10}
        VMOV.F32   s20, s22
        VSTM       r8!, {d12}
        VMOV.F32   s24, s26
        VSTM       r7!, {d14}
        VMOV.F32   s28, s30

5:
        TST         r1, 1
        BEQ         6f
        VSTR        s16, [r6]
        VSTR        s20, [r4]
        VSTR        s24, [r8]
        VSTR        s28, [r7]

6:
        VPOP        {d8-d15}
        POP         {r4, r5, r6, r7, r8, r9, r10, r11}
        BX          lr

END_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif