1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
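# void xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma(
#     size_t channels,                   x0
#     size_t output_width,               x1
#     const float** input,               x2
#     const float* weights,              x3
#     float* output,                     x4
#     size_t input_stride,               x5
#     size_t output_increment,           x6
#     size_t input_offset,               (x7) -> x20
#     const float* zero,                 [sp + 80] -> x19
#     const xnn_f32_minmax_params params [sp + 88] -> (x20)
#
# Stack offsets are relative to sp after the prologue has pushed 80 bytes;
# at function entry zero and params sit at [sp] and [sp + 8].
# Parentheses mark a register that holds the value only temporarily:
# x7 is reused for i0 once input_offset has been copied to x20, and x20
# holds the params pointer only until min/max have been loaded.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.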
21
BEGIN_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma

        # For every output pixel this kernel reads 9 input row pointers,
        # accumulates 9 fused multiply-adds per channel on top of a
        # per-channel seed taken from the weights, clamps the result to
        # [min, max] and stores it, 4 channels (one q register) at a time.
        #
        # Register roles inside the pixel loop:
        #   x7-x15  i0-i8, the 9 input row pointers
        #   x16     c, channels left for the current pixel
        #   x17     w, running weights pointer
        #   x19     zero pointer
        #   x20     params pointer, then input_offset
        #   v30     min, v31 max (each broadcast to all 4 lanes)

        # Save x19-x20,d8-d15 on stack (q8-q15 are used as scratch below)
        STP x19, x20, [sp, -80]!
        STP  d8,  d9, [sp, 16]
        STP d10, d11, [sp, 32]
        STP d12, d13, [sp, 48]
        STP d14, d15, [sp, 64]

        # Load zero, params pointer (the stack arguments sit just above the
        # 80-byte save area)
        LDP x19, x20, [sp, 80]

        # Load min/max values: first float -> v30, second float -> v31
        LD2R {v30.4s, v31.4s}, [x20]
        MOV x20, x7               // input_offset (x7 is reused for i0 below)

0:
        # Pixel loop: one iteration per output pixel.
        #  x7 := i0
        #  x8 := i1
        LDP x7, x8, [x2]
        #  x9 := i2
        # x10 := i3
        LDP x9, x10, [x2, 16]
        # x11 := i4
        # x12 := i5
        LDP x11, x12, [x2, 32]
        # x13 := i6
        # x14 := i7
        LDP x13, x14, [x2, 48]
        # x15 := i8
        LDR x15, [x2, 64]

        # Rows equal to the zero pointer are left untouched; every other
        # row pointer is advanced by input_offset. ADD does not modify
        # the flags, so each CSEL still sees the result of its CMP.
        CMP x7, x19               // i0 == zero ?
        ADD x7, x7, x20           // tentative i0 + input_offset
        CSEL x7, x19, x7, EQ      // i0 = (i0 == zero) ? zero : i0 + input_offset
        CMP x8, x19               // i1 == zero ?
        ADD x8, x8, x20           // tentative i1 + input_offset
        CSEL x8, x19, x8, EQ      // i1 = (i1 == zero) ? zero : i1 + input_offset
        CMP x9, x19               // i2 == zero ?
        ADD x9, x9, x20           // tentative i2 + input_offset
        CSEL x9, x19, x9, EQ      // i2 = (i2 == zero) ? zero : i2 + input_offset
        CMP x10, x19              // i3 == zero ?
        ADD x10, x10, x20         // tentative i3 + input_offset
        CSEL x10, x19, x10, EQ    // i3 = (i3 == zero) ? zero : i3 + input_offset
        CMP x11, x19              // i4 == zero ?
        ADD x11, x11, x20         // tentative i4 + input_offset
        CSEL x11, x19, x11, EQ    // i4 = (i4 == zero) ? zero : i4 + input_offset
        CMP x12, x19              // i5 == zero ?
        ADD x12, x12, x20         // tentative i5 + input_offset
        CSEL x12, x19, x12, EQ    // i5 = (i5 == zero) ? zero : i5 + input_offset
        CMP x13, x19              // i6 == zero ?
        ADD x13, x13, x20         // tentative i6 + input_offset
        CSEL x13, x19, x13, EQ    // i6 = (i6 == zero) ? zero : i6 + input_offset
        CMP x14, x19              // i7 == zero ?
        ADD x14, x14, x20         // tentative i7 + input_offset
        CSEL x14, x19, x14, EQ    // i7 = (i7 == zero) ? zero : i7 + input_offset
        CMP x15, x19              // i8 == zero ?
        ADD x15, x15, x20         // tentative i8 + input_offset
        CSEL x15, x19, x15, EQ    // i8 = (i8 == zero) ? zero : i8 + input_offset

        # input += input_stride (a byte count, added to the pointer as is)
        ADD x2, x2, x5

        # x16 := c = channels
        # c -= 4
        SUBS x16, x0, 4
        # x17 := w = weights (restarted from the beginning for every pixel)
        MOV x17, x3

        # skip main loop when c < 4 (the unsigned subtraction borrowed)
        B.LO 2f
1:
        # Main loop: 4 channels per iteration.
        # One weights group is 10 vectors (160 bytes): q0 seeds the
        # accumulator, q1-q9 are the 9 taps.
        LDP q0, q1, [x17], 32
        LDP q2, q3, [x17], 32
        LDP q4, q5, [x17], 32
        LDP q6, q7, [x17], 32
        LDP q8, q9, [x17], 32
        # q10-q18 := 4 floats from each of i0-i8, pointers advance by 16
        LDR q10, [x7], 16
        LDR q11, [x8], 16
        LDR q12, [x9], 16
        LDR q13, [x10], 16
        LDR q14, [x11], 16
        LDR q15, [x12], 16
        LDR q16, [x13], 16
        LDR q17, [x14], 16
        LDR q18, [x15], 16

        # v0 += tap[k] * input[k] for k = 0..8
        FMLA v0.4S, v1.4S, v10.4S
        FMLA v0.4S, v2.4S, v11.4S
        FMLA v0.4S, v3.4S, v12.4S
        FMLA v0.4S, v4.4S, v13.4S
        FMLA v0.4S, v5.4S, v14.4S
        FMLA v0.4S, v6.4S, v15.4S
        FMLA v0.4S, v7.4S, v16.4S
        FMLA v0.4S, v8.4S, v17.4S
        FMLA v0.4S, v9.4S, v18.4S

        # clamp: v0 = min(max(v0, v30), v31)
        FMAX v0.4S, v0.4S, v30.4S
        FMIN v0.4S, v0.4S, v31.4S

        STR q0, [x4], 16
        SUBS x16, x16, 4
        B.HS 1b                   // loop while at least 4 channels remained

2:
        # restore actual c value (0..3 channels left)
        ADD x16, x16, 4
        # skip the remainder when c == 0
        CBZ x16, 4f

        # Remainder: compute a full 4-lane result, then store only c lanes.
        # The loads below still read a whole weights group and 16 bytes
        # from every input row, that is, past the c valid channels.
        LDP q0, q1, [x17], 32
        LDP q2, q3, [x17], 32
        LDP q4, q5, [x17], 32
        LDP q6, q7, [x17], 32
        LDP q8, q9, [x17], 32
        LDR q10, [x7], 16
        LDR q11, [x8], 16
        LDR q12, [x9], 16
        LDR q13, [x10], 16
        LDR q14, [x11], 16
        LDR q15, [x12], 16
        LDR q16, [x13], 16
        LDR q17, [x14], 16
        LDR q18, [x15], 16

        FMLA v0.4S, v1.4S, v10.4S
        FMLA v0.4S, v2.4S, v11.4S
        FMLA v0.4S, v3.4S, v12.4S
        FMLA v0.4S, v4.4S, v13.4S
        FMLA v0.4S, v5.4S, v14.4S
        FMLA v0.4S, v6.4S, v15.4S
        FMLA v0.4S, v7.4S, v16.4S
        FMLA v0.4S, v8.4S, v17.4S
        FMLA v0.4S, v9.4S, v18.4S

        FMAX v0.4S, v0.4S, v30.4S
        FMIN v0.4S, v0.4S, v31.4S

        # bit 1 of c set: store lanes 0-1, then move lanes 2-3 down so
        # that the single-lane store below writes lane 2
        TBZ x16, 1, 3f

        ST1 {v0.2S}, [x4], 8
        DUP d0, v0.D[1]

3:
        # bit 0 of c set: store one more lane
        TBZ x16, 0, 4f

        ST1 {v0.S}[0], [x4], 4

4:
        # output_width -= 1
        SUBS x1, x1, 1
        # output += output_increment (ADD leaves the SUBS flags intact)
        ADD x4, x4, x6
        # process next pixel if output_width != 0
        B.NE 0b

        # Restore x19-x20,d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP  d8,  d9, [sp, 16]
        LDP x19, x20, [sp], 80
        RET

END_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma
187
// On ELF targets emit an empty .note.GNU-stack section, which tells the GNU
// linker that this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
191