// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma(
#     size_t channels,                   x0
#     size_t output_width,               x1
#     const float** input,               x2
#     const float* weights,              x3
#     float* output,                     x4
#     size_t input_stride,               x5
#     size_t output_increment,           x6
#     size_t input_offset,               (x7) -> x20
#     const float* zero,                 [sp + 80] -> x19
#     const xnn_f32_minmax_params params [sp + 88] -> (x20)
#
# The two stack offsets above are relative to sp after this function has
# pushed its 80-byte save area.
#
# Depthwise convolution microkernel: 9 taps, 4 channels per vector step, with
# the result clamped between the two values loaded from params.
#
# For every output pixel the kernel reads 9 row pointers from input, advances
# input by input_stride bytes, and walks the weights from the start again.
# Row pointers equal to zero are used as-is; every other row pointer has
# input_offset bytes added.
#
# Weights are consumed in groups of 40 floats per 4 channels: the first vector
# seeds the accumulator and the next nine vectors are the per-tap multipliers.
# NOTE(review): the seed vector is presumably the bias -- confirm against the
# weight packing code.
#
# Register usage:
#   x7-x15   i0-i8, the nine input row pointers for the current pixel
#   x16      c, remaining channel count (biased by -4 inside the main loop)
#   x17      w, weights cursor (reset to x3 for each pixel)
#   x19      zero
#   x20      params pointer on entry, then input_offset
#   v0       accumulator
#   v1-v9    tap weights
#   v10-v18  input vectors
#   v30/v31  lower/upper clamp bounds, broadcast to all lanes

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

BEGIN_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma

        # Save x19-x20,d8-d15 on stack
        # v8-v15 are overwritten below through q8-q15, so d8-d15 must be saved.
        STP x19, x20, [sp, -80]!
        STP d8, d9, [sp, 16]
        STP d10, d11, [sp, 32]
        STP d12, d13, [sp, 48]
        STP d14, d15, [sp, 64]

        # Load zero, params pointer
        # Both arrive on the stack; the offset skips the 80 bytes just pushed.
        LDP x19, x20, [sp, 80]

        # Load min/max values
        # v30 := first float at params (used as lower clamp by FMAX),
        # v31 := second float at params (used as upper clamp by FMIN).
        LD2R {v30.4s, v31.4s}, [x20]
        MOV x20, x7  // x20 := input_offset; x7 is reused for i0 below

0:
        # Per-pixel loop: fetch the nine row pointers for this output pixel.
        # x7 := i0
        # x8 := i1
        LDP x7, x8, [x2]
        # x9 := i2
        # x10 := i3
        LDP x9, x10, [x2, 16]
        # x11 := i4
        # x12 := i5
        LDP x11, x12, [x2, 32]
        # x13 := i6
        # x14 := i7
        LDP x13, x14, [x2, 48]
        # x15 := i8
        LDR x15, [x2, 64]

        # The offset is added unconditionally, then CSEL discards the sum for
        # rows that point at the zero buffer.
        CMP x7, x19  // if i0 == zero
        ADD x7, x7, x20  // i0 += input_offset
        CSEL x7, x19, x7, EQ  // i0 = zero if it was zero, else i0 + input_offset
        CMP x8, x19  // if i1 == zero
        ADD x8, x8, x20  // i1 += input_offset
        CSEL x8, x19, x8, EQ  // i1 = zero if it was zero, else i1 + input_offset
        CMP x9, x19  // if i2 == zero
        ADD x9, x9, x20  // i2 += input_offset
        CSEL x9, x19, x9, EQ  // i2 = zero if it was zero, else i2 + input_offset
        CMP x10, x19  // if i3 == zero
        ADD x10, x10, x20  // i3 += input_offset
        CSEL x10, x19, x10, EQ  // i3 = zero if it was zero, else i3 + input_offset
        CMP x11, x19  // if i4 == zero
        ADD x11, x11, x20  // i4 += input_offset
        CSEL x11, x19, x11, EQ  // i4 = zero if it was zero, else i4 + input_offset
        CMP x12, x19  // if i5 == zero
        ADD x12, x12, x20  // i5 += input_offset
        CSEL x12, x19, x12, EQ  // i5 = zero if it was zero, else i5 + input_offset
        CMP x13, x19  // if i6 == zero
        ADD x13, x13, x20  // i6 += input_offset
        CSEL x13, x19, x13, EQ  // i6 = zero if it was zero, else i6 + input_offset
        CMP x14, x19  // if i7 == zero
        ADD x14, x14, x20  // i7 += input_offset
        CSEL x14, x19, x14, EQ  // i7 = zero if it was zero, else i7 + input_offset
        CMP x15, x19  // if i8 == zero
        ADD x15, x15, x20  // i8 += input_offset
        CSEL x15, x19, x15, EQ  // i8 = zero if it was zero, else i8 + input_offset

        # input += input_stride
        ADD x2, x2, x5

        # x16 := c = channels
        # c -= 4
        SUBS x16, x0, 4
        # x17 := w = weights
        MOV x17, x3

        # skip main loop if c < 4
        B.LO 2f
1:
        # Main loop: one full group of 4 channels per iteration.
        # q0 seeds the accumulator, q1-q9 are the nine tap weights.
        LDP q0, q1, [x17], 32
        LDP q2, q3, [x17], 32
        LDP q4, q5, [x17], 32
        LDP q6, q7, [x17], 32
        LDP q8, q9, [x17], 32
        # One vector of 4 floats from each of the nine input rows.
        LDR q10, [x7], 16
        LDR q11, [x8], 16
        LDR q12, [x9], 16
        LDR q13, [x10], 16
        LDR q14, [x11], 16
        LDR q15, [x12], 16
        LDR q16, [x13], 16
        LDR q17, [x14], 16
        LDR q18, [x15], 16

        # v0 += weight[k] * input[k] for the nine taps
        FMLA v0.4S, v1.4S, v10.4S
        FMLA v0.4S, v2.4S, v11.4S
        FMLA v0.4S, v3.4S, v12.4S
        FMLA v0.4S, v4.4S, v13.4S
        FMLA v0.4S, v5.4S, v14.4S
        FMLA v0.4S, v6.4S, v15.4S
        FMLA v0.4S, v7.4S, v16.4S
        FMLA v0.4S, v8.4S, v17.4S
        FMLA v0.4S, v9.4S, v18.4S

        # Clamp to the lower bound, then to the upper bound.
        FMAX v0.4S, v0.4S, v30.4S
        FMIN v0.4S, v0.4S, v31.4S

        STR q0, [x4], 16
        SUBS x16, x16, 4
        B.HS 1b  // repeat while another full group of 4 channels remains

2:
        # restore actual c value
        ADD x16, x16, 4
        # skip processing remainder channels unless c != 0
        CBZ x16, 4f

        # Remainder of 1-3 channels: same arithmetic as the main loop on a full
        # vector; only the stores below are narrowed to c floats.
        # NOTE(review): these loads still read 16 bytes from every row and 160
        # bytes of weights regardless of c -- presumably the buffers are padded
        # so this is safe; confirm with the callers.
        LDP q0, q1, [x17], 32
        LDP q2, q3, [x17], 32
        LDP q4, q5, [x17], 32
        LDP q6, q7, [x17], 32
        LDP q8, q9, [x17], 32
        LDR q10, [x7], 16
        LDR q11, [x8], 16
        LDR q12, [x9], 16
        LDR q13, [x10], 16
        LDR q14, [x11], 16
        LDR q15, [x12], 16
        LDR q16, [x13], 16
        LDR q17, [x14], 16
        LDR q18, [x15], 16

        FMLA v0.4S, v1.4S, v10.4S
        FMLA v0.4S, v2.4S, v11.4S
        FMLA v0.4S, v3.4S, v12.4S
        FMLA v0.4S, v4.4S, v13.4S
        FMLA v0.4S, v5.4S, v14.4S
        FMLA v0.4S, v6.4S, v15.4S
        FMLA v0.4S, v7.4S, v16.4S
        FMLA v0.4S, v8.4S, v17.4S
        FMLA v0.4S, v9.4S, v18.4S

        FMAX v0.4S, v0.4S, v30.4S
        FMIN v0.4S, v0.4S, v31.4S

        TBZ x16, 1, 3f  // bit 1 of c clear: no pair of floats to store

        ST1 {v0.2S}, [x4], 8  // store lanes 0-1
        DUP d0, v0.D[1]  // move lanes 2-3 down so lane 0 holds the next result

3:
        TBZ x16, 0, 4f  // bit 0 of c clear: no single float to store

        ST1 {v0.S}[0], [x4], 4  // store lane 0

4:
        # output_width -= 1
        SUBS x1, x1, 1
        # output += output_increment
        ADD x4, x4, x6  // ADD leaves the flags from SUBS intact for B.NE
        # process next pixel if output_width != 0
        B.NE 0b

        # Restore x19-x20,d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP d8, d9, [sp, 16]
        LDP x19, x20, [sp], 80
        RET

END_FUNCTION xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif