// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointer
# x8  a0  (x8 first holds the params pointer; once min/max are loaded it is
#          reused for a0)

# C pointer
# x6  c0

# Vector register usage in this kernel
# v0, v1    A values (4 floats each)
# v16, v17  accumulators for the 8 outputs, initialized from w
# v18, v19  second accumulator set, zero initialized, added into v16, v17
#           before clamping
# v20-v27   B values loaded from w
# v30, v31  the two values loaded from params; v30 is applied with FMAX and
#           v31 with FMIN

# A53 based on a53/75 but with LD64

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        # LD2R replicates the first float into v30 and the second into v31.
        LD2R {v30.4s, v31.4s}, [x8]

0:
        # Load initial bias from w into accumulators
        LDP q16, q17, [x5], 32
        MOVI v18.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v19.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        MOV x9, x3  // p = ks

1:
        # Load next A pointer
        LDR x8, [x4], 8

        # The offset is added unconditionally and then discarded by CSEL when
        # the pointer equals zero, so the zero buffer is never offset.
        CMP x8, x12            // if a0 == zero
        ADD x8, x8, x11        // a0 += a_offset
        CSEL x8, x12, x8, EQ   //   a0 = zero, else += a0 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 5f

        # Prologue
        # Read first block of A and B.
        # Loads 8 vectors of B (q20-q27) and the first 4 floats of A (q0).
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x8], 16

        # Is there at least another 8 floats (32 bytes)?  yes do main loop
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
        # Each FMLA is paired with a load for the following block; a B register
        # is only reloaded after the FMLA that reads it.
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16
        LDR q27, [x5], 16

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x8], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v1.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v1.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v1.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v1.s[3]
        LDR q26, [x5], 16
        SUBS x0, x0, 32
        LDR q27, [x5], 16
        B.HS 2b

3:
        # Epilogue
        # Same as one main loop iteration, but without loading A and B for a
        # following iteration.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x8], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        LDR q21, [x5], 16
        FMLA v19.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v16.4s, v24.4s, v0.s[2]
        LDR q23, [x5], 16
        FMLA v17.4s, v25.4s, v0.s[2]
        LDR q24, [x5], 16
        FMLA v18.4s, v26.4s, v0.s[3]
        LDR q25, [x5], 16
        FMLA v19.4s, v27.4s, v0.s[3]
        LDR q26, [x5], 16

        # Second block of 4.  Only the last B load (q27) for this block
        # remains; nothing is loaded for a following block.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q27, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        TST x0, 31  // low 5 bits of k = leftover bytes of A (kc mod 32)
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 8  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Fold the second accumulator set into the first.
        FADD v16.4s, v16.4s, v18.4s
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX v16.4s, v16.4s, v30.4s
        FMAX v17.4s, v17.4s, v30.4s
        FMIN v16.4s, v16.4s, v31.4s
        FMIN v17.4s, v17.4s, v31.4s

        # Store full 1 x 8
        SUBS x1, x1, 8
        B.LO 8f

        ST1 {v16.16b, v17.16b}, [x6], x10
        # Rewind the indirection pointer by the ks bytes consumed in the ks
        # loop so the next group of 8 outputs reads the same A pointers.
        SUB x4, x4, x3  // a -= ks

        # nc loop
        # Flags are still those of SUBS x1 above; ST1 and SUB do not set flags.
        B.HI 0b

        RET

5:
        # Remainder handling.  Only bits 4, 3 and 2 of k are tested; they equal
        # the same bits of kc because k differs from kc by a multiple of 32.

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q0, [x8], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        LDR q24, [x5], 16
        LDR q25, [x5], 16
        LDR q26, [x5], 16
        LDR q27, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

6:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 7f
        # Remainder- 2 floats of A (8 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR d0, [x8], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 4b
        # Remainder- 1 float of A (4 bytes)
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR s0, [x8], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

8:
        # Store odd channels
        # x1 is nc - 8 here; its low 3 bits equal those of the remaining nc.
        # Store 4, then 2, then 1 floats according to bits 2, 1 and 0, moving
        # the not yet stored lanes down into the low part of v16 each time.
        TBZ x1, 2, 9f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

9:
        TBZ x1, 1, 10f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

10:
        TBZ x1, 0, 11f
        STR s16, [x6], 4
11:
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif