// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Clamp v2 v3

# A53 based on A57/A75 but with LD64

# Overview (derived from the instructions below):
#   Each pass of the outer loop (label 0) handles up to 12 output floats.
#   It loads 12 starting accumulators from acc, adds the products of the
#   single A row with weights read sequentially from w, clamps the result
#   with v2 (lower bound) and v3 (upper bound) and stores it to c.
#   kc is measured in bytes of A.  x0 is a working copy of kc that is
#   decremented by 32 bytes (8 floats) per main-loop step.
#   Two accumulator sets are kept: v16-v18 take A lanes 0 and 2 and v5-v7
#   take A lanes 1 and 3 in the unrolled code.  They are summed at label 4.
#
# Local labels:
#   0  start of one 12-column pass        5  remainder of 4 floats of A
#   1  main loop                          6  remainder of 2 floats of A
#   2  epilogue of the pipelined loop     7  test for a 1 float remainder
#   3  remainder dispatch                 8  remainder of 1 float of A
#   4  merge, clamp and store             9-13  partial store of 1-11 floats

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, acc
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Load min/max values
        # LD2R reads two consecutive floats and replicates the first across
        # v2 and the second across v3.
        LD2R {v2.4s, v3.4s}, [x8]
0:
        # Load initial accumulators
        # 12 floats (48 bytes) from acc.  x15 is post-incremented so the next
        # pass reads the following 12.
        LD1 {v16.16b, v17.16b, v18.16b}, [x15], 48

        MOVI v5.4s, 0  // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v6.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        MOVI v7.4s, 0
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32

        # Fewer than 32 bytes of A: skip the pipelined code and go straight
        # to the remainder handling.
        B.LO 3f

        # 16 prologue
        # Read first block of 1 A and B.
        # 12 q registers of weights (192 bytes) and 4 floats of A.
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDP q28, q29, [x5], 32
        LDP q30, q31, [x5], 32
        LDR q0, [x3], 16

        # Are there at least 32 more bytes of A?  Yes: run the main loop.
        # No: go directly to the epilogue.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Each FMLA is paired with a load that refills a register whose
        # value has already been consumed by an earlier FMLA.
1:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[0]
        LDR q21, [x5], 16
        FMLA v5.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v6.4s, v24.4s, v0.s[1]
        LDR q23, [x5], 16
        FMLA v7.4s, v25.4s, v0.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v0.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v0.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v0.s[2]
        LDR q27, [x5], 16
        FMLA v5.4s, v29.4s, v0.s[3]
        LDR q28, [x5], 16
        FMLA v6.4s, v30.4s, v0.s[3]
        LDR q29, [x5], 16
        FMLA v7.4s, v31.4s, v0.s[3]
        LDR q30, [x5], 16
        LDR q31, [x5], 16

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v1.s[0]
        LDR q21, [x5], 16
        FMLA v5.4s, v23.4s, v1.s[1]
        LDR q22, [x5], 16
        FMLA v6.4s, v24.4s, v1.s[1]
        LDR q23, [x5], 16
        FMLA v7.4s, v25.4s, v1.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v1.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v1.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v1.s[2]
        LDR q27, [x5], 16
        FMLA v5.4s, v29.4s, v1.s[3]
        LDR q28, [x5], 16
        FMLA v6.4s, v30.4s, v1.s[3]
        LDR q29, [x5], 16
        FMLA v7.4s, v31.4s, v1.s[3]
        LDR q30, [x5], 16
        # Count down 32 bytes of A.  The flags set here are consumed by the
        # B.HS below; the LDR in between does not change them.
        SUBS x0, x0, 32
        LDR q31, [x5], 16
        B.HS 1b

2:
        # Epilogue
        # Same arithmetic as one main loop iteration, but nothing is
        # preloaded for a following iteration.

        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDR q20, [x5], 16
        FMLA v18.4s, v22.4s, v0.s[0]
        LDR q21, [x5], 16
        FMLA v5.4s, v23.4s, v0.s[1]
        LDR q22, [x5], 16
        FMLA v6.4s, v24.4s, v0.s[1]
        LDR q23, [x5], 16
        FMLA v7.4s, v25.4s, v0.s[1]
        LDR q24, [x5], 16
        FMLA v16.4s, v26.4s, v0.s[2]
        LDR q25, [x5], 16
        FMLA v17.4s, v27.4s, v0.s[2]
        LDR q26, [x5], 16
        FMLA v18.4s, v28.4s, v0.s[2]
        LDR q27, [x5], 16
        FMLA v5.4s, v29.4s, v0.s[3]
        LDR q28, [x5], 16
        FMLA v6.4s, v30.4s, v0.s[3]
        LDR q29, [x5], 16
        FMLA v7.4s, v31.4s, v0.s[3]
        LDR q30, [x5], 16

        # Second block of 4.  FMA for second 4.  The only load left is the
        # final q31 of this block; nothing is fetched for a next block.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q31, [x5], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[0]
        FMLA v5.4s, v23.4s, v1.s[1]
        FMLA v6.4s, v24.4s, v1.s[1]
        FMLA v7.4s, v25.4s, v1.s[1]
        FMLA v16.4s, v26.4s, v1.s[2]
        FMLA v17.4s, v27.4s, v1.s[2]
        FMLA v18.4s, v28.4s, v1.s[2]
        FMLA v5.4s, v29.4s, v1.s[3]
        FMLA v6.4s, v30.4s, v1.s[3]
        FMLA v7.4s, v31.4s, v1.s[3]

3:
        # x0 has only ever been reduced by multiples of 32, so its bits 4, 3
        # and 2 still equal those of kc and select the 16, 8 and 4 byte
        # remainders.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Merge the second accumulator set into the first.
        FADD v16.4s, v16.4s, v5.4s
        FADD v17.4s, v17.4s, v6.4s
        FADD v18.4s, v18.4s, v7.4s
        # nc -= 12.  The flags from this SUBS drive both B.LO and B.HI
        # below; none of the instructions in between modify them.
        SUBS x1, x1, 12

        # Clamp
        FMAX v16.4s, v16.4s, v2.4s
        FMAX v17.4s, v17.4s, v2.4s
        FMAX v18.4s, v18.4s, v2.4s
        FMIN v16.4s, v16.4s, v3.4s
        FMIN v17.4s, v17.4s, v3.4s
        FMIN v18.4s, v18.4s, v3.4s

        # Fewer than 12 columns were left: take the partial store path.
        # Otherwise store full 1 x 12.
        B.LO 9f

        # Store 48 bytes and advance c0 by cn_stride.
        ST1 {v16.16b, v17.16b, v18.16b}, [x6], x14
        SUB x3, x3, x2  // a0 -= kc

        # More columns remain (nc was above 12): start the next pass.
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        # Only v16-v18 are accumulated into on the remainder paths.
        LDR q0, [x3], 16
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[1]
        FMLA v17.4s, v21.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v0.s[1]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[2]
        FMLA v17.4s, v21.4s, v0.s[2]
        FMLA v18.4s, v22.4s, v0.s[2]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[3]
        FMLA v17.4s, v21.4s, v0.s[3]
        FMLA v18.4s, v22.4s, v0.s[3]

        # Skip the 2 float step when bit 3 of x0 is clear.
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]

        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[1]
        FMLA v17.4s, v21.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v0.s[1]
7:
        # No single float left: go back to merge, clamp and store.
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDR s0, [x3], 4
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v22.4s, v0.s[0]
        B 4b

        # Store odd channels
        # Reached only when fewer than 12 columns were left.  The count is
        # restored and its bits 3..0 select stores of 8, 4, 2 and 1 floats.
        # After each store the next unstored data is moved into v16.
9:
        ADD x1, x1, 12
        TBZ x1, 3, 10f
        STP q16, q17, [x6], 32
        MOV v16.16b, v18.16b

10:
        # Bits 3 and 2 cannot both be set because the count is below 12, so
        # when this store runs v16 and v17 still hold their original values.
        TBZ x1, 2, 11f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

11:
        TBZ x1, 1, 12f
        STR d16, [x6], 8
        # Move the upper 64 bits of v16 down for the final 1 float store.
        DUP d16, v16.d[1]

12:
        TBZ x1, 0, 13f
        STR s16, [x6]
13:
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif