// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// -----------------------------------------------------------------------------
// void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75(
//     size_t mr,                           x0
//     size_t nc,                           x1
//     size_t kc,                           x2 / x0
//     size_t ks,                           x3 / x9
//     const float**restrict a,             x4
//     const void*restrict w,               x5
//     uint8_t*restrict c,                  x6
//     size_t cm_stride,                    x7
//     size_t cn_stride,                    [sp]      -> (x0)
//     size_t a_offset,                     [sp + 8]  -> x11
//     const float* zero,                   [sp + 16] -> x12
//     const xnn_f32_minmax_params params   [sp + 24] -> x8
//
// Target: AArch64, GNU assembler syntax, run through the C preprocessor.
//
// Computes a tile of up to 6 rows by 8 columns of f32 outputs. The
// accumulators start from 8 floats read from w, are updated with FMLA over
// the rows addressed through the indirection array a, are clamped, and are
// then stored to c.
//
// Units as used by the code below:
//   kc  bytes of A per row (consumed 32 / 16 / 8 / 4 bytes at a time)
//   ks  bytes of indirection pointers per output tile (48 per step, which
//       is 6 pointers of 8 bytes)
//   nc  output columns in floats (8 per full tile)
//
// x0 is recycled: mr on entry, then the remaining-k counter, then cn_stride.
//
// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// This kernel saves and restores d8-d15 and x20-x23 in a 96 byte frame:
//   [sp +  0] d8,  d9       [sp + 48] d14, d15
//   [sp + 16] d10, d11      [sp + 64] x20, x21
//   [sp + 32] d12, d13      [sp + 80] x22, x23
// After the frame is pushed the stack arguments sit at sp + 96 and above.
//
// A pointers
//   x14 a0    x15 a1    x20 a2    x21 a3    x22 a4    x23 a5
//
// C pointers
//   x6  c0    x16 c1    x17 c2    x10 c3    x13 c4    x7  c5
//
// Vector register usage
//   A0     v0   v6
//   A1     v1   v7
//   A2     v2   v8
//   A3     v3   v9
//   A4     v4   v10
//   A5     v5   v11
//   B      v12  v13  v14  v15
//   B      v16  v17  v18  v19
//   C row0 v20  v21
//   C row1 v22  v23
//   C row2 v24  v25
//   C row3 v26  v27
//   C row4 v28  v29
//   C row5 v30  v31
//   Clamp  v6 (lower bound, used by FMAX)   v7 (upper bound, used by FMIN)
//
// v6 and v7 double as A registers inside the main loop and epilogue, so the
// clamp values are (re)loaded with LD2R after the last use of v6/v7 as A.
// -----------------------------------------------------------------------------

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

        // Push the 96 byte frame and save callee-saved registers, interleaved
        // with deriving the row pointers c1..c5. A row index that is not below
        // mr aliases the previous row pointer.
        STP     d8, d9, [sp, -96]!
        CMP     x0, 2                   // flags for mr vs 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        // mr < 2  -> c1 = c0

        STP     d10, d11, [sp, 16]
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // (flags still from mr vs 2)
        CSEL    x17, x16, x17, LS       // mr <= 2 -> c2 = c1

        STP     d12, d13, [sp, 32]
        CMP     x0, 4                   // flags for mr vs 4
        ADD     x10, x17, x7            // c3 = c2 + cm_stride
        CSEL    x10, x17, x10, LO       // mr < 4  -> c3 = c2

        STP     d14, d15, [sp, 48]
        ADD     x13, x10, x7            // c4 = c3 + cm_stride
                                        // (flags still from mr vs 4)
        CSEL    x13, x10, x13, LS       // mr <= 4 -> c4 = c3

        // Save x20-x23, which hold a2..a5.
        STP     x20, x21, [sp, 64]
        STP     x22, x23, [sp, 80]

        CMP     x0, 6                   // flags for mr vs 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x7, x13, x7, LO         // mr < 6  -> c5 = c4

        // Stack arguments, addressed past the 96 byte frame.
        LDR     x11, [sp, 104]          // x11 = a_offset

        LDP     x12, x8, [sp, 112]      // x12 = zero, x8 = params

        // ---------------------------------------------------------------------
        // nc loop: one pass per tile of 8 output columns.
        // ---------------------------------------------------------------------
0:
        // Seed all six accumulator rows with the 8 floats at the head of w.
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 0]      // prefetch upcoming B
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 192]
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        MOV     x9, x3                  // p = ks

        // ---------------------------------------------------------------------
        // ks loop: one pass per group of 6 indirection pointers.
        // ---------------------------------------------------------------------
1:
        // Fetch the next six row pointers from the indirection array.
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        // Pointers equal to zero are used unchanged; every other pointer is
        // advanced by a_offset.
        CMP     x14, x12                // a0 == zero ?
        ADD     x14, x14, x11           // tentative a0 + a_offset
        CSEL    x14, x12, x14, EQ       // a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP     x15, x12                // a1 == zero ?
        ADD     x15, x15, x11           // tentative a1 + a_offset
        CSEL    x15, x12, x15, EQ       // a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP     x20, x12                // a2 == zero ?
        ADD     x20, x20, x11           // tentative a2 + a_offset
        CSEL    x20, x12, x20, EQ       // a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP     x21, x12                // a3 == zero ?
        ADD     x21, x21, x11           // tentative a3 + a_offset
        CSEL    x21, x12, x21, EQ       // a3 = (a3 == zero) ? zero : a3 + a_offset
        CMP     x22, x12                // a4 == zero ?
        ADD     x22, x22, x11           // tentative a4 + a_offset
        CSEL    x22, x12, x22, EQ       // a4 = (a4 == zero) ? zero : a4 + a_offset
        CMP     x23, x12                // a5 == zero ?
        ADD     x23, x23, x11           // tentative a5 + a_offset
        CSEL    x23, x12, x23, EQ       // a5 = (a5 == zero) ? zero : a5 + a_offset

        // Fewer than 8 floats (32 bytes) of k? Then only the remainder code
        // runs. Subtracting multiples of 32 leaves bits 4..2 of x0 equal to
        // those of kc, which the remainder tests rely on.
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    5f

        // Prologue: first 4 floats of every A row plus three of the four B
        // register pairs; the fourth pair is loaded inside the FMLA block.
        LDR     q0, [x14], 16
        LDR     q1, [x15], 16
        LDR     q2, [x20], 16
        LDR     q3, [x21], 16
        LDR     q4, [x22], 16
        LDR     q5, [x23], 16
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32

        // Fewer than 8 further floats? Then skip straight to the epilogue.
        SUBS    x0, x0, 32
        B.LO    3f

        // ---------------------------------------------------------------------
        // Main loop: 8 floats (32 bytes) of each A row per iteration.
        // 96 FMLA + 12 LDR of A + 8 LDP of B, with the loads for the next
        // step interleaved between the FMLAs of the current one.
        // ---------------------------------------------------------------------
2:
        // First half: A in v0-v5. 48 FMLA.
        FMLA    v20.4s, v12.4s, v0.s[0]
        LDP     q18, q19, [x5], 32      // fourth B pair for this half
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]

        FMLA    v31.4s, v13.4s, v5.s[0]
        FMLA    v20.4s, v14.4s, v0.s[1]
        PRFM    PLDL1KEEP, [x5, 128]    // prefetch B
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        LDR     q6, [x14], 16           // A for the second half -> v6-v11
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]
        LDR     q7, [x15], 16

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        FMLA    v24.4s, v16.4s, v2.s[2]
        LDR     q8, [x20], 16
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        LDR     q9, [x21], 16
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        LDR     q10, [x22], 16
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]
        LDR     q11, [x23], 16

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        LDP     q12, q13, [x5], 32      // all four B pairs for the second half
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]
        LDP     q18, q19, [x5], 32

        // Second half: A in v6-v11. 48 FMLA.
        FMLA    v20.4s, v12.4s, v6.s[0]
        FMLA    v22.4s, v12.4s, v7.s[0]
        FMLA    v24.4s, v12.4s, v8.s[0]
        LDR     q0, [x14], 16           // A for the next first half -> v0-v5
        FMLA    v26.4s, v12.4s, v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        LDR     q1, [x15], 16
        FMLA    v21.4s, v13.4s, v6.s[0]
        FMLA    v23.4s, v13.4s, v7.s[0]
        FMLA    v25.4s, v13.4s, v8.s[0]
        LDR     q2, [x20], 16
        FMLA    v27.4s, v13.4s, v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]
        LDR     q3, [x21], 16

        FMLA    v20.4s, v14.4s, v6.s[1]
        FMLA    v22.4s, v14.4s, v7.s[1]
        FMLA    v24.4s, v14.4s, v8.s[1]
        LDR     q4, [x22], 16
        FMLA    v26.4s, v14.4s, v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        LDR     q5, [x23], 16
        FMLA    v21.4s, v15.4s, v6.s[1]
        FMLA    v23.4s, v15.4s, v7.s[1]
        FMLA    v25.4s, v15.4s, v8.s[1]
        LDP     q12, q13, [x5], 32      // first three B pairs for the next step
        FMLA    v27.4s, v15.4s, v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]
        LDP     q14, q15, [x5], 32

        FMLA    v20.4s, v16.4s, v6.s[2]
        FMLA    v22.4s, v16.4s, v7.s[2]
        FMLA    v24.4s, v16.4s, v8.s[2]
        FMLA    v26.4s, v16.4s, v9.s[2]
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v21.4s, v17.4s, v6.s[2]
        FMLA    v23.4s, v17.4s, v7.s[2]
        FMLA    v25.4s, v17.4s, v8.s[2]
        FMLA    v27.4s, v17.4s, v9.s[2]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v31.4s, v17.4s, v11.s[2]
        LDP     q16, q17, [x5], 32

        FMLA    v20.4s, v18.4s, v6.s[3]
        FMLA    v22.4s, v18.4s, v7.s[3]
        SUBS    x0, x0, 32              // k -= 32; flags consumed by B.HS below
        FMLA    v24.4s, v18.4s, v8.s[3]
        FMLA    v26.4s, v18.4s, v9.s[3]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v30.4s, v18.4s, v11.s[3]
        FMLA    v21.4s, v19.4s, v6.s[3]
        FMLA    v23.4s, v19.4s, v7.s[3]
        FMLA    v25.4s, v19.4s, v8.s[3]
        FMLA    v27.4s, v19.4s, v9.s[3]
        FMLA    v29.4s, v19.4s, v10.s[3]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.HS    2b

        // ---------------------------------------------------------------------
        // Epilogue: the final full 8 floats (32 bytes) of each A row.
        // 96 FMLA + 6 LDR of A + 5 LDP of B. The first half matches the main
        // loop; the second half issues no loads for a following iteration.
        // ---------------------------------------------------------------------
3:
        // First half: A in v0-v5. 48 FMLA.
        FMLA    v20.4s, v12.4s, v0.s[0]
        LDP     q18, q19, [x5], 32      // fourth B pair for this half
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]

        FMLA    v31.4s, v13.4s, v5.s[0]
        FMLA    v20.4s, v14.4s, v0.s[1]
        PRFM    PLDL1KEEP, [x5, 128]    // prefetch B
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        LDR     q6, [x14], 16           // A for the second half -> v6-v11
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]
        LDR     q7, [x15], 16

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        FMLA    v24.4s, v16.4s, v2.s[2]
        LDR     q8, [x20], 16
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        LDR     q9, [x21], 16
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        LDR     q10, [x22], 16
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]
        LDR     q11, [x23], 16

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        LDP     q12, q13, [x5], 32      // all four B pairs for the second half
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]
        LDP     q18, q19, [x5], 32

        // Second half: A in v6-v11. 48 FMLA, no loads of A or B.
        FMLA    v20.4s, v12.4s, v6.s[0]
        FMLA    v22.4s, v12.4s, v7.s[0]
        FMLA    v24.4s, v12.4s, v8.s[0]
        FMLA    v26.4s, v12.4s, v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        FMLA    v21.4s, v13.4s, v6.s[0]
        FMLA    v23.4s, v13.4s, v7.s[0]
        FMLA    v25.4s, v13.4s, v8.s[0]
        FMLA    v27.4s, v13.4s, v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]

        FMLA    v20.4s, v14.4s, v6.s[1]
        FMLA    v22.4s, v14.4s, v7.s[1]
        FMLA    v24.4s, v14.4s, v8.s[1]
        FMLA    v26.4s, v14.4s, v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        FMLA    v21.4s, v15.4s, v6.s[1]
        FMLA    v23.4s, v15.4s, v7.s[1]
        FMLA    v25.4s, v15.4s, v8.s[1]
        FMLA    v27.4s, v15.4s, v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]

        FMLA    v20.4s, v16.4s, v6.s[2]
        FMLA    v22.4s, v16.4s, v7.s[2]
        FMLA    v24.4s, v16.4s, v8.s[2]
        FMLA    v26.4s, v16.4s, v9.s[2]
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v21.4s, v17.4s, v6.s[2]
        FMLA    v23.4s, v17.4s, v7.s[2]
        FMLA    v25.4s, v17.4s, v8.s[2]
        FMLA    v27.4s, v17.4s, v9.s[2]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v31.4s, v17.4s, v11.s[2]

        FMLA    v20.4s, v18.4s, v6.s[3]
        FMLA    v22.4s, v18.4s, v7.s[3]
        FMLA    v24.4s, v18.4s, v8.s[3]
        FMLA    v26.4s, v18.4s, v9.s[3]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v30.4s, v18.4s, v11.s[3]
        FMLA    v21.4s, v19.4s, v6.s[3]
        FMLA    v23.4s, v19.4s, v7.s[3]

        // v6/v7 are finished as A0/A1 at this point, so they can take the
        // two clamp bounds, each replicated across all four lanes.
        LD2R    {v6.4s, v7.4s}, [x8]

        FMLA    v25.4s, v19.4s, v8.s[3]
        FMLA    v27.4s, v19.4s, v9.s[3]
        // Any k left over (4 floats / 16 bytes or fewer)?
        TST     x0, 31
        FMLA    v29.4s, v19.4s, v10.s[3]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.NE    5f

4:
        // ks loop back-edge: one group of 6 pointers has been consumed.
        SUBS    x9, x9, 48              // p -= MR * sizeof(void*)
        B.HI    1b

        // Clamp all accumulators: FMAX against v6, then FMIN against v7.
        FMAX    v20.4s, v20.4s, v6.4s
        // x0 is free again; it now carries cn_stride.
        LDR     x0, [sp, 96]
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        FMAX    v28.4s, v28.4s, v6.4s
        FMAX    v29.4s, v29.4s, v6.4s
        FMAX    v30.4s, v30.4s, v6.4s
        FMAX    v31.4s, v31.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8; flags used by B.LO and B.HI
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s
        FMIN    v28.4s, v28.4s, v7.4s
        FMIN    v29.4s, v29.4s, v7.4s
        FMIN    v30.4s, v30.4s, v7.4s
        FMIN    v31.4s, v31.4s, v7.4s

        // Fewer than 8 columns were left: take the partial-width store path.
        B.LO    8f

        // Full 6 x 8 store, rows 5 down to 0; each row pointer then advances
        // by cn_stride. None of the instructions below modify the flags.
        STP     q30, q31, [x7]
        ADD     x7, x7, x0
        STP     q28, q29, [x13]
        ADD     x13, x13, x0
        STP     q26, q27, [x10]
        ADD     x10, x10, x0
        STP     q24, q25, [x17]
        ADD     x17, x17, x0
        STP     q22, q23, [x16]
        ADD     x16, x16, x0
        STP     q20, q21, [x6]
        ADD     x6, x6, x0

        SUB     x4, x4, x3              // a -= ks (rewind the indirection array)

        // nc loop back-edge, using the flags from SUBS x1 above.
        B.HI    0b

        // Restore x20-x23.
        LDP     x22, x23, [sp, 80]
        LDP     x20, x21, [sp, 64]

        // Restore d8-d15 and pop the frame.
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 96
        RET

        // ---------------------------------------------------------------------
        // Remainder of k: 4, 2 and 1 floats, selected by bits 4, 3 and 2 of
        // x0. Entered directly when kc is below 32 bytes, or after the
        // epilogue when k is not a multiple of 32 bytes.
        // ---------------------------------------------------------------------
5:
        // Clamp bounds; needed here because the direct-entry path has not
        // loaded them yet.
        LD2R    {v6.4s, v7.4s}, [x8]

        // 4 floats (16 bytes) pending?
        TBZ     x0, 4, 6f

        // Remainder: 4 floats of A (16 bytes) per row.
        LDR     q0, [x14], 16
        LDR     q1, [x15], 16
        LDR     q2, [x20], 16
        LDR     q3, [x21], 16
        LDR     q4, [x22], 16
        LDR     q5, [x23], 16
        // Matching B: 4 k-steps of 8 floats.
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32
        LDP     q18, q19, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        FMLA    v24.4s, v16.4s, v2.s[2]
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]

        // 2 floats (8 bytes) pending?
6:
        TBZ     x0, 3, 7f

        // Remainder: 2 floats of A (8 bytes) per row.
        LDR     d0, [x14], 8
        LDR     d1, [x15], 8
        LDR     d2, [x20], 8
        LDR     d3, [x21], 8
        LDR     d4, [x22], 8
        LDR     d5, [x23], 8
        // Matching B: 2 k-steps of 8 floats.
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        // 1 float (4 bytes) pending? If not, rejoin the ks loop.
7:
        TBZ     x0, 2, 4b

        // Remainder: 1 float of A (4 bytes) per row.
        LDR     s0, [x14], 4
        LDR     s1, [x15], 4
        LDR     s2, [x20], 4
        LDR     s3, [x21], 4
        LDR     s4, [x22], 4
        LDR     s5, [x23], 4
        // Matching B: 1 k-step of 8 floats.
        LDP     q12, q13, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]
        B       4b

        // ---------------------------------------------------------------------
        // Partial-width store for the last tile (fewer than 8 columns).
        // x1 holds nc - 8 here, so bits 2..0 still equal those of the
        // remaining column count. After each partial store the not-yet-written
        // lanes are moved down into the low part of the even register.
        // ---------------------------------------------------------------------
8:
        TBZ     x1, 2, 9f               // 4 columns?
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x10], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
9:
        TBZ     x1, 1, 10f              // 2 columns?
        STR     d30, [x7], 8
        DUP     d30, v30.d[1]
        STR     d28, [x13], 8
        DUP     d28, v28.d[1]
        STR     d26, [x10], 8
        DUP     d26, v26.d[1]
        STR     d24, [x17], 8
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        DUP     d22, v22.d[1]
        STR     d20, [x6], 8
        DUP     d20, v20.d[1]

10:
        TBZ     x1, 0, 11f              // 1 column?
        STR     s30, [x7]
        STR     s28, [x13]
        STR     s26, [x10]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20, [x6]
11:
        // Restore x20-x23.
        LDP     x22, x23, [sp, 80]
        LDP     x20, x21, [sp, 64]

        // Restore d8-d15 and pop the frame.
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 96
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif