// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# Overview (derived from the code below):
#   Each pass of the nc loop (label 0) produces one tile of 6 rows x 8 floats
#   of C.  The accumulators start from the 8 floats read at w, then every pass
#   of the ks loop (label 1) reads 6 A row pointers from a and accumulates
#   kc bytes of each A row against the B data that follows in w.
#   The tile is then clamped with the two values read from params and stored.
#   Row pointers c1..c5 are clamped to the previous row when mr is smaller
#   than 6; rows are stored from c5 down to c0, so the lowest aliased row is
#   the one written last.
#
# Register reuse:
#   x0 holds mr only during the C pointer setup, then the remaining-k byte
#   counter inside the ks loop, then cn_stride during the store.
#   v6 and v7 hold A data in the main loop and epilogue and are reloaded with
#   the clamp values before the clamp runs.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x10 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57

        # Clamp C pointers / Save d8-d15 on stack
        # The frame is 96 bytes: d8-d15 at [sp, 0..63], x20-x23 at [sp, 64..95].
        # Stack arguments of the caller therefore start at [sp, 96].
        # Register saves are interleaved with the C pointer setup.
        STP d8, d9, [sp, -96]!
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x10, x17, x7         // c3 = c2 + cm_stride
        CSEL x10, x17, x10, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x13, x10, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x13, x10, x13, LS   //   c4 = c3

        # Save x20,x21,x22,x23 on stack
        STP x20, x21, [sp, 64]
        STP x22, x23, [sp, 80]

        CMP x0, 6                // if mr < 6
        ADD x7, x13, x7          // c5 = c4 + cm_stride (x7 no longer holds cm_stride)
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load a_offset
        LDR x11, [sp, 104]

        # Load zero, params pointer
        LDP x12, x8, [sp, 112]

        # nc loop entry: one pass per 8 output columns
0:
        # Load initial bias from w into accumulators
        # The same 8 floats seed all 6 rows.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        MOV x9, x3               // p = ks

        # ks loop entry: one pass per group of 6 A pointers
1:
        # Load next 6 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        # A pointer equal to zero is used as is; any other pointer gets
        # a_offset added.
        CMP x14, x12             // if a0 == zero
        ADD x14, x14, x11        // a0 += a_offset
        CSEL x14, x12, x14, EQ   //   a0 = zero, else a0 + a_offset
        CMP x15, x12             // if a1 == zero
        ADD x15, x15, x11        // a1 += a_offset
        CSEL x15, x12, x15, EQ   //   a1 = zero, else a1 + a_offset
        CMP x20, x12             // if a2 == zero
        ADD x20, x20, x11        // a2 += a_offset
        CSEL x20, x12, x20, EQ   //   a2 = zero, else a2 + a_offset
        CMP x21, x12             // if a3 == zero
        ADD x21, x21, x11        // a3 += a_offset
        CSEL x21, x12, x21, EQ   //   a3 = zero, else a3 + a_offset
        CMP x22, x12             // if a4 == zero
        ADD x22, x22, x11        // a4 += a_offset
        CSEL x22, x12, x22, EQ   //   a4 = zero, else a4 + a_offset
        CMP x23, x12             // if a5 == zero
        ADD x23, x23, x11        // a5 += a_offset
        CSEL x23, x12, x23, EQ   //   a5 = zero, else a5 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # x0 is recomputed from kc on every pass of the ks loop.
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 5f                  // kc < 32 bytes: remainder code only

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x14], 16
        LDR q1, [x15], 16
        LDR q2, [x20], 16
        LDR q3, [x21], 16
        LDR q4, [x22], 16
        LDR q5, [x23], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 3f                  // fewer than 64 bytes in total: epilogue only

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
        # Each group consumes 4 floats per A row; the loads for the following
        # group are interleaved with the FMLA of the current one.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x14], 16        // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x15], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x20], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x21], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x22], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x23], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x14], 16        // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x15], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x20], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x21], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x22], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x23], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32          // k -= 32; FMLA leaves the flags intact for B.HS
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 2b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x14], 16        // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x15], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x20], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x21], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x22], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x23], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load min/max values
        # v6 and v7 were last read as A data by the two FMLA above; the four
        # FMLA that remain only read v8-v11.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # x0 only ever changed by multiples of 32, so its low 5 bits are
        # those of kc.
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 48          // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        # v6 is applied with FMAX (lower bound), v7 with FMIN (upper bound).
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR x0, [sp, 96]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags feed both B.LO 8f and B.HI 0b
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 8f                  // fewer than 8 columns were left: partial store

        # Rows go out from c5 down to c0; every C pointer then advances by
        # cn_stride.
        STP q30, q31, [x7]
        ADD x7, x7, x0
        STP q28, q29, [x13]
        ADD x13, x13, x0
        STP q26, q27, [x10]
        ADD x10, x10, x0
        STP q24, q25, [x17]
        ADD x17, x17, x0
        STP q22, q23, [x16]
        ADD x16, x16, x0
        STP q20, q21, [x6]
        ADD x6, x6, x0

        SUB x4, x4, x3           // a -= ks

        # nc loop
        # The STP, ADD and SUB above do not write the flags, so this still
        # tests the result of SUBS x1, x1, 8.
        B.HI 0b

        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 96
        RET

        # Remainder handling.  Reached when kc is below 32 bytes, or from the
        # epilogue when kc is not a multiple of 32 bytes.
5:
        # Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x14], 16
        LDR q1, [x15], 16
        LDR q2, [x20], 16
        LDR q3, [x21], 16
        LDR q4, [x22], 16
        LDR q5, [x23], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x14], 8
        LDR d1, [x15], 8
        LDR d2, [x20], 8
        LDR d3, [x21], 8
        LDR d4, [x22], 8
        LDR d5, [x23], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ x0, 2, 4b            // nothing left: back to the ks loop test

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x14], 4
        LDR s1, [x15], 4
        LDR s2, [x20], 4
        LDR s3, [x21], 4
        LDR s4, [x22], 4
        LDR s5, [x23], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 4b

        # Store odd width
        # x1 had 8 subtracted, so its low 3 bits still give the number of
        # columns left: bit 2 selects 4 floats, bit 1 selects 2, bit 0 selects 1.
        # After each partial store the values not yet written are moved down
        # into the low part of the register.
8:
        TBZ x1, 2, 9f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x10], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x10]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
11:
        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 96
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif