// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_5x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# Summary (derived from the code below):
#   For every group of 8 output columns the 5x8 accumulator tile v20-v29 is
#   initialised from acc (x15, post-incremented), updated with FMLA of the
#   B data read from w (x5, post-incremented) against 5 rows of A, clamped
#   with FMAX against v30 and FMIN against v31 (both broadcast from params),
#   and stored through the row pointers c0-c4.
#   kc is consumed in bytes: 16 bytes of every A row per 4 floats.
#   x0 holds mr only while the row pointers are set up; afterwards it is
#   reused as the k counter (the "x2 / x0" entry above).
#   Row pointers for rows at or beyond mr alias the previous row.

# unused compared to 5x8
#  x4 a5
#  x7 c5
# A5  v10 v11
# C   v30 v31
# NOTE(review): the list above names a sixth row (a5 / c5), so it presumably
# means "compared to the 6x8 kernel" - confirm against the template.
# In this file x4 is a_stride, x7 becomes c4, v10/v11 are never referenced and
# v30/v31 hold the clamp values.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_5x8__aarch64_neonfma_cortex_a57

        # Load cn_stride, acc
        # (read before sp is moved by the pre-indexed STP below)
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Clamp A and C pointers / Save d8, d9, d12-d15 on stack
        # (48-byte frame; d10/d11 are not touched by this kernel)
        STP d8, d9, [sp, -48]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x13, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x13, x17, x13, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x7, x13, x7          // c4 = c3 + cm_stride (x7 no longer holds cm_stride)
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x7, x13, x7, LS     //   c4 = c3

        # Load clamp values
        # LD2R broadcasts the first 32-bit value at x8 into v30 (used by FMAX)
        # and the second into v31 (used by FMIN).
        LD2R {v30.4s, v31.4s}, [x8]

        # Outer loop over nc, one 5x8 tile per iteration
0:
        # Load initial accumulators
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f                  // kc < 32 bytes: remainder code only

        # Prologue - loads for main loop of 80 FMA
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDP A + 8 LDP B
        # NOTE(review): the A loads in this loop are 10 LDR q (5 per group),
        # not LDP; the B loads are 8 LDP as stated.
1:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16         // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        LDR q0, [x3], 16         // Load next 5 A
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        LDR q2, [x9], 16
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        LDR q4, [x10], 16
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        LDR q6, [x11], 16
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        LDR q8, [x12], 16
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 32          // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDP A + 8 LDP B
        # First block same as main loop.  Second block has no preloads.
        # NOTE(review): as written the epilogue issues 5 LDR q of A and
        # 5 LDP of B, all in the first block.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16         // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        TST x0, 31               // any of the low 5 bits of k set? (used by B.NE below)

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.NE 4f                  // kc remainder of 4, 2 and/or 1 floats

        # Clamp
3:
        FMAX v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 8           // nc -= 8; flags consumed by B.LO / B.HI below
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMAX v23.4s, v23.4s, v30.4s
        FMAX v24.4s, v24.4s, v30.4s
        FMAX v25.4s, v25.4s, v30.4s
        FMAX v26.4s, v26.4s, v30.4s
        FMAX v27.4s, v27.4s, v30.4s
        FMAX v28.4s, v28.4s, v30.4s
        FMAX v29.4s, v29.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s
        FMIN v23.4s, v23.4s, v31.4s
        FMIN v24.4s, v24.4s, v31.4s
        FMIN v25.4s, v25.4s, v31.4s
        FMIN v26.4s, v26.4s, v31.4s
        FMIN v27.4s, v27.4s, v31.4s
        FMIN v28.4s, v28.4s, v31.4s
        FMIN v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        B.LO 7f                  // fewer than 8 columns left: partial store

        SUB x3, x3, x2           // a0 -= kc
        STP q28, q29, [x7]
        ADD x7, x7, x14          // c4 += cn_stride
        SUB x9, x9, x2           // a1 -= kc
        STP q26, q27, [x13]
        ADD x13, x13, x14        // c3 += cn_stride
        SUB x10, x10, x2         // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14        // c2 += cn_stride
        SUB x11, x11, x2         // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14        // c1 += cn_stride
        SUB x12, x12, x2         // a4 -= kc
        STP q20, q21, [x6]
        ADD x6, x6, x14          // c0 += cn_stride

        B.HI 0b                  // columns remain after this tile: next tile

        # Restore d8, d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

        # Remainder of kc (fewer than 8 floats left).
        # No clamp load happens here: v30/v31 were loaded before the nc loop.
        # Bits 4, 3 and 2 of x0 equal those bits of kc, because only multiples
        # of 32 have been subtracted from it.
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d2, [x9], 8
        LDR d4, [x10], 8
        LDR d6, [x11], 8
        LDR d8, [x12], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b            // none left: go clamp and store

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s2, [x9], 4
        LDR s4, [x10], 4
        LDR s6, [x11], 4
        LDR s8, [x12], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        B 3b

        # Store odd width
        # Bits 2, 1 and 0 of x1 select stores of 4, 2 and 1 columns per row.
        # After each partial store the not-yet-stored lanes are moved down so
        # that the next store always writes from the low lanes.
7:
        TBZ x1, 2, 8f
        STR q28, [x7], 16
        MOV v28.16b, v29.16b
        STR q26, [x13], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d28, [x7], 8
        DUP d28, v28.d[1]
        STR d26, [x13], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s28, [x7]
        STR s26, [x13]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
10:
        # Restore d8, d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_5x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
