// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Overview, as implemented below:
#   Accumulators v20-v31 hold a tile of 6 rows x 8 columns of C. Every row is
#   initialised from the same 8 floats read at w (the bias).
#   For each float of A along k, 8 floats of B are read from w and multiplied
#   by that A element in each of the 6 rows (FMLA by element).
#   The tile is clamped with FMAX against v6 and FMIN against v7, then stored.
#   Label 0 is the loop over nc in steps of 8 columns. The A pointers are
#   rewound by kc after each full store, while w (x5) keeps advancing.
#   Register x0 is reused: mr on entry, then the k byte counter, then cn_stride.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# (v6/v7 carry A data inside the main loop and epilogue and are only loaded
#  with the clamp bounds after their last use as A data.)

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57

        # Load params pointer
        # Read from the incoming stack area before sp is moved below.
        LDR x8, [sp, 8]

        # Clamp A and C pointers / Save d8-d15 on stack
        # Rows with index >= mr reuse the pointers of the previous row, so all
        # six row pointers are always valid to read and write.
        # The STP pre-decrements sp by 64, so cn_stride is at [sp, 64] from here on.
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride (x4 no longer holds a_stride)
        ADD x7, x13, x7          // c5 = c4 + cm_stride (x7 no longer holds cm_stride)
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Start of one 8-column pass over nc.
0:
        # Load initial bias from w into accumulators
        # The same 8 floats seed all 6 rows.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # Otherwise go straight to the remainder code at label 4.
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q12, q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        # Otherwise skip the loop and run only the epilogue at label 2.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
        # Loads for the next step are interleaved between the FMLA groups; each
        # register is reloaded only after its last use in the current step.
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32      // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        # Uses the A values in v6-v11 and reloads v0-v5 and 3 of the 4 B pairs
        # for the next pass through label 1 or label 2.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16            // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32      // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32             // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b                     // loop while the subtraction did not borrow

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32      // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load min/max values
        # Placed after the last FMLA that reads v6/v7 as A data. LD2R reads two
        # consecutive floats at x8 and broadcasts them into v6 and v7.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # Only the low 5 bits of the k counter are examined.
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
        # Also the join point for the remainder code (labels 4, 5 and 6).
        # Each accumulator becomes min(max(acc, v6), v7).
3:
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR x0, [sp, 64]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8              // nc -= 8; flags used by B.LO and B.HI below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Fewer than 8 columns left: take the partial store at label 7 instead.
        B.LO 7f

        STP q20, q21, [x6]
        ADD x6, x6, x0              // c0 += cn_stride
        SUB x3, x3, x2              // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0            // c1 += cn_stride
        SUB x9, x9, x2              // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0            // c2 += cn_stride
        SUB x10, x10, x2            // a2 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0            // c3 += cn_stride
        SUB x11, x11, x2            // a3 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0            // c4 += cn_stride
        SUB x12, x12, x2            // a4 -= kc
        STP q30, q31, [x7]
        ADD x7, x7, x0              // c5 += cn_stride
        SUB x4, x4, x2              // a5 -= kc

        # Flags are still those of SUBS x1, x1, 8 (ADD, SUB and STP leave them
        # untouched): loop again while columns remain.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        # Remainder handling. Reached when kc is below 32 bytes (v6/v7 not yet
        # loaded) or from the epilogue when the low bits of k are non-zero.
        # Bits 4, 3 and 2 of x0 select the 16, 8 and 4 byte steps.
4:
        # Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        # Bits 2, 1 and 0 of x1 select stores of 4, 2 and 1 columns per row.
        # After each partial store the remaining lanes are moved down so the
        # next step always stores from the low lanes of v20, v22, ..., v30.
7:
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30, [x7], 8
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x14]
        STR s28, [x13]
        STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif