// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Overview (as implemented below):
#   The outer loop (label 0) runs once per block of 8 output columns and
#   repeats while more than 8 columns were left before the block (B.HI 0b).
#   Each pass seeds a 6 row x 8 column accumulator tile (v20-v31) from the
#   next 32 bytes at w, accumulates A * B over kc bytes of every A row,
#   clamps the tile against the two floats read from params, and stores one
#   row of the tile through each of the six C pointers.
#   Rows at index >= mr reuse the pointers of the previous row (CSEL chain in
#   the setup code), so six rows are always computed and stored.
#   kc is handled in bytes: x0 is reused as the remaining-k counter once mr
#   has been consumed, and is reused again to hold cn_stride in the store path.
#   w is read strictly sequentially through x5: 32 bytes of initial
#   accumulator values, then 32 bytes of B for every 4 bytes of an A row.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4  v10
# A5   v5  v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

        # Load params pointer
        # Done before the push below, while the stack arguments are still at
        # their entry offsets.
        LDR x8, [sp, 8]

        # Clamp A and C pointers / Save d8-d15 on stack
        # The pre-indexed STP reserves 64 bytes; the remaining three pairs are
        # stored between the pointer computations.  The ADDs do not set flags,
        # so each CMP feeds two CSEL pairs (LO for "<", then LS for "<=").
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # From here on x4 and x7 hold a5 and c5; the strides are no longer
        # available in registers.
        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Outer loop: one pass per block of 8 output columns.
0:
        # Load initial bias from w into accumulators
        # The same 8 floats (v20, v21) are copied into the accumulators of
        # all six rows; the copies are interleaved with prefetches.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x3]     // Prefetch A
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x9]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x10]
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x11]
        MOV v30.16b, v20.16b
        PRFM PLDL1KEEP, [x12]
        MOV v31.16b, v21.16b
        PRFM PLDL1KEEP, [x4]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # On a borrow the remainder code at label 4 handles all of kc.
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
        # A is double buffered: v0-v5 feed the first 48 FMA while v6-v11 are
        # loaded, then v6-v11 feed the second 48 FMA while v0-v5 are reloaded
        # for the next iteration (or for the epilogue).
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16         // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        # The k counter is decremented here; the FMLAs that follow do not
        # touch the flags, so B.HS still sees the result of this SUBS.
        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load min/max values
        # Safe to overwrite v6/v7 here: the two FMLAs above were their last
        # use as A data, and the four FMLAs below read only v8-v11.
        # LD2R fills v6 with the first float at params and v7 with the second.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # Only multiples of 32 were subtracted from kc, so the low 5 bits of
        # x0 still equal kc modulo 32 even though x0 has wrapped below zero.
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
        # FMAX against v6 applies the lower bound, FMIN against v7 the upper.
3:
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        # It was at [sp] on entry; the 64-byte save area now sits below it.
        LDR x0, [sp, 64]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        # Flags from this SUBS are consumed by B.LO 7f and, on the full-width
        # path, by B.HI 0b; nothing in between modifies them.
        SUBS x1, x1, 8
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f

        # Each C pointer advances by cn_stride and each A pointer is rewound
        # by kc so the next column block reads the same A rows again.
        STP q20, q21, [x6]
        ADD x6, x6, x0
        SUB x3, x3, x2           // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0
        SUB x9, x9, x2           // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0
        SUB x10, x10, x2         // a2 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0
        SUB x11, x11, x2         // a3 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0
        SUB x12, x12, x2         // a4 -= kc
        STP q30, q31, [x7]
        ADD x7, x7, x0
        SUB x4, x4, x2           // a5 -= kc

        # More columns remain when nc was greater than 8 before the SUBS.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        # Remainder of kc (fewer than 8 floats left).  Reached either from
        # the epilogue or directly from the kc < 32 check, which is why the
        # clamp values are loaded again here.
4:
        # Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        # Reached when fewer than 8 columns remain.  x1 has wrapped below
        # zero, but its low 3 bits still equal the remaining column count, so
        # bits 2, 1 and 0 select stores of 4, 2 and 1 floats per row.  After
        # each partial store the not yet stored lanes are moved down to the
        # low lanes of the even-numbered accumulator of each row.
7:
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30, [x7], 8
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x14]
        STR s28, [x13]
        STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif