// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/4x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8
#
# Computes a 4-row by 8-column tile of float results at a time.  The rows of A
# are reached through the pointer array a (4 pointers are read per ks step),
# the accumulators are seeded from 8 floats at w, and every result is clamped
# with the two floats read from params before it is stored to c.
#
# Units as used by the code below:
#   kc        - bytes of each A row consumed per ks step (counted down by 32)
#   ks        - bytes of A pointers per output tile (counted down by 32, which
#               is 4 pointers of 8 bytes)
#   cm_stride - bytes between consecutive rows of C
#   cn_stride - bytes added to every C row pointer after a full 8-column store
#   a_offset  - bytes added to every A pointer that is not equal to zero
#
# NOTE(review): the function name says cortex_a57 while the template named in
# the file header is the cortex-a75 one - presumably both kernels are
# generated from the same template; confirm against the generator script.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x20 a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0   v0  v4
# A1   v1  v5
# A2   v2  v6
# A3   v3  v7
# B    v8  v9 v10 v11
# B   v12 v13 v14 v15
# B   v20 v21 v22 v23
# B   v24 v25 v26 v27
# C   v16 v17
# C   v18 v19
# C   v28 v29
# C   v30 v31
# Clamp v4 v5
#
# v4/v5 serve two purposes: they hold the clamp values, and the main loop and
# epilogue also load A into them.  The epilogue reloads the clamp values once
# its last use of v4/v5 as A is done.

# Control flow (local numeric labels):
#   label 0       - start of one 8-column tile: seed accumulators from w
#   label 1       - ks loop: fetch the next 4 A row pointers
#   label 2       - main loop: 32 bytes of every A row per iteration
#   label 3       - epilogue: drains the loads issued by the prologue/main loop
#   label 4, 5, 6 - remainders of 16, 8 and 4 bytes of every A row
#   label 7       - bottom of the ks loop, then clamp and full-width store
#   label 8 to 11 - partial-width store for a final tile narrower than 8

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a57

        # The stack arguments are read first, while sp still has its value
        # from function entry.

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        # v4 = first float at params (used by FMAX), v5 = second (used by FMIN)
        LD2R {v4.4s, v5.4s}, [x8]

        # Save x20 on stack
        # Allocates an 80 byte frame: x20 at [sp], d8-d15 at [sp + 16] up to
        # [sp + 79]; bytes 8 to 15 of the frame are not used.
        STR x20, [sp, -80]!

        # Save d8-d15 on stack
        STP d8, d9, [sp, 16]
        STP d10, d11, [sp, 32]
        STP d12, d13, [sp, 48]
        STP d14, d15, [sp, 64]

        # Clamp C pointers
        # A row index that is not below mr reuses the pointer of the previous
        # row, so all four row pointers are always set.
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO     //   c3 = c2

0:
        # Load initial bias from w into accumulators
        # All four rows start from the same 8 floats.
        LDP q16, q17, [x5], 32
        MOV v18.16b, v16.16b
        MOV v19.16b, v17.16b
        MOV v28.16b, v16.16b
        MOV v29.16b, v17.16b
        MOV v30.16b, v16.16b
        MOV v31.16b, v17.16b

        MOV x9, x3  // p = ks

1:
        # Load next 4 A pointers
        LDP x20, x13, [x4], 16
        LDP x14, x15, [x4], 16

        # A pointer equal to zero is used unchanged; any other pointer gets
        # a_offset added.
        # NOTE(review): presumably zero is a shared buffer standing in for
        # padding rows - confirm against the caller.
        CMP x20, x12            // if a0 == zero
        ADD x20, x20, x11       // a0 += a_offset
        CSEL x20, x12, x20, EQ  //   a0 = zero, else a0 + a_offset
        CMP x13, x12            // if a1 == zero
        ADD x13, x13, x11       // a1 += a_offset
        CSEL x13, x12, x13, EQ  //   a1 = zero, else a1 + a_offset
        CMP x14, x12            // if a2 == zero
        ADD x14, x14, x11       // a2 += a_offset
        CSEL x14, x12, x14, EQ  //   a2 = zero, else a2 + a_offset
        CMP x15, x12            // if a3 == zero
        ADD x15, x15, x11       // a3 += a_offset
        CSEL x15, x12, x15, EQ  //   a3 = zero, else a3 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # x0 (mr) is no longer needed and is reused as the k byte counter.
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue
        # Read first block of 4 A and B: 16 bytes of every A row into v0-v3
        # and 128 bytes of w into v20-v27.
        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Is there at least another 32 bytes? If yes, do the main loop;
        # otherwise go straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A
        # Each iteration consumes 32 bytes of every A row and 256 bytes of w.
        # Loads for the next block are issued between the FMAs of the current
        # block.
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16       // overwrites the clamp value in v4 with A0
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16       // overwrites the clamp value in v5 with A1
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x20], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x13], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x14], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x15], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32         // sets the flags tested by B.HS below
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

        B.HS 2b

3:
        # Epilogue
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load min/max values
        # The four FMAs above were the last readers of v4/v5 as A, so the
        # clamp values can be restored here.
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

4:
        # Remainder- 4 floats of A
        # Only multiples of 32 were subtracted from kc to form x0, so bits 4,
        # 3 and 2 of x0 still say whether 16, 8 and 4 bytes are left over.
        TBZ x0, 4, 5f

        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

5:
        # Remainder- 2 floats of A
        TBZ x0, 3, 6f

        LDR d0, [x20], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x13], 8
        LDR d2, [x14], 8
        LDR d3, [x15], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

6:
        # Remainder- 1 float of A
        TBZ x0, 2, 7f

        LDR s0, [x20], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x13], 4
        LDR s2, [x14], 4
        LDR s3, [x15], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

7:
        # ks loop
        SUBS x9, x9, 32  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        # FMAX against v4 applies the lower bound, FMIN against v5 the upper.
        FMAX v16.4s, v16.4s, v4.4s
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        SUBS x1, x1, 8
        B.LO 8f

        # Rows are written from c3 down to c0, so when row pointers alias
        # (mr < 4) the lowest-numbered row is the one written last.
        STP q30, q31, [x7]
        ADD x7, x7, x10
        STP q28, q29, [x17]
        ADD x17, x17, x10
        STP q18, q19, [x16]
        ADD x16, x16, x10
        STP q16, q17, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3  // a -= ks

        # nc loop
        # Still tests the flags from SUBS x1 above: none of the STP, ADD and
        # SUB instructions in between write the flags.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

        # Store odd width
        # x1 is nc - 8 here; subtracting 8 leaves the low three bits alone, so
        # bits 2, 1 and 0 select stores of 4, 2 and 1 floats per row.  After
        # each partial store the not yet stored values are moved down to the
        # bottom of the row's first accumulator.
8:
        TBZ x1, 2, 9f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x17], 16
        MOV v28.16b, v29.16b
        STR q18, [x16], 16
        MOV v18.16b, v19.16b
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

9:
        TBZ x1, 1, 10f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x17], 8
        DUP d28, v28.d[1]
        STR d18, [x16], 8
        DUP d18, v18.d[1]
        STR d16, [x6], 8
        DUP d16, v16.d[1]

10:
        TBZ x1, 0, 11f
        STR s30, [x7]
        STR s28, [x17]
        STR s18, [x16]
        STR s16, [x6]
11:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP d8, d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif