// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t*restrict a,  x3
#     size_t a_stride,           x4
#     const void*restrict w,     x5
#     uint8_t*restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used.  x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B    v16 v17 v18 v19
# C    v20 v21
# C    v22 v23
# C    v24 v25
# C    v26 v27
# C    v28 v29
# C    v30 v31
# Clamp v6, (v4), (v5)
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load params scale value
        LD1R {v6.8h}, [x8]
        ADD x8, x8, 2            // advance past scale to the min/max values

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          MOV v23.16b, v21.16b
          MOV v24.16b, v20.16b
          MOV v25.16b, v21.16b
          MOV v26.16b, v20.16b
          MOV v27.16b, v21.16b
          MOV v28.16b, v20.16b
          MOV v29.16b, v21.16b
          MOV v30.16b, v20.16b
          MOV v31.16b, v21.16b
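
        # Note: explanatory sketch only, not part of the original kernel. Each
        # iteration of the main loop below consumes 2 halffloats from every A
        # row plus 4 packed 8-wide B vectors, and performs two rank-1 updates
        # of the 6x16 accumulator tile. In C-style pseudocode, with acc[][]
        # standing for v20-v31 and wk0[]/wk1[] (hypothetical names) for the two
        # packed 16-wide weight rows:
        #
        #   for (size_t m = 0; m < 6; m++) {
        #     for (size_t n = 0; n < 16; n++) {
        #       acc[m][n] += a[m][k + 0] * wk0[n];  // v16:v17, broadcasts v0.h[0]..v5.h[0]
        #       acc[m][n] += a[m][k + 1] * wk1[n];  // v18:v19, broadcasts v0.h[1]..v5.h[1]
        #     }
        #   }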

        # Are there at least 2 halffloats of A (4 bytes)?
        SUBS x0, x2, 4           // k = kc - 4
        B.LO 3f                  // kc < 4: skip to the remainder

        # Main loop - 2 halffloats of A (4 bytes)
        # 24 FMA + 6 ld32 A + 4 LDR B
1:
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        SUBS x0, x0, 4           // k -= 4
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]
        B.HS 1b                  // loop while k >= 0

        # Is there a remainder? (1 halffloat of A, 2 bytes)
        TBNZ x0, 1, 3f
2:
        # Scale and Clamp
        FMUL v20.8h, v20.8h, v6.8h
        # Load params min/max values
        LD2R {v4.8h, v5.8h}, [x8]
        FMUL v21.8h, v21.8h, v6.8h
        FMUL v22.8h, v22.8h, v6.8h
        FMUL v23.8h, v23.8h, v6.8h
        FMUL v24.8h, v24.8h, v6.8h
        FMUL v25.8h, v25.8h, v6.8h
        FMUL v26.8h, v26.8h, v6.8h
        FMUL v27.8h, v27.8h, v6.8h
        FMUL v28.8h, v28.8h, v6.8h
        FMUL v29.8h, v29.8h, v6.8h
        FMUL v30.8h, v30.8h, v6.8h
        FMUL v31.8h, v31.8h, v6.8h
        # Load cn_stride
        LDR x0, [sp, 0]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v21.8h, v21.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v23.8h, v23.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v25.8h, v25.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v27.8h, v27.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v29.8h, v29.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        FMAX v31.8h, v31.8h, v4.8h
        SUBS x1, x1, 16          // nc -= 16
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v21.8h, v21.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v23.8h, v23.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v25.8h, v25.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v27.8h, v27.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v29.8h, v29.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h
        FMIN v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO 4f                  // nc < 16: store odd width

        $if INC:
          ST1 {v30.16b, v31.16b}, [x7], x0
          SUB x3, x3, x2         // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB x9, x9, x2         // a1 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x10, x10, x2       // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x11, x11, x2       // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x12, x12, x2       // a4 -= kc
          ST1 {v20.16b, v21.16b}, [x6], x0
          SUB x4, x4, x2         // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b}, [x6], x0
          SUB x3, x3, x2         // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x9, x9, x2         // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x10, x10, x2       // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x11, x11, x2       // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB x12, x12, x2       // a4 -= kc
          ST1 {v30.16b, v31.16b}, [x7], x0
          SUB x4, x4, x2         // a5 -= kc

        B.HI 0b                  // nc > 0: next 16-column tile
        RET
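
        # Note: explanatory sketch only, not part of the original kernel. The
        # remainder path at 3: handles the final halffloat of each A row when
        # kc is an odd number of halffloats (the TBNZ before 2: tests bit 1 of
        # the remaining byte count). It is a single rank-1 update, in the same
        # C-style pseudocode as above:
        #
        #   for (size_t m = 0; m < 6; m++) {
        #     for (size_t n = 0; n < 16; n++) {
        #       acc[m][n] += a[m][k] * wk0[n];      // v16:v17, broadcasts v0.h[0]..v5.h[0]
        #     }
        #   }
        #
        # before rejoining the scale/clamp code at 2b.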

3:
        # Remainder: 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        B 2b

        # Store odd width (nc bits 3..0 select 8, 4, 2, 1 remaining columns)
4:
        TBZ x1, 3, 5f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

5:
        TBZ x1, 2, 6f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

6:
        TBZ x1, 1, 7f
        $if INC:
          STR s30, [x7], 4
          DUP s30, v30.s[1]
          STR s28, [x13], 4
          DUP s28, v28.s[1]
          STR s26, [x14], 4
          DUP s26, v26.s[1]
          STR s24, [x17], 4
          DUP s24, v24.s[1]
          STR s22, [x16], 4
          DUP s22, v22.s[1]
          STR s20, [x6], 4
          DUP s20, v20.s[1]
        $else:
          STR s20, [x6], 4
          DUP s20, v20.s[1]
          STR s22, [x16], 4
          DUP s22, v22.s[1]
          STR s24, [x17], 4
          DUP s24, v24.s[1]
          STR s26, [x14], 4
          DUP s26, v26.s[1]
          STR s28, [x13], 4
          DUP s28, v28.s[1]
          STR s30, [x7], 4
          DUP s30, v30.s[1]

7:
        TBZ x1, 0, 8f
        $if INC:
          STR h30, [x7]
          STR h28, [x13]
          STR h26, [x14]
          STR h24, [x17]
          STR h22, [x16]
          STR h20, [x6]
        $else:
          STR h20, [x6]
          STR h22, [x16]
          STR h24, [x17]
          STR h26, [x14]
          STR h28, [x13]
          STR h30, [x7]
8:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
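
// Note: explanatory only, not part of the original source. Overall, the non-INC
// variant of this kernel computes, in C-style pseudocode,
//
//   c[m][n] = fmin(fmax(scale * (bias[n] + sum_k(a[m][k] * w[k][n])), params.min), params.max)
//
// for m < mr and n < nc, with the bias rows packed into w, while the INC
// variant starts the accumulation from the acc buffer instead of the bias.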