// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel only touches x0-x17, v0-v6 and v16-v30, so nothing is saved.

# Overview (as implemented below):
#   For each block of 8 output columns, six accumulator rows (v20..v30) are
#   initialized from the acc buffer (INC variant) or from 8 bias values read
#   from w, updated with FMLA over kc bytes of each A row, multiplied by the
#   scale held in v6, clamped with FMAX against v4 and FMIN against v5, and
#   stored to c0..c5. kc and the strides are byte counts; one halffloat is
#   2 bytes.
#   x0 is reused: it holds mr on entry, then the k byte counter, then
#   cn_stride in the store epilogue.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20
# C   v22
# C   v24
# C   v26
# C   v28
# C   v30
# Scale v6
# Clamp (v4), (v5)   - lower and upper bound, reloaded into the A4/A5
#                      registers after the last FMLA of each column block
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15


BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
        # A row whose index is >= mr reuses the pointers of the previous row,
        # so all six rows are always loaded, computed and stored.
        # Each CMP feeds two pairs of CSEL: LO for the odd row, LS for the
        # following even row (ADD does not modify the flags in between).
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # From here on x4 and x7 hold a5 and c5 instead of the strides.
        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load params scale value
        # The first halffloat at params is broadcast to all 8 lanes of v6;
        # x8 is then advanced to the two halffloats read by LD2R below.
        LD1R {v6.8h}, [x8]
        ADD x8, x8, 2

        # Start of one block of 8 output columns.
0:
        $if INC:
          # Load initial accumulators
          # Six rows of 8 halffloats (16 bytes each) are consumed from acc.
          LDP q20, q22, [x15], 32
          LDP q24, q26, [x15], 32
          LDP q28, q30, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          # The same 8 bias values seed every row.
          LDR q20, [x5], 16
          MOV v22.16b, v20.16b
          MOV v24.16b, v20.16b
          MOV v26.16b, v20.16b
          MOV v28.16b, v20.16b
          MOV v30.16b, v20.16b

        # Are there at least 4 halffloats (8 bytes)?
        SUBS x0, x2, 8  // k = kc - 8
        B.LO 3f

        # Main loop - 4 halffloats of A (8 bytes)
        # 24 FMA + 6 ld64 A + 4 LDR B
        # The flags set by SUBS are consumed by B.HS at the bottom of the
        # loop; the LDR and FMLA instructions in between leave them intact.
1:
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        SUBS x0, x0, 8
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]

        FMLA v20.8h, v18.8h, v0.h[2]
        FMLA v22.8h, v18.8h, v1.h[2]
        FMLA v24.8h, v18.8h, v2.h[2]
        FMLA v26.8h, v18.8h, v3.h[2]
        FMLA v28.8h, v18.8h, v4.h[2]
        FMLA v30.8h, v18.8h, v5.h[2]

        FMLA v20.8h, v19.8h, v0.h[3]
        FMLA v22.8h, v19.8h, v1.h[3]
        FMLA v24.8h, v19.8h, v2.h[3]
        FMLA v26.8h, v19.8h, v3.h[3]
        FMLA v28.8h, v19.8h, v4.h[3]
        FMLA v30.8h, v19.8h, v5.h[3]
        B.HS 1b

        # x0 is now (remaining bytes - 8). Subtracting 8 leaves bits 0-2
        # unchanged, so bits 2 and 1 of x0 are those of the remaining bytes.
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 4f
        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 5f
2:
        # Scale and Clamp
        FMUL v20.8h, v20.8h, v6.8h
        # Load params values
        # v4 and v5 (A4/A5 during accumulation) are free now; LD2R broadcasts
        # the halffloat at x8 into v4 and the following one into v5.
        LD2R {v4.8h, v5.8h}, [x8]
        FMUL v22.8h, v22.8h, v6.8h
        FMUL v24.8h, v24.8h, v6.8h
        FMUL v26.8h, v26.8h, v6.8h
        FMUL v28.8h, v28.8h, v6.8h
        FMUL v30.8h, v30.8h, v6.8h
        # Load cn_stride
        LDR x0, [sp, 0]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        // nc -= 8. The flags are used twice: by B.LO 6f (fewer than 8
        // columns left) and by B.HI 0b after the stores (more columns
        // remain). FMIN, ST1 and SUB do not modify them.
        SUBS x1, x1, 8
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h

        # Store full 6 x 8
        B.LO 6f

        # Each ST1 advances its C pointer by cn_stride, and each SUB rewinds
        # an A pointer by kc so the next column block rereads the same rows.
        # The w pointer x5 is not rewound. The INC variant stores c5 first,
        # the other variant stores c0 first.
        $if INC:
          ST1 {v30.16b}, [x7], x0
          SUB x3, x3, x2   // a0 -= kc
          ST1 {v28.16b}, [x13], x0
          SUB x9, x9, x2   // a1 -= kc
          ST1 {v26.16b}, [x14], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v24.16b}, [x17], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v22.16b}, [x16], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v20.16b}, [x6], x0
          SUB x4, x4, x2   // a5 -= kc
        $else:
          ST1 {v20.16b}, [x6], x0
          SUB x3, x3, x2   // a0 -= kc
          ST1 {v22.16b}, [x16], x0
          SUB x9, x9, x2   // a1 -= kc
          ST1 {v24.16b}, [x17], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b}, [x14], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v28.16b}, [x13], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v30.16b}, [x7], x0
          SUB x4, x4, x2   // a5 -= kc

        B.HI 0b
        RET

        # Reached when kc < 8: x0 = kc - 8, whose bits 0-2 equal those of kc.
        # When bit 2 is clear the 1 halffloat remainder runs unconditionally.
        # NOTE(review): this presumes kc is a nonzero multiple of 2 bytes -
        # confirm against the caller contract.
3:
        TBZ x0, 2, 5f
4:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4

        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]

        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]

        // No trailing single halffloat: go scale, clamp and store.
        TBZ x0, 1, 2b

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        B 2b

        # Store odd width
        # Reached with x1 = nc - 8 < 0 (unsigned borrow); bits 0-2 of x1
        # equal those of the number of columns still to store. Bit 2 selects
        # a 4 halffloat store, bit 1 a 2 halffloat store, bit 0 a single
        # halffloat. After each partial store DUP moves the next unstored
        # lanes down to the bottom of the register. The function returns
        # after this path, so the A pointers are not rewound.
6:
        TBZ x1, 2, 7f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

7:
        TBZ x1, 1, 8f
        $if INC:
          STR s30, [x7], 4
          DUP s30, v30.s[1]
          STR s28, [x13], 4
          DUP s28, v28.s[1]
          STR s26, [x14], 4
          DUP s26, v26.s[1]
          STR s24, [x17], 4
          DUP s24, v24.s[1]
          STR s22, [x16], 4
          DUP s22, v22.s[1]
          STR s20, [x6], 4
          DUP s20, v20.s[1]
        $else:
          STR s20, [x6], 4
          DUP s20, v20.s[1]
          STR s22, [x16], 4
          DUP s22, v22.s[1]
          STR s24, [x17], 4
          DUP s24, v24.s[1]
          STR s26, [x14], 4
          DUP s26, v26.s[1]
          STR s28, [x13], 4
          DUP s28, v28.s[1]
          STR s30, [x7], 4
          DUP s30, v30.s[1]

8:
        TBZ x1, 0, 9f
        $if INC:
          STR h30, [x7]
          STR h28, [x13]
          STR h26, [x14]
          STR h24, [x17]
          STR h22, [x16]
          STR h20, [x6]
        $else:
          STR h20, [x6]
          STR h22, [x16]
          STR h24, [x17]
          STR h26, [x14]
          STR h28, [x13]
          STR h30, [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif