// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_gemm_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used.  x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1 x15  v1
# A2 x13  v2
# A3  x4  v3
# B   x5  v4  v5  v6  v7
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused  v8 v9 v10 v11 v12 v13 v14 v15

BEGIN_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride
        CSEL    x15, x3, x15, LO        // a1 = a0
        CSEL    x8, x6, x8, LO          // c1 = c0
        BIC     x2, x2, 3

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9, x8, x7              // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       // a2 = a1
        CSEL    x9, x8, x9, LS          // c2 = c1

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7, x9, x7              // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         // a3 = a2
        CSEL    x7, x9, x7, LO          // c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        LDR     x11, [sp, 8]            // params
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        SUBS    x0, x2, 8               // k = kc - 8
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 8 bytes?
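        # Each main-loop iteration consumes 8 bytes of A per row (used as SDOT
        # lanes [0] and [1]) against 128 bytes of packed W.  With fewer than 8
        # bytes of A left, branch to the 4-byte remainder path at 3:.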
        B.LO    3f

        # Main loop - 8 bytes of A
        .p2align 3
1:
        LDR     d0, [x3], 8
        LDR     q4, [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3, [x4], 8
        LDR     q5, [x5], 16
        SDOT    v16.4s, v4.16b, v0.4b[0]
        SDOT    v17.4s, v4.16b, v1.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        LDP     q4, q5, [x5], 32
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        SDOT    v16.4s, v4.16b, v0.4b[1]
        SDOT    v17.4s, v4.16b, v1.4b[1]
        LDP     q6, q7, [x5], 32
        SDOT    v18.4s, v4.16b, v2.4b[1]
        SDOT    v19.4s, v4.16b, v3.4b[1]
        SDOT    v20.4s, v5.16b, v0.4b[1]
        SDOT    v21.4s, v5.16b, v1.4b[1]
        SDOT    v22.4s, v5.16b, v2.4b[1]
        SDOT    v23.4s, v5.16b, v3.4b[1]
        SDOT    v24.4s, v6.16b, v0.4b[1]
        SDOT    v25.4s, v6.16b, v1.4b[1]
        SDOT    v26.4s, v6.16b, v2.4b[1]
        SDOT    v27.4s, v6.16b, v3.4b[1]
        SDOT    v28.4s, v7.16b, v0.4b[1]
        SDOT    v29.4s, v7.16b, v1.4b[1]
        SDOT    v30.4s, v7.16b, v2.4b[1]
        SUBS    x0, x0, 8
        SDOT    v31.4s, v7.16b, v3.4b[1]
        B.HS    1b

        # Is there a remainder? - 4 bytes of A
        TBNZ    x0, 2, 3f

2:
        # Apply params - scale, shift, bias and clamp
        LD2R    {v0.4s, v1.4s}, [x11], 8
        CMEQ    v2.4s, v1.4s, 0

        BIC     v4.16b, v16.16b, v2.16b
        BIC     v5.16b, v17.16b, v2.16b
        BIC     v6.16b, v18.16b, v2.16b
        BIC     v7.16b, v19.16b, v2.16b

        SQRDMULH v16.4s, v16.4s, v0.4s
        SQRDMULH v17.4s, v17.4s, v0.4s
        SQRDMULH v18.4s, v18.4s, v0.4s
        SQRDMULH v19.4s, v19.4s, v0.4s

        SSRA    v16.4s, v4.4s, 31       // signed shift right accumulate
        SSRA    v17.4s, v5.4s, 31
        SSRA    v18.4s, v6.4s, 31
        SSRA    v19.4s, v7.4s, 31

        BIC     v4.16b, v20.16b, v2.16b
        BIC     v5.16b, v21.16b, v2.16b
        BIC     v6.16b, v22.16b, v2.16b
        BIC     v7.16b, v23.16b, v2.16b

        SQRDMULH v20.4s, v20.4s, v0.4s
        SQRDMULH v21.4s, v21.4s, v0.4s
        SQRDMULH v22.4s, v22.4s, v0.4s
        SQRDMULH v23.4s, v23.4s, v0.4s

        SSRA    v20.4s, v4.4s, 31
        SSRA    v21.4s, v5.4s, 31
        SSRA    v22.4s, v6.4s, 31
        SSRA    v23.4s, v7.4s, 31

        BIC     v4.16b, v24.16b, v2.16b
        BIC     v5.16b, v25.16b, v2.16b
        BIC     v6.16b, v26.16b, v2.16b
        BIC     v7.16b, v27.16b, v2.16b

        SQRDMULH v24.4s, v24.4s, v0.4s
        SQRDMULH v25.4s, v25.4s, v0.4s
        SQRDMULH v26.4s, v26.4s, v0.4s
        SQRDMULH v27.4s, v27.4s, v0.4s

        SSRA    v24.4s, v4.4s, 31
        SSRA    v25.4s, v5.4s, 31
        SSRA    v26.4s, v6.4s, 31
        SSRA    v27.4s, v7.4s, 31

        BIC     v4.16b, v28.16b, v2.16b
        BIC     v5.16b, v29.16b, v2.16b
        BIC     v6.16b, v30.16b, v2.16b
        BIC     v7.16b, v31.16b, v2.16b

        SQRDMULH v28.4s, v28.4s, v0.4s
        SQRDMULH v29.4s, v29.4s, v0.4s
        SQRDMULH v30.4s, v30.4s, v0.4s
        SQRDMULH v31.4s, v31.4s, v0.4s

        SSRA    v28.4s, v4.4s, 31
        SSRA    v29.4s, v5.4s, 31
        SSRA    v30.4s, v6.4s, 31
        SSRA    v31.4s, v7.4s, 31

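        # The BIC/SQRDMULH/SSRA sequence above is a saturating doubling-high
        # multiply by the scale in v0, with 1 subtracted from each product
        # whose original accumulator was negative (and the shift in v1 is
        # nonzero), so that the rounding right shift below rounds halfway
        # values away from zero rather than toward +infinity.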
        SRSHL   v16.4s, v16.4s, v1.4s   // signed rounding shift left
        SRSHL   v17.4s, v17.4s, v1.4s
        SRSHL   v18.4s, v18.4s, v1.4s
        SRSHL   v19.4s, v19.4s, v1.4s
        SRSHL   v20.4s, v20.4s, v1.4s
        SRSHL   v21.4s, v21.4s, v1.4s
        SRSHL   v22.4s, v22.4s, v1.4s
        SRSHL   v23.4s, v23.4s, v1.4s
        SRSHL   v24.4s, v24.4s, v1.4s
        SRSHL   v25.4s, v25.4s, v1.4s
        SRSHL   v26.4s, v26.4s, v1.4s
        SRSHL   v27.4s, v27.4s, v1.4s
        SRSHL   v28.4s, v28.4s, v1.4s
        SRSHL   v29.4s, v29.4s, v1.4s
        SRSHL   v30.4s, v30.4s, v1.4s
        SRSHL   v31.4s, v31.4s, v1.4s

        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v2.8h}, [x11], 2       // add bias (output zero point)

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v2.8h
        SQADD   v17.8h, v17.8h, v2.8h
        SQADD   v18.8h, v18.8h, v2.8h
        SQADD   v19.8h, v19.8h, v2.8h
        SQADD   v24.8h, v24.8h, v2.8h
        SQADD   v25.8h, v25.8h, v2.8h
        SQADD   v26.8h, v26.8h, v2.8h
        SQADD   v27.8h, v27.8h, v2.8h
        LD1R    {v0.16b}, [x11], 1      // clamp min value

        SQXTN   v4.8b, v16.8h
        SQXTN   v5.8b, v17.8h
        SQXTN   v6.8b, v18.8h
        SQXTN   v7.8b, v19.8h
        LD1R    {v1.16b}, [x11]         // clamp max value
        SQXTN2  v4.16b, v24.8h
        SQXTN2  v5.16b, v25.8h
        SQXTN2  v6.16b, v26.8h
        SQXTN2  v7.16b, v27.8h
        LDR     x12, [sp]               // cn_stride

        SMAX    v4.16b, v4.16b, v0.16b
        SMAX    v5.16b, v5.16b, v0.16b
        SMAX    v6.16b, v6.16b, v0.16b
        SMAX    v7.16b, v7.16b, v0.16b
        SUBS    x1, x1, 16
        SMIN    v4.16b, v4.16b, v1.16b
        SMIN    v5.16b, v5.16b, v1.16b
        SMIN    v6.16b, v6.16b, v1.16b
        SMIN    v7.16b, v7.16b, v1.16b
        B.LO    4f

        # Store full 4 x 16
        ST1     {v4.16b}, [x6], x12
        SUB     x3, x3, x2              // a0 -= kc
        ST1     {v5.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v6.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v7.16b}, [x7], x12
        SUB     x4, x4, x2              // a3 -= kc
        B.NE    0b
        RET

        # Remainder - 4 bytes of A
        .p2align 3
3:
        LDR     s0, [x3], 4
        LDR     q4, [x5], 16
        LDR     s1, [x15], 4
        LDR     s2, [x13], 4
        LDR     s3, [x4], 4
        SDOT    v16.4s, v4.16b, v0.4b[0]
        LDR     q5, [x5], 16
        SDOT    v17.4s, v4.16b, v1.4b[0]
        SDOT    v18.4s, v4.16b, v2.4b[0]
        SDOT    v19.4s, v4.16b, v3.4b[0]
        SDOT    v20.4s, v5.16b, v0.4b[0]
        LDP     q6, q7, [x5], 32
        SDOT    v21.4s, v5.16b, v1.4b[0]
        SDOT    v22.4s, v5.16b, v2.4b[0]
        SDOT    v23.4s, v5.16b, v3.4b[0]
        SDOT    v24.4s, v6.16b, v0.4b[0]
        SDOT    v25.4s, v6.16b, v1.4b[0]
        SDOT    v26.4s, v6.16b, v2.4b[0]
        SDOT    v27.4s, v6.16b, v3.4b[0]
        SDOT    v28.4s, v7.16b, v0.4b[0]
        SDOT    v29.4s, v7.16b, v1.4b[0]
        SDOT    v30.4s, v7.16b, v2.4b[0]
        SDOT    v31.4s, v7.16b, v3.4b[0]
        B       2b

        # Store odd width
        .p2align 3
4:
        TBZ     x1, 3, 5f
        STR     d4, [x6], 8
        DUP     d4, v4.d[1]
        STR     d5, [x8], 8
        DUP     d5, v5.d[1]
        STR     d6, [x9], 8
        DUP     d6, v6.d[1]
        STR     d7, [x7], 8
        DUP     d7, v7.d[1]
5:
        TBZ     x1, 2, 6f
        STR     s4, [x6], 4
        DUP     s4, v4.s[1]
        STR     s5, [x8], 4
        DUP     s5, v5.s[1]
        STR     s6, [x9], 4
        DUP     s6, v6.s[1]
        STR     s7, [x7], 4
        DUP     s7, v7.s[1]
6:
        TBZ     x1, 1, 7f
        ST1     {v4.h}[0], [x6], 2
        DUP     h4, v4.h[1]
        ST1     {v5.h}[0], [x8], 2
        DUP     h5, v5.h[1]
        ST1     {v6.h}[0], [x9], 2
        DUP     h6, v6.h[1]
        ST1     {v7.h}[0], [x7], 2
        DUP     h7, v7.h[1]
7:
        TBZ     x1, 0, 8f
        ST1     {v4.b}[0], [x6]
        ST1     {v5.b}[0], [x8]
        ST1     {v6.b}[0], [x9]
        ST1     {v7.b}[0], [x7]
8:
        RET

END_FUNCTION xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif