// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const void*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used.  x18 is reserved by the OS.

# A pointers
#  x3 a0
# x11 a1
# x12 a2
#  x4 a3 / a_stride

# C pointers
#  x6 c0
#  x9 c1
# x10 c2
#  x7 c3 / cm_stride

# Vector register usage
# A0  v0
# A1  v1
# A2  v2
# A3  v3
# B   v20 v21 v22 v23
# C   v16
# C   v18
# C   v28
# C   v30
# Clamp v4, v5, v6
# unused A   v7 v8 v9 v10 v11
# unused B   v19

# A rough C-level sketch of this computation is given at the end of this file.

BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load params values
        LD3R {v4.8h, v5.8h, v6.8h}, [x8]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

0:
        $if INC:
          # Load initial accumulators
          LDP q16, q18, [x15], 32
          LDP q28, q30, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          LDR q16, [x5], 16
          MOV v18.16b, v16.16b
          MOV v28.16b, v16.16b
          MOV v30.16b, v16.16b

        # Are there at least 4 halffloats (8 bytes)?
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 3f

        # Main loop - 4 halffloats of A (8 bytes)
1:
        LDR d0, [x3], 8
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        SUBS x0, x0, 8
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v18.8h, v20.8h, v1.h[0]
        FMLA v28.8h, v20.8h, v2.h[0]
        FMLA v30.8h, v20.8h, v3.h[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16

        FMLA v16.8h, v21.8h, v0.h[1]
        FMLA v18.8h, v21.8h, v1.h[1]
        FMLA v28.8h, v21.8h, v2.h[1]
        FMLA v30.8h, v21.8h, v3.h[1]

        FMLA v16.8h, v22.8h, v0.h[2]
        FMLA v18.8h, v22.8h, v1.h[2]
        FMLA v28.8h, v22.8h, v2.h[2]
        FMLA v30.8h, v22.8h, v3.h[2]

        FMLA v16.8h, v23.8h, v0.h[3]
        FMLA v18.8h, v23.8h, v1.h[3]
        FMLA v28.8h, v23.8h, v2.h[3]
        FMLA v30.8h, v23.8h, v3.h[3]
        B.HS 1b

        # Is there a remainder? - 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 4f
        # Is there a remainder? - 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 5f
2:
        # Scale and Clamp
        FMUL v16.8h, v16.8h, v4.8h
        SUBS x1, x1, 8
        FMUL v18.8h, v18.8h, v4.8h
        FMUL v28.8h, v28.8h, v4.8h
        FMUL v30.8h, v30.8h, v4.8h
        FMAX v16.8h, v16.8h, v5.8h
        FMAX v18.8h, v18.8h, v5.8h
        FMAX v28.8h, v28.8h, v5.8h
        FMAX v30.8h, v30.8h, v5.8h
        FMIN v16.8h, v16.8h, v6.8h
        FMIN v18.8h, v18.8h, v6.8h
        FMIN v28.8h, v28.8h, v6.8h
        FMIN v30.8h, v30.8h, v6.8h

        # Store full 4 x 8
        B.LO 6f

        $if INC:
          ST1 {v30.16b}, [x7], x14
          SUB x3, x3, x2           // a0 -= kc
          ST1 {v28.16b}, [x10], x14
          SUB x11, x11, x2         // a1 -= kc
          ST1 {v18.16b}, [x9], x14
          SUB x12, x12, x2         // a2 -= kc
          ST1 {v16.16b}, [x6], x14
          SUB x4, x4, x2           // a3 -= kc
        $else:
          ST1 {v16.16b}, [x6], x14
          SUB x3, x3, x2           // a0 -= kc
          ST1 {v18.16b}, [x9], x14
          SUB x11, x11, x2         // a1 -= kc
          ST1 {v28.16b}, [x10], x14
          SUB x12, x12, x2         // a2 -= kc
          ST1 {v30.16b}, [x7], x14
          SUB x4, x4, x2           // a3 -= kc

        B.HI 0b
        RET

3:
        TBZ x0, 2, 5f
4:
        # Remainder - 2 halffloats of A (4 bytes)
        LDR s0, [x3], 4
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4

        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v18.8h, v20.8h, v1.h[0]
        FMLA v28.8h, v20.8h, v2.h[0]
        FMLA v30.8h, v20.8h, v3.h[0]

        FMLA v16.8h, v21.8h, v0.h[1]
        FMLA v18.8h, v21.8h, v1.h[1]
        FMLA v28.8h, v21.8h, v2.h[1]
        FMLA v30.8h, v21.8h, v3.h[1]

        TBZ x0, 1, 2b

5:
        # Remainder - 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q20, [x5], 16
        LDR h1, [x11], 2
        LDR h2, [x12], 2
        LDR h3, [x4], 2
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v18.8h, v20.8h, v1.h[0]
        FMLA v28.8h, v20.8h, v2.h[0]
        FMLA v30.8h, v20.8h, v3.h[0]
        B 2b

        # Store odd width
6:
        TBZ x1, 2, 7f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d16, [x6], 8
          DUP d16, v16.d[1]
        $else:
          STR d16, [x6], 8
          DUP d16, v16.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

7:
        TBZ x1, 1, 8f
        $if INC:
          STR s30, [x7], 4
          DUP s30, v30.s[1]
          STR s28, [x10], 4
          DUP s28, v28.s[1]
          STR s18, [x9], 4
          DUP s18, v18.s[1]
          STR s16, [x6], 4
          DUP s16, v16.s[1]
        $else:
          STR s16, [x6], 4
          DUP s16, v16.s[1]
          STR s18, [x9], 4
          DUP s18, v18.s[1]
          STR s28, [x10], 4
          DUP s28, v28.s[1]
          STR s30, [x7], 4
          DUP s30, v30.s[1]

8:
        TBZ x1, 0, 9f
        $if INC:
          STR h30, [x7]
          STR h28, [x10]
          STR h18, [x9]
          STR h16, [x6]
        $else:
          STR h16, [x6]
          STR h18, [x9]
          STR h28, [x10]
          STR h30, [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
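
# Reference sketch: an approximate C-level view of what the kernel above computes,
# for orientation only. The names `scale`, `params_min`, and `params_max` are
# illustrative assumptions about the three params values loaded by LD3R into
# v4/v5/v6; they are not the literal struct field names.
#
#   for each block of 8 output columns (nc counted in halffloats):
#     acc[m][j] is initialized from the 8 bias halffloats in w (gemm variant)
#               or from the acc buffer                          (gemminc variant)
#     for (k = 0; k < kc / sizeof(__fp16); k++)
#       for (m = 0; m < 4; m++)
#         for (j = 0; j < 8; j++)
#           acc[m][j] += a[m][k] * w[k][j];            // FMLA by-element updates
#     out = acc[m][j] * scale;                         // FMUL by v4
#     out = max(out, params_min);                      // FMAX by v5
#     out = min(out, params_max);                      // FMIN by v6
#     store out to c (rows cm_stride apart), advance c by cn_stride,
#     rewind the a pointers by kc, and continue while nc remains.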