// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
  // NOTE(review): the prototype comment types acc as float, but the code
  // below loads it as packed 16-bit lanes (8h) - confirm against the C
  // declaration.
$else:
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

// Overview (derived from the code below):
//   Half-precision GEMM microkernel that produces a tile of up to 4 rows by
//   16 columns per pass of the outer loop (label 0). Each pass seeds eight
//   accumulator registers, walks kc bytes of every A row against the data
//   streamed from w, multiplies the sums by the first params value, bounds
//   them with the second (FMAX) and third (FMIN) params values, and stores
//   the tile through the four C row pointers.
//
// Units, as the code uses them:
//   kc          bytes of one A row: 4 bytes per main-loop iteration plus an
//               optional 2-byte tail
//   nc          output columns, counted in 16-bit elements
//   a_stride, cm_stride, cn_stride
//               byte offsets added directly to pointers
//
// Assumes kc is a non-zero multiple of 2 bytes - TODO confirm against the
// caller; the tail test after the main loop relies on it.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// None of those are touched: the kernel uses only x0-x12, x14, x15, v0-v6,
// v16-v23 and v28-v31, and it reads from the stack without writing to it.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

// Vector registers
//   v0  v1  v2  v3      A elements for rows 0..3
//   v20 v21 v22 v23     data loaded from w inside the k loop
//   v16 v17             accumulators, row 0 (columns 0-7, 8-15)
//   v18 v19             accumulators, row 1
//   v28 v29             accumulators, row 2
//   v30 v31             accumulators, row 3

# Scale v4, lower bound v5 (FMAX), upper bound v6 (FMIN)

BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load params values
        // LD3R reads three consecutive 16-bit values and broadcasts each one
        // across all 8 lanes of v4, v5 and v6 respectively.
        LD3R {v4.8h, v5.8h, v6.8h}, [x8]

        # Clamp A and C pointers
        // A row at index >= mr reuses the pointers of the row before it, so
        // the kernel can always process 4 rows without touching memory that
        // belongs to a row the caller did not ask for.
        // ADD does not set flags, so the single CMP x0, 2 feeds both the LO
        // selects (row 1) and the LS selects (row 2).
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    // a1 = a0
        CSEL x9, x6, x9, LO      // c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x12, x11, x12, LS   // a2 = a1
        CSEL x10, x9, x10, LS    // c2 = c1

        // From here on x4 and x7 stop being strides and become the row 3
        // pointers.
        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     // a3 = a2
        CSEL x7, x10, x7, LO     // c3 = c2

        // Outer loop: one pass per block of up to 16 output columns.
0:
        $if INC:
          # Load initial accumulators
          // 4 rows x 32 bytes, read sequentially from acc (x15 advances).
          LDP q16, q17, [x15], 32
          LDP q18, q19, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          // One 32-byte bias vector is read from w and copied to all 4 rows.
          LDP q16, q17, [x5], 32
          MOV v18.16b, v16.16b
          MOV v19.16b, v17.16b
          MOV v28.16b, v16.16b
          MOV v29.16b, v17.16b
          MOV v30.16b, v16.16b
          MOV v31.16b, v17.16b

        # Is there at least 2 halffloats (4 bytes)?
        // x0 is reused as the k counter; mr is no longer needed.
        SUBS x0, x2, 4  // k = kc - 4
        B.LO 3f

        # Main loop - 2 halffloats of A (4 bytes)
        // Per iteration: 4 bytes from each A row and 64 bytes from w
        // (v20/v21 pair with element 0 of each row, v22/v23 with element 1).
1:
        LDR s0, [x3], 4
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        SUBS x0, x0, 4           // flags are consumed by B.HS at the loop end
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v17.8h, v21.8h, v0.h[0]
        FMLA v18.8h, v20.8h, v1.h[0]
        FMLA v19.8h, v21.8h, v1.h[0]
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        FMLA v28.8h, v20.8h, v2.h[0]
        FMLA v29.8h, v21.8h, v2.h[0]
        FMLA v30.8h, v20.8h, v3.h[0]
        FMLA v31.8h, v21.8h, v3.h[0]
        FMLA v16.8h, v22.8h, v0.h[1]
        FMLA v17.8h, v23.8h, v0.h[1]
        FMLA v18.8h, v22.8h, v1.h[1]
        FMLA v19.8h, v23.8h, v1.h[1]
        FMLA v28.8h, v22.8h, v2.h[1]
        FMLA v29.8h, v23.8h, v2.h[1]
        FMLA v30.8h, v22.8h, v3.h[1]
        FMLA v31.8h, v23.8h, v3.h[1]
        B.HS 1b                  // loop while the SUBS above did not borrow

        # Is there a remainder?- 1 halffloat of A (2 bytes)
        // With kc a multiple of 2, x0 is now -4 (nothing left, bit 1 clear)
        // or -2 (one halffloat left, bit 1 set).
        TBNZ x0, 1, 3f

2:
        # Scale and Clamp
        FMUL v16.8h, v16.8h, v4.8h
        // Flags from this SUBS stay live until B.LO 4f and B.HI 0b below;
        // no instruction in between writes the flags.
        SUBS x1, x1, 16
        FMUL v17.8h, v17.8h, v4.8h
        FMUL v18.8h, v18.8h, v4.8h
        FMUL v19.8h, v19.8h, v4.8h
        FMUL v28.8h, v28.8h, v4.8h
        FMUL v29.8h, v29.8h, v4.8h
        FMUL v30.8h, v30.8h, v4.8h
        FMUL v31.8h, v31.8h, v4.8h
        FMAX v16.8h, v16.8h, v5.8h
        FMAX v17.8h, v17.8h, v5.8h
        FMAX v18.8h, v18.8h, v5.8h
        FMAX v19.8h, v19.8h, v5.8h
        FMAX v28.8h, v28.8h, v5.8h
        FMAX v29.8h, v29.8h, v5.8h
        FMAX v30.8h, v30.8h, v5.8h
        FMAX v31.8h, v31.8h, v5.8h
        FMIN v16.8h, v16.8h, v6.8h
        FMIN v17.8h, v17.8h, v6.8h
        FMIN v18.8h, v18.8h, v6.8h
        FMIN v19.8h, v19.8h, v6.8h
        FMIN v28.8h, v28.8h, v6.8h
        FMIN v29.8h, v29.8h, v6.8h
        FMIN v30.8h, v30.8h, v6.8h
        FMIN v31.8h, v31.8h, v6.8h

        # Store full 4 x 16
        B.LO 4f                  // fewer than 16 columns were left

        // Each store advances its C pointer by cn_stride; each A pointer is
        // rewound by kc so the next column block rereads the same rows.
        // x5 (w) is not rewound and keeps streaming forward.
        $if INC:
          // Rows are stored from row 3 down to row 0 in this variant.
          // NOTE(review): presumably so that, when rows alias because
          // mr < 4, the lowest row is written last - confirm.
          ST1 {v30.16b, v31.16b}, [x7], x14
          SUB x3, x3, x2         // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x10], x14
          SUB x11, x11, x2       // a1 -= kc
          ST1 {v18.16b, v19.16b}, [x9], x14
          SUB x12, x12, x2       // a2 -= kc
          ST1 {v16.16b, v17.16b}, [x6], x14
          SUB x4, x4, x2         // a3 -= kc
        $else:
          ST1 {v16.16b, v17.16b}, [x6], x14
          SUB x3, x3, x2         // a0 -= kc
          ST1 {v18.16b, v19.16b}, [x9], x14
          SUB x11, x11, x2       // a1 -= kc
          ST1 {v28.16b, v29.16b}, [x10], x14
          SUB x12, x12, x2       // a2 -= kc
          ST1 {v30.16b, v31.16b}, [x7], x14
          SUB x4, x4, x2         // a3 -= kc

        B.HI 0b                  // more columns remain (nc was > 16)

        RET

        # Remainder- 1 halffloat of A (2 bytes)
        // Consumes 2 bytes from each A row and 32 bytes from w, then rejoins
        // the scale/clamp/store path at label 2.
3:
        LDR h0, [x3], 2
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR h1, [x11], 2
        LDR h2, [x12], 2
        LDR h3, [x4], 2
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v17.8h, v21.8h, v0.h[0]
        FMLA v18.8h, v20.8h, v1.h[0]
        FMLA v19.8h, v21.8h, v1.h[0]
        FMLA v28.8h, v20.8h, v2.h[0]
        FMLA v29.8h, v21.8h, v2.h[0]
        FMLA v30.8h, v20.8h, v3.h[0]
        FMLA v31.8h, v21.8h, v3.h[0]
        B 2b

        # Store odd width
        // x1 holds (remaining columns - 16); its low 4 bits equal those of the
        // remaining column count. Bits 3, 2, 1, 0 select stores of 8, 4, 2
        // and 1 columns. After each partial store the not-yet-stored data is
        // moved down to the low end of the first register of each row.
4:
        TBZ x1, 3, 5f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
        $else:
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

        // 4 columns (8 bytes), then move the upper 64 bits down.
5:
        TBZ x1, 2, 6f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d16, [x6], 8
          DUP d16, v16.d[1]
        $else:
          STR d16, [x6], 8
          DUP d16, v16.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

        // 2 columns (4 bytes), then move the next 32 bits down.
6:
        TBZ x1, 1, 7f
        $if INC:
          STR s30, [x7], 4
          DUP s30, v30.s[1]
          STR s28, [x10], 4
          DUP s28, v28.s[1]
          STR s18, [x9], 4
          DUP s18, v18.s[1]
          STR s16, [x6], 4
          DUP s16, v16.s[1]
        $else:
          STR s16, [x6], 4
          DUP s16, v16.s[1]
          STR s18, [x9], 4
          DUP s18, v18.s[1]
          STR s28, [x10], 4
          DUP s28, v28.s[1]
          STR s30, [x7], 4
          DUP s30, v30.s[1]

        // Last single column (2 bytes); no pointer update is needed.
7:
        TBZ x1, 0, 8f
        $if INC:
          STR h30, [x7]
          STR h28, [x10]
          STR h18, [x9]
          STR h16, [x6]
        $else:
          STR h16, [x6]
          STR h18, [x9]
          STR h28, [x10]
          STR h30, [x7]
8:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif