// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This routine makes no calls and never writes to the stack. It touches only
# x0-x12, x14, x15, v0-v5 and v16-v31, so nothing is saved or restored.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# Other scalar registers
# x0  mr on entry, then reused as the k byte counter inside each column block
# x1  nc, decremented by 8 per column block
# x2  kc in bytes (also used to rewind the A pointers)
# x5  w, post-incremented by every weight load and never rewound
# x14 cn_stride
# x8  params pointer

# Vector register usage
# A0     v0
# A1     v1
# A2     v2
# A3     v3
# B      v20 v21 v22 v23 v24 v25 v26 v27
# C row0 v16 v17
# C row1 v18 v19
# C row2 v28 v29
# C row3 v30 v31
# Clamp  v4 (operand of FMAX) v5 (operand of FMIN)

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_ld128

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load min/max values
        # LD2R broadcasts the first float at params into v4 and the second into v5.
        LD2R {v4.4s, v5.4s}, [x8]

        # Clamp A and C pointers
        # Rows at index mr and above alias the previous row, so the kernel can
        # always process four rows without reading or writing past row mr - 1.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

        # Top of the per-column-block loop: one pass produces a 4 x 8 tile of C.
0:
        $if INC:
          # Load initial accumulators
          LDP q16, q17, [x15], 32
          LDP q18, q19, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
        $else:
          # Load initial bias from w into accumulators
          # The same 8 bias floats seed all four rows.
          LDP q16, q17, [x5], 32
          MOV v18.16b, v16.16b
          MOV v19.16b, v17.16b
          MOV v28.16b, v16.16b
          MOV v29.16b, v17.16b
          MOV v30.16b, v16.16b
          MOV v31.16b, v17.16b

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # Each pass consumes 4 floats from every A row and 4 x 8 floats of weights.
1:
        LDR q0, [x3], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x11], 16
        LDR q2, [x12], 16
        LDR q3, [x4], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        LDP q24, q25, [x5], 32
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q26, q27, [x5], 32
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        SUBS x0, x0, 16          // k -= 16; FMLA leaves the flags alone
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]
        B.HS 1b                  // loop while k did not borrow

        # x0 is now negative, but its low four bits still equal kc mod 16.
        TST x0, 15
        B.NE 3f

2:
        # Clamp
        FMAX v16.4s, v16.4s, v4.4s
        SUBS x1, x1, 8           // nc -= 8; flags feed B.LO and B.HI below
        FMAX v17.4s, v17.4s, v4.4s
        FMAX v18.4s, v18.4s, v4.4s
        FMAX v19.4s, v19.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s
        FMIN v18.4s, v18.4s, v5.4s
        FMIN v19.4s, v19.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        # Fewer than 8 columns were left: take the partial-width store path.
        B.LO 5f

        # Each ST1 post-increments its C pointer by cn_stride. The interleaved
        # SUBs rewind the A pointers by kc for the next column block.
        $if INC:
          # NOTE(review): the INC variant stores row 3 first and row 0 last,
          # presumably so that when clamped C pointers alias, the lowest row
          # (whose accumulators came from distinct acc data) is written last.
          ST1 {v30.16b, v31.16b},  [x7], x14
          SUB  x3,  x3, x2         // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x10], x14
          SUB x11, x11, x2         // a1 -= kc
          ST1 {v18.16b, v19.16b},  [x9], x14
          SUB x12, x12, x2         // a2 -= kc
          ST1 {v16.16b, v17.16b},  [x6], x14
          SUB  x4,  x4, x2         // a3 -= kc
        $else:
          ST1 {v16.16b, v17.16b},  [x6], x14
          SUB  x3,  x3, x2         // a0 -= kc
          ST1 {v18.16b, v19.16b},  [x9], x14
          SUB x11, x11, x2         // a1 -= kc
          ST1 {v28.16b, v29.16b}, [x10], x14
          SUB x12, x12, x2         // a2 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x14
          SUB  x4,  x4, x2         // a3 -= kc

        B.HI 0b                  // more than 8 columns were left: next block
        RET

        # Remainder handling. Bit 3 of x0 selects a 2-float step and bit 2 a
        # 1-float step. Bits 0-1 are never examined.
        # NOTE(review): this assumes kc is a multiple of 4 bytes - confirm.
3:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 4f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x11], 8
        LDR d2, [x12], 8
        LDR d3, [x4], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q22, q23, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 2b

        # Remainder- 1 float of A (4 bytes)
4:
        LDR s0, [x3], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x11], 4
        LDR s2, [x12], 4
        LDR s3, [x4], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        B 2b


        # Store odd width
        # x1 holds nc - 8 here, so its low three bits equal those of the
        # remaining column count. Bit 2 stores 4 floats per row, bit 1 stores
        # 2 and bit 0 stores 1. After each partial store the unstored lanes
        # are moved down to the bottom of the row's first register.
5:
        TBZ x1, 2, 6f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
        $else:
          STR q16, [x6], 16
          MOV v16.16b, v17.16b
          STR q18, [x9], 16
          MOV v18.16b, v19.16b
          STR q28, [x10], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

6:
        TBZ x1, 1, 7f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d16, [x6], 8
          DUP d16, v16.d[1]
        $else:
          STR d16, [x6], 8
          DUP d16, v16.d[1]
          STR d18, [x9], 8
          DUP d18, v18.d[1]
          STR d28, [x10], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

7:
        TBZ x1, 0, 8f
        $if INC:
          STR s30, [x7]
          STR s28, [x10]
          STR s18, [x9]
          STR s16, [x6]
        $else:
          STR s16, [x6]
          STR s18, [x9]
          STR s28, [x10]
          STR s30, [x7]

8:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif