// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Half-precision GEMM micro-kernel producing one row of C, 8 columns per pass.
# For every group of 8 output columns it seeds an accumulator, multiplies the
# row of A by the packed B panel read sequentially from w, then scales, clamps
# and stores the result.
#
# kc is consumed in bytes (4 halffloats of A = 8 bytes per main loop pass).
# nc is counted in output elements (8 per full tile).
# NOTE(review): the remainder paths assume kc is a nonzero multiple of 2 bytes
# - confirm against the callers.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel touches none of them and does not write to the stack.

# Integer register usage
# x0   k, remaining bytes of A for the current tile (biased by -8)
# x1   nc, remaining output columns
# x2   kc, kept intact to rewind a0 after each tile
# x8   params pointer
# x14  cn_stride
# x15  acc pointer (inc variant only)

# A pointer
# x3  a0

# C pointer
# x6  c0

# Vector register usage
# A0  v0
# B   v20 v21 v22 v23
# C   v16 v18
# Clamp v4 (scale), v5 (min), v6 (max)


BEGIN_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load params values
        # Three consecutive halffloats, each broadcast to all 8 lanes:
        # v4 = scale, v5 = min, v6 = max (see their use under Scale and Clamp)
        LD3R {v4.8h, v5.8h, v6.8h}, [x8]

        # Outer loop - one pass per tile of up to 8 output columns
0:
        $if INC:
          # Load initial accumulators
          LDR q16, [x15], 16
        $else:
          # Load initial bias from w into accumulators
          LDR q16, [x5], 16

        MOVI v18.8h, 0  // second set of C for pipelining FMLA

        # Is there at least 4 halffloats (8 bytes)
        SUBS x0, x2, 8  // k = kc - 8

        B.LO 3f  // kc < 8 bytes: only remainder work for this tile

        # Main loop - 4 halffloats of A (8 bytes)
        # Even and odd k steps accumulate into v16 and v18 respectively so
        # back-to-back FMLAs do not depend on each other.
1:
        LDR d0, [x3], 8
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        SUBS x0, x0, 8  // flags stay live across the FMLAs below
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v18.8h, v21.8h, v0.h[1]
        FMLA v16.8h, v22.8h, v0.h[2]
        FMLA v18.8h, v23.8h, v0.h[3]
        B.HS 1b  // no borrow: at least 8 more bytes of A remain

        # x0 has only had multiples of 8 subtracted from kc, so bits 2 and 1
        # of x0 still equal bits 2 and 1 of kc.

        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 4f
        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 5f

        # Reduce, scale, clamp and store the tile
2:
        FADD v16.8h, v16.8h, v18.8h  // merge the two partial accumulators
        SUBS x1, x1, 8  // nc -= 8; flags consumed by B.LO and B.HI below

        # Scale and Clamp
        FMUL v16.8h, v16.8h, v4.8h
        FMAX v16.8h, v16.8h, v5.8h
        FMIN v16.8h, v16.8h, v6.8h

        # Store full 1 x 8
        B.LO 6f  // fewer than 8 columns were left: partial store

        ST1 {v16.16b}, [x6], x14  // c0 += cn_stride
        SUB x3, x3, x2  // a0 -= kc

        # ST1 and SUB leave the flags from SUBS x1 untouched
        B.HI 0b  // columns remain: next tile
        RET

        # Entry for kc < 8 bytes. With bit 2 clear the only work left is a
        # single halffloat (see the NOTE about kc in the header).
3:
        TBZ x0, 2, 5f
4:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR s0, [x3], 4
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v18.8h, v21.8h, v0.h[1]
        TBZ x0, 1, 2b  // no odd halffloat left: finish the tile

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q20, [x5], 16
        FMLA v16.8h, v20.8h, v0.h[0]
        B 2b

        # Store odd channels
        # x1 is now (remaining columns - 8); its low three bits still equal
        # the remaining column count. Store 4, 2, then 1 halffloats as the
        # bits dictate, moving the upper lanes down after each partial store.
6:
        TBZ x1, 2, 7f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

7:
        TBZ x1, 1, 8f
        STR s16, [x6], 4
        DUP s16, v16.s[1]

8:
        TBZ x1, 0, 9f
        STR h16, [x6]
9:
        RET

END_FUNCTION xnn_f16_gemm${"inc" if INC else ""}_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif