// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/1x16-aarch64-neonfp16arith-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# Half-precision GEMM micro-kernel with caller-supplied initial accumulators:
# produces one output row, 16 columns per tile, consuming two half-precision
# elements (4 bytes) of A per main-loop iteration. Each tile is multiplied by
# a scale and clamped to [min, max] before it is stored.

# void xnn_f16_gemminc_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# NOTE(review): acc is declared above as const float*, but the code below reads
# it 32 bytes at a time into q16/q17 and uses those registers as 8 x 16-bit
# lanes each (.8h), i.e. as 16 half-precision values per tile - confirm the
# intended pointer type against the C declaration.

# x0 is overwritten with the running k counter (kc is kept intact in x2 so that
# the A pointer can be rewound after each tile).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Scale v4
# Clamp v5 (lower bound, applied with FMAX), v6 (upper bound, applied with FMIN)

# Accumulators
# v16, v17  primary set (seeded from acc)
# v18, v19  secondary set (seeded with zero, folded into v16/v17 at label 2)

BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

        # Load cn_stride, acc
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Load params values: three consecutive halfwords at [x8], each
        # replicated across all 8 lanes of v4, v5 and v6 respectively
        LD3R {v4.8h, v5.8h, v6.8h}, [x8]

        # Outer loop: one iteration per tile of up to 16 output columns
0:
        # Load initial accumulators (acc pointer advances by 32 bytes)
        LDP q16, q17, [x15], 32

        MOVI v18.8h, 0  // second set of C for pipelining FMLA
        MOVI v19.8h, 0

        # Is there at least 2 halffloats (4 bytes)
        SUBS x0, x2, 4  // k = kc - 4

        # kc < 4 bytes: skip the main loop and go straight to the remainder
        B.LO 3f

        # Main loop - 2 halffloats of A (4 bytes)
        # Each iteration also consumes 64 bytes of w. The first A element
        # (v0.h[0]) feeds v16/v17 and the second (v0.h[1]) feeds v18/v19.
1:
        LDR s0,  [x3], 4
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR q22, [x5], 16
        LDR q23, [x5], 16
        SUBS x0, x0, 4
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v17.8h, v21.8h, v0.h[0]
        FMLA v18.8h, v22.8h, v0.h[1]
        FMLA v19.8h, v23.8h, v0.h[1]
        # Repeat while the SUBS above did not borrow (FMLA leaves flags alone)
        B.HS 1b

        # Is there a remainder?- 1 halffloats of A (2 bytes)
        # k has gone negative here; bit 1 is set when 2 bytes of A are left
        # (k == -2) and clear when none are left (k == -4).
        TBNZ x0, 1, 3f

2:
        # Fold the second accumulator set into the first
        FADD v16.8h, v16.8h, v18.8h
        FADD v17.8h, v17.8h, v19.8h
        # nc -= 16. The flags set here are consumed by both B.LO and B.HI
        # below; no instruction in between writes the condition flags.
        SUBS x1, x1, 16

        # Scale and Clamp

        FMUL v16.8h, v16.8h, v4.8h
        FMUL v17.8h, v17.8h, v4.8h
        FMAX v16.8h, v16.8h, v5.8h
        FMAX v17.8h, v17.8h, v5.8h
        FMIN v16.8h, v16.8h, v6.8h
        FMIN v17.8h, v17.8h, v6.8h

        # Store full 1 x 16
        # Fewer than 16 columns were left: take the partial-store path
        B.LO 4f

        STP q16, q17, [x6]
        # Advance c0 by cn_stride to the next tile
        ADD x6, x6, x14

        SUB x3, x3, x2 // a0 -= kc

        # More columns remain (nc was greater than 16 before the SUBS)
        B.HI 0b

        RET

3:
        # Remainder- 1 halffloat of A (2 bytes)
        # Consumes 32 bytes of w and accumulates into v16/v17 only
        LDR q20, [x5], 16
        LDR q21, [x5], 16
        LDR h0, [x3], 2
        FMLA v16.8h, v20.8h, v0.h[0]
        FMLA v17.8h, v21.8h, v0.h[0]
        B 2b

        # Store odd channels
        # x1 holds nc - 16 here; subtracting 16 leaves bits 0-3 unchanged, so
        # they still encode the number of columns left. Each step stores
        # 8, 4, 2 or 1 halffloats and shifts the not-yet-stored lanes down to
        # the bottom of v16.
4:
        TBZ x1, 3, 5f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

5:
        TBZ x1, 2, 6f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

6:
        TBZ x1, 1, 7f
        STR s16, [x6], 4
        DUP s16, v16.s[1]

7:
        TBZ x1, 0, 8f
        STR h16, [x6]
8:
        RET

END_FUNCTION xnn_f16_gemminc_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif