// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# Summary
#   Computes one row of C, 8 floats per tile: the accumulators start from
#   8 floats read from acc, every float of A is multiplied by 8 floats read
#   from w and accumulated, and the result is clamped to [v4, v5] and stored.
#   kc is counted in bytes of A (32 bytes = 8 floats per main loop pass).
#   NOTE(review): kc is presumably a multiple of 4 bytes, since bits 0-1 of
#   the remaining count are never tested - confirm against the caller.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# Only x0-x8, x14, x15, v0, v1, v4, v5 and v16-v27 are written below, so
# nothing is saved or restored.

# Register usage
#   x0        k, bytes of A still to process in the current tile
#   x1        nc, output columns still to produce
#   x2        kc, kept intact to rewind a0 after every tile
#   x3        a0, advances through the row of A
#   x5        w, only ever post-incremented, including across tiles
#   x6        c0, advances by cn_stride after each full tile
#   x14       cn_stride
#   x15       acc, advances by 32 bytes per tile
#   x8        params
#   v0  v1    A values, 4 floats each (double buffered)
#   v20-v27   B values, 4 k steps of 8 floats
#   v16 v17   accumulators for lanes 0 and 2 of the A vector
#   v18 v19   accumulators for lanes 1 and 3 of the A vector

# A pointer
# x3  a0

# C pointer
# x6  c0

# Clamp v4 v5

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, acc
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Load min/max values
        # LD2R reads two consecutive floats from params and broadcasts the
        # first into every lane of v4 and the second into every lane of v5.
        # v4 is used as the lower bound (FMAX) and v5 as the upper bound (FMIN).
        LD2R {v4.4s, v5.4s}, [x8]
0:
        # Start of one tile of 8 output columns.
        # Load initial accumulators
        LDP q16, q17, [x15], 32

        MOVI v18.4s, 0 // second set of C for pipelining FMLA
        PRFM PLDL1KEEP, [x5]
        MOVI v19.4s, 0
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32 // k = kc - 32

        // Fewer than 8 floats of A: go straight to the remainder handling.
        B.LO 3f

        # Prologue
        # Read first block of 1 A and B: 4 floats of A and the matching
        # 4 x 8 floats of B (128 bytes of w).
        LDP q20, q21, [x5], 32
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        LDR q0, [x3], 16

        # Are there at least 8 more floats (32 bytes) of A?  Yes - do main loop,
        # otherwise only the epilogue runs.
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Each pass consumes the values preloaded in v0 and v20-v27 and
        # loads the values used by the next pass or by the epilogue.
1:
        # First block of 4. FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 96] // prefetch further ahead in w
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4. FMA for second 4, loads for 1st block of 4.
        FMLA v16.4s, v20.4s, v1.s[0]
        LDR q0, [x3], 16
        FMLA v17.4s, v21.4s, v1.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        SUBS x0, x0, 32
        LDP q26, q27, [x5], 32 // LDP leaves the flags from SUBS intact
        B.HS 1b

2:
        # Epilogue
        # Same as one main loop pass, except that the second block does not
        # load anything for a following pass.

        # First block of 4. FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDR q1, [x3], 16
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q20, q21, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        LDP q22, q23, [x5], 32
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        LDP q24, q25, [x5], 32
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]
        LDP q26, q27, [x5], 32

        # Second block of 4. no loads
        FMLA v16.4s, v20.4s, v1.s[0]
        FMLA v17.4s, v21.4s, v1.s[0]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v16.4s, v24.4s, v1.s[2]
        FMLA v17.4s, v25.4s, v1.s[2]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]

3:
        # Remainder dispatch.  x0 differs from kc by a multiple of 32, so
        # bits 4, 3 and 2 of x0 are the same as those of kc.
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBNZ x0, 4, 5f
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 8f

4:
        # Fold the second accumulator set into the first.  The SUBS in
        # between sets the flags consumed by B.LO and B.HI below; none of
        # the instructions in between modify the flags.
        FADD v16.4s, v16.4s, v18.4s
        SUBS x1, x1, 8
        FADD v17.4s, v17.4s, v19.4s

        # Clamp
        FMAX v16.4s, v16.4s, v4.4s
        FMAX v17.4s, v17.4s, v4.4s
        FMIN v16.4s, v16.4s, v5.4s
        FMIN v17.4s, v17.4s, v5.4s

        # Fewer than 8 columns were left: store a partial tile at 9
        B.LO 9f

        # Store full 1 x 8
        STP q16, q17, [x6]
        ADD x6, x6, x14

        SUB x3, x3, x2 // a0 -= kc

        // Columns remain (nc > 0 after the subtraction): next tile.
        B.HI 0b

        RET

5:
        # Remainder- 4 floats of A (16 bytes)
        LDP q20, q21, [x5], 32
        LDR q0, [x3], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v26.4s, v0.s[3]
        FMLA v19.4s, v27.4s, v0.s[3]

        // No 2 float remainder: skip to the 1 float test at 7.
        TBZ x0, 3, 7f
6:
        # Remainder- 2 floats of A (8 bytes)
        LDP q20, q21, [x5], 32
        LDR d0, [x3], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v22.4s, v0.s[1]
        FMLA v19.4s, v23.4s, v0.s[1]
7:
        // No 1 float remainder: the accumulation is complete.
        TBZ x0, 2, 4b
8:
        # Remainder- 1 float of A (4 bytes)
        LDP q20, q21, [x5], 32
        LDR s0, [x3], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        B 4b

        # Store odd channels
        # x1 holds nc - 8 here, whose low 3 bits equal those of nc.
        # Bit 2 selects a store of 4 floats, bit 1 of 2 floats and bit 0 of
        # 1 float.  After each store the values not yet written are moved
        # down to the low end of v16.
9:
        TBZ x1, 2, 10f
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

10:
        TBZ x1, 1, 11f
        STR d16, [x6], 8
        DUP d16, v16.d[1]

11:
        TBZ x1, 0, 12f
        STR s16, [x6]
12:
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif