// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

.syntax unified

// void xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64(
//     size_t mr,                                     r0
//     size_t nc,                                     r1
//     size_t kc,                                     r2 -> r5
//     const uint8_t*restrict a,                      r3
//     size_t a_stride,                    sp + 96 -> (r11)
//     const void*restrict w,              sp + 100 -> r9
//     uint8_t*restrict c,                 sp + 104 -> r6
//     size_t cm_stride,                   sp + 108 -> (r7)
//     size_t cn_stride,                   sp + 112 -> r11
//     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])  sp + 116 -> (r11)


// inner loop registers

// A0   r3  s0-s1  d0
// A1  r12  s2-s3  d1
// A2  r10  s4-s5  d2
// A3   r0  s6-s7  d3

// B    r9  s8,  s9, s10, s11  d4-d5
// B        s12, s13, s14, s15  d6-d7

// C0   r6  s16-s17  d8   s18-s19  d9
// C1   r4  s20-s21  d10  s22-s23  d11
// C2   r8  s24-s25  d12  s26-s27  d13
// C3   r7  s28-s29  d14  s30-s31  d15

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64
        .arm
#ifndef __APPLE__
        .arch armv6
        .fpu vfp
#endif
        // Push 96 bytes
        PUSH    {r4, r5, r6, r7, r8, r9, r10, r11}  // 32
        VPUSH   {d8-d15}                            // +64 = 96

        LDR     r11, [sp, 96]        // Load a_stride
        LDRD    r6, r7, [sp, 104]    // Load c and cm_stride

        // Clamp A and C pointers
        CMP     r0, 2                // if mr >= 2
        ADD     r12, r3, r11         //   a1 = a0 + a_stride
        ADD     r4, r6, r7           //   c1 = c0 + cm_stride
        MOVLO   r12, r3              //   a1
        MOVLO   r4, r6               //   c1

        LDR     r9, [sp, 100]        // Load w

                                     // if mr > 2
        ADD     r10, r12, r11        //   a2 = a1 + a_stride
        ADD     r8, r4, r7           //   c2 = c1 + cm_stride
        MOVLS   r10, r12             //   a2
        MOVLS   r8, r4               //   c2

        CMP     r0, 4                // if mr >= 4
        ADD     r0, r10, r11         //   a3 = a2 + a_stride
        ADD     r7, r8, r7           //   c3 = c2 + cm_stride
        LDR     r11, [sp, 112]       // Load cn_stride
        MOVLO   r0, r10              //   a3
        MOVLO   r7, r8               //   c3

0:
        # Load initial bias from w into accumulators
        VLDM    r9!, {d8-d9}         // Bias
        SUBS    r5, r2, 8
        VMOV.F64 d10, d8
        VMOV.F64 d12, d8
        VMOV.F64 d14, d8
        VMOV.F64 d11, d9
        VMOV.F64 d13, d9
        VMOV.F64 d15, d9
        BLO     3f                   // less than 2 channels?
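
        // The loop at 1: below unrolls two K values (8 bytes of each A row) per
        // iteration, with r5 holding the remaining bytes of kc and w pre-packed
        // as bias followed by B.  A rough, illustrative C sketch of the
        // accumulation (names a, b, c, k, m, n are assumptions, not taken from
        // this file):
        //
        //   for (k = 0; k + 2 <= kc / sizeof(float); k += 2)
        //     for (m = 0; m < 4; m++)
        //       for (n = 0; n < 4; n++)
        //         c[m][n] += a[m][k] * b[k][n] + a[m][k + 1] * b[k + 1][n];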

        // Main loop - 2 floats of A (8 bytes)
1:
        VLDM    r3!, {d0}            // A0
        VLDM    r9!, {d4-d5}         // B0
        VLDM    r12!, {d1}           // A1
        VLDM    r10!, {d2}           // A2
        VLDM    r0!, {d3}            // A3

        VMLA.F32 s16, s8, s0
        VMLA.F32 s17, s9, s0
        VMLA.F32 s20, s8, s2
        VMLA.F32 s21, s9, s2
        VMLA.F32 s24, s8, s4
        VMLA.F32 s25, s9, s4
        VMLA.F32 s28, s8, s6
        VMLA.F32 s29, s9, s6

        VLDM    r9!, {d6-d7}         // B1

        VMLA.F32 s18, s10, s0
        VMLA.F32 s19, s11, s0
        VMLA.F32 s22, s10, s2
        VMLA.F32 s23, s11, s2
        VMLA.F32 s26, s10, s4
        VMLA.F32 s27, s11, s4
        VMLA.F32 s30, s10, s6
        VMLA.F32 s31, s11, s6

        VMLA.F32 s16, s12, s1
        VMLA.F32 s17, s13, s1
        VMLA.F32 s20, s12, s3
        VMLA.F32 s21, s13, s3
        VMLA.F32 s24, s12, s5
        VMLA.F32 s25, s13, s5
        VMLA.F32 s28, s12, s7
        VMLA.F32 s29, s13, s7

        SUBS    r5, r5, 8            // k -= 8 bytes (2 floats)

        VMLA.F32 s18, s14, s1
        VMLA.F32 s19, s15, s1
        VMLA.F32 s22, s14, s3
        VMLA.F32 s23, s15, s3
        VMLA.F32 s26, s14, s5
        VMLA.F32 s27, s15, s5
        VMLA.F32 s30, s14, s7
        VMLA.F32 s31, s15, s7

        BHS     1b

        // Is there a remainder? - 1 float of A (4 bytes)
        TST     r5, 4
        BNE     3f

2:
        SUBS    r1, r1, 4            // nc -= 4
        BLO     4f                   // fewer than 4 columns left?

        // Store full 4 x 4
        VSTM    r6, {d8-d9}          // store C0
        SUB     r0, r0, r2           // a3 -= kc (rewind A)
        ADD     r6, r11              // c0 += cn_stride
        VSTM    r4, {d10-d11}        // store C1
        SUB     r10, r10, r2         // a2 -= kc
        ADD     r4, r11              // c1 += cn_stride
        VSTM    r8, {d12-d13}        // store C2
        SUB     r12, r12, r2         // a1 -= kc
        ADD     r8, r11              // c2 += cn_stride
        VSTM    r7, {d14-d15}        // store C3
        SUB     r3, r3, r2           // a0 -= kc
        ADD     r7, r11              // c3 += cn_stride
        BHI     0b                   // nc > 0? next 4 columns

        VPOP    {d8-d15}
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        BX      lr

3:
        // Remainder - 1 float of A (4 bytes)
        VLDM    r3!, {s0}            // A0
        VLDM    r9!, {d6-d7}         // B
        VLDM    r12!, {s1}           // A1
        VLDM    r10!, {s2}           // A2
        VLDM    r0!, {s3}            // A3

        VMLA.F32 s16, s12, s0
        VMLA.F32 s17, s13, s0
        VMLA.F32 s18, s14, s0
        VMLA.F32 s19, s15, s0

        VMLA.F32 s20, s12, s1
        VMLA.F32 s21, s13, s1
        VMLA.F32 s22, s14, s1
        VMLA.F32 s23, s15, s1

        VMLA.F32 s24, s12, s2
        VMLA.F32 s25, s13, s2
        VMLA.F32 s26, s14, s2
        VMLA.F32 s27, s15, s2

        VMLA.F32 s28, s12, s3
        VMLA.F32 s29, s13, s3
        VMLA.F32 s30, s14, s3
        VMLA.F32 s31, s15, s3

        B       2b

        // Store odd width
4:
        TST     r1, 2                // store 2 columns?
        BEQ     5f
        VSTM    r6!, {d8}
        VMOV.F32 s16, s18
        VSTM    r4!, {d10}
        VMOV.F32 s20, s22
        VSTM    r8!, {d12}
        VMOV.F32 s24, s26
        VSTM    r7!, {d14}
        VMOV.F32 s28, s30

5:
        TST     r1, 1                // store 1 column?
        BEQ     6f
        VSTR    s16, [r6]
        VSTR    s20, [r4]
        VSTR    s24, [r8]
        VSTR    s28, [r7]

6:
        VPOP    {d8-d15}
        POP     {r4, r5, r6, r7, r8, r9, r10, r11}
        BX      lr

END_FUNCTION xnn_f32_gemm_ukernel_4x4__aarch32_vfp_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif