///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

// Syntax: GNU as, AArch64 (A64 + Advanced SIMD). Comments use `//`.

//------------------------------------------------------------------------------
// push_v_regs
//   Spills q8-q15 (4 x 32 bytes), then x8-x17 and x29/x30 (6 x 16 bytes) with
//   pre-decrement stores: 224 bytes in total, so sp keeps its 16-byte
//   alignment. Must be paired with pop_v_regs, which reloads in reverse order.
//   NOTE(review): under AAPCS64 only the low halves of v8-v15 are callee-saved
//   and x8-x17 are caller-saved; the routine below contains no call, so the
//   x8-x17 / x29 / x30 spills look unnecessary - confirm before trimming.
//------------------------------------------------------------------------------
.macro push_v_regs
    stp     q8,  q9,  [sp, #-32]!
    stp     q10, q11, [sp, #-32]!
    stp     q12, q13, [sp, #-32]!
    stp     q14, q15, [sp, #-32]!
    stp     x8,  x9,  [sp, #-16]!
    stp     x10, x11, [sp, #-16]!
    stp     x12, x13, [sp, #-16]!
    stp     x14, x15, [sp, #-16]!
    stp     x16, x17, [sp, #-16]!
    stp     x29, x30, [sp, #-16]!
.endm

//------------------------------------------------------------------------------
// pop_v_regs
//   Exact mirror of push_v_regs: post-increment loads in the reverse order,
//   leaving sp where it was before push_v_regs ran.
//------------------------------------------------------------------------------
.macro pop_v_regs
    ldp     x29, x30, [sp], #16
    ldp     x16, x17, [sp], #16
    ldp     x14, x15, [sp], #16
    ldp     x12, x13, [sp], #16
    ldp     x10, x11, [sp], #16
    ldp     x8,  x9,  [sp], #16
    ldp     q14, q15, [sp], #32
    ldp     q12, q13, [sp], #32
    ldp     q10, q11, [sp], #32
    ldp     q8,  q9,  [sp], #32
.endm

//------------------------------------------------------------------------------
// win_prev_mul dst, src
//   dst.4s = saturating-narrow-to-32( sign_extend(src.4h) * v4.4s )
//   The four 16-bit lanes of \src are widened to 32 bits, multiplied by the
//   four 32-bit lanes of v4 into 64-bit products, and each product is narrowed
//   back to 32 bits with signed saturation.
//   Clobbers: v27, v28, v29 (v28 ends up holding the same value as \dst).
//------------------------------------------------------------------------------
.macro win_prev_mul dst, src
    sshll   v27.4s, \src\().4h, #0
    smull   v28.2d, v27.2s, v4.2s
    smull2  v29.2d, v27.4s, v4.4s
    sqxtn   v28.2s, v28.2d
    sqxtn2  v28.4s, v29.2d
    mov     \dst\().16b, v28.16b
.endm

//------------------------------------------------------------------------------
// store_h4 src, ptr, step
//   Stores halfword lanes 0..3 of \src one at a time through \ptr, advancing
//   \ptr by the register \step (bytes) after every store.
//------------------------------------------------------------------------------
.macro store_h4 src, ptr, step
    st1     {\src\().h}[0], [\ptr], \step
    st1     {\src\().h}[1], [\ptr], \step
    st1     {\src\().h}[2], [\ptr], \step
    st1     {\src\().h}[3], [\ptr], \step
.endm

.text
.global ixheaacd_over_lap_add1_armv8

//------------------------------------------------------------------------------
// ixheaacd_over_lap_add1_armv8
//
// Register roles, as established by the code below. The C prototype is not
// visible in this file; the parenthesised names are presumed from the symbol
// name - TODO confirm against the C declaration.
//   x0  in : base of a 32-bit array (coefficients?); read 4 words at a time,
//            walking backwards from byte offset 8*x5 - 16
//   x1  in : base of a 32-bit array (previous/overlap samples?); read 4 words
//            at a time, walking forwards
//   x2  in : base of the 16-bit output array
//   x3  in : base of a 16-bit array (window?); read 8 halfwords at a time with
//            even/odd de-interleave, walking backwards from byte offset
//            4*x5 - 16
//   w4  in : shift amount, broadcast to v11 and used as the per-lane register
//            shift of SQSHL
//   x5  in : element count (size?); also the loop counter
//   x6  in : output stride in 16-bit elements (channel factor?)
//
// Working registers after set-up:
//   x10 = backward read pointer into the x0 array, x12 = -16 (its step)
//   x8  = backward read pointer into the x3 array (same -16 step)
//   x11 = x2 + 2*(x6*(x5-1)), written with stride x4 = -2*x6 bytes
//   x6  = x2 + 2*(x5*x6),     written with stride x9 = +2*x6 bytes
//   v10 = -0x2000 in every 32-bit lane. It is added to the term that is
//         subtracted, so the difference carries +0x2000, which becomes 0x8000
//         after the <<2, i.e. half of the 1<<16 removed by the final >>16.
//
// Per group of four lanes the code computes, for the descending and the
// ascending output stream respectively:
//   t   = (umull(lo16(c), w) >> 16) + smlal(hi16(c), w)   // c or sqneg(c)
//   t   = sqshl(t, v11)
//   u   = sqadd(win_prev_mul(w', v4), -0x2000)
//   out = (int16)( sqshl(sqsub(t, u), 2) >> 16 )
//
// Schedule: the prologue computes one group per stream, every loop iteration
// stores/computes two groups per stream, and the epilogue stores the pending
// group and computes/stores one more.
// NOTE(review): the loop body always runs at least once, so the routine
//   appears to require x5 >= 16 and a multiple of 8 - confirm with callers.
// NOTE(review): x5 and x6 are consumed as full 64-bit registers (lsl x10, x5 /
//   lsl x4, x6 / lsl x9, x6) before any extension; this relies on the caller
//   having cleared/extended the upper bits - confirm against the prototype.
// NOTE(review): the two element-count products are formed with 16-bit SMULL
//   lanes, so only the low 16 bits of x5, x5-1 and x6 take part in them.
//
// Clobbers: x4-x12, v0-v4, v6-v18, v26-v31, flags (v8-v15 and x8-x12 are
//   restored by pop_v_regs).
//------------------------------------------------------------------------------
ixheaacd_over_lap_add1_armv8:
    push_v_regs

    // ---- pointer set-up ---------------------------------------------------
    lsl     x10, x5, #1
    sub     x11, x10, #1                // x11 = 2*size - 1
    lsl     x10, x11, #2
    add     x10, x0, x10
    sub     x10, x10, #12               // x10 = x0 + 8*size - 16
    lsl     x8, x11, #1
    add     x8, x8, x3
    sub     x8, x8, #14                 // x8  = x3 + 4*size - 16
    mov     x12, #-16                   // backward step for x10 and x8
    dup     v11.8h, w4                  // shift amount in every halfword
    ld1     {v3.4s}, [x10], x12         // first four 32-bit inputs from x0
    mov     w7, #0x2000

    neg     w7, w7
    sqneg   v0.4s, v3.4s                // negated copy for the second stream
    dup     v10.4s, w7                  // v10 = -0x2000 per lane
    uzp1    v31.8h, v0.8h, v0.8h        // low  halfwords of -c
    uzp2    v30.8h, v0.8h, v0.8h        // high halfwords of -c
    rev64   v31.8h, v31.8h              // reverse lane order (data read backwards)
    rev64   v30.8h, v30.8h
    sub     x11, x5, #1
    uzp1    v7.8h, v3.8h, v3.8h         // low  halfwords of c
    uzp2    v6.8h, v3.8h, v3.8h         // high halfwords of c
    rev64   v7.8h, v7.8h
    rev64   v6.8h, v6.8h
    mov     v16.s[0], w6
    mov     v17.s[0], w11
    smull   v17.4s, v16.4h, v17.4h      // lane 0 = lo16(x6) * lo16(size - 1)
    mov     w11, v17.s[0]
    lsl     x11, x11, #1                // scale to bytes (16-bit elements)

    ld2     {v2.4h, v3.4h}, [x8], x12   // v2 = even, v3 = odd halfwords from x3
    add     x11, x11, x2                // x11 = descending output pointer
    rev64   v2.4h, v2.4h
    rev64   v3.4h, v3.4h
    lsl     x4, x6, #1
    neg     x4, x4                      // x4 = -2*x6 : descending stride (bytes)
    lsl     x9, x6, #1                  // x9 = +2*x6 : ascending stride (bytes)
    mov     v16.s[0], w5
    mov     v17.s[0], w6
    smull   v17.4s, v16.4h, v17.4h      // lane 0 = lo16(size) * lo16(x6)
    mov     w6, v17.s[0]
    lsl     w6, w6, #1
    add     x6, x6, x2                  // x6 = ascending output pointer

    // ---- prologue: first group of both streams ----------------------------
    umull   v15.4s, v7.4h, v2.4h
    ld1     {v4.4s}, [x1], #16          // four 32-bit inputs from x1
    ushr    v15.4s, v15.4s, #16

    smlal   v15.4s, v6.4h, v2.4h
    sqshl   v15.4s, v15.4s, v11.4s
    win_prev_mul v14, v3

    sqadd   v14.4s, v14.4s, v10.4s
    sqsub   v13.4s, v15.4s, v14.4s
    sqshl   v13.4s, v13.4s, #2
    sshr    v13.4s, v13.4s, #16
    uzp1    v26.8h, v13.8h, v13.8h      // v26 = pending descending outputs

    umull   v12.4s, v31.4h, v3.4h
    ushr    v12.4s, v12.4s, #16
    smlal   v12.4s, v30.4h, v3.4h
    sqshl   v12.4s, v12.4s, v11.4s
    ld1     {v3.4s}, [x10], x12         // next inputs from x0 (v3 reused)

    win_prev_mul v8, v2

    sqadd   v8.4s, v8.4s, v10.4s

    sqneg   v0.4s, v3.4s
    uzp1    v1.8h, v0.8h, v0.8h
    uzp2    v0.8h, v0.8h, v0.8h
    rev64   v1.8h, v1.8h
    rev64   v0.8h, v0.8h
    sqsub   v9.4s, v12.4s, v8.4s
    uzp1    v7.8h, v3.8h, v3.8h
    uzp2    v6.8h, v3.8h, v3.8h
    rev64   v7.8h, v7.8h
    rev64   v6.8h, v6.8h
    sqshl   v9.4s, v9.4s, #2
    ld2     {v2.4h, v3.4h}, [x8], x12
    sshr    v9.4s, v9.4s, #16
    rev64   v2.4h, v2.4h
    rev64   v3.4h, v3.4h
    uzp1    v18.8h, v9.8h, v9.8h        // v18 = pending ascending outputs

    ld1     {v4.4s}, [x1], #16
    // 32-bit write: clears the upper half of x5 before the 64-bit SUBS below.
    sub     w5, w5, #8

    // ---- main loop: two groups per stream per iteration --------------------
    // Stores of the pending results are interleaved with the arithmetic for
    // the next group.
.Lover_lap_add1_loop:

    st1     {v26.h}[0], [x11], x4
    umull   v15.4s, v7.4h, v2.4h
    st1     {v26.h}[1], [x11], x4
    umull   v12.4s, v1.4h, v3.4h
    st1     {v26.h}[2], [x11], x4
    ushr    v15.4s, v15.4s, #16
    st1     {v26.h}[3], [x11], x4
    ushr    v12.4s, v12.4s, #16
    st1     {v18.h}[0], [x6], x9
    smlal   v15.4s, v6.4h, v2.4h
    st1     {v18.h}[1], [x6], x9
    smlal   v12.4s, v0.4h, v3.4h
    st1     {v18.h}[2], [x6], x9
    sqshl   v15.4s, v15.4s, v11.4s
    st1     {v18.h}[3], [x6], x9
    sqshl   v12.4s, v12.4s, v11.4s
    ld1     {v6.4s}, [x10], x12         // next inputs from x0 land in v6 here

    win_prev_mul v14, v3

    win_prev_mul v8, v2

    ld2     {v2.4h, v3.4h}, [x8], x12

    sqneg   v0.4s, v6.4s

    ld1     {v4.4s}, [x1], #16

    sqadd   v14.4s, v14.4s, v10.4s
    uzp1    v1.8h, v0.8h, v0.8h
    uzp2    v0.8h, v0.8h, v0.8h
    rev64   v1.8h, v1.8h
    rev64   v0.8h, v0.8h
    sqadd   v8.4s, v8.4s, v10.4s
    uzp1    v7.8h, v6.8h, v6.8h
    uzp2    v6.8h, v6.8h, v6.8h
    rev64   v7.8h, v7.8h
    rev64   v6.8h, v6.8h
    sqsub   v13.4s, v15.4s, v14.4s
    rev64   v2.4h, v2.4h
    rev64   v3.4h, v3.4h
    sqsub   v9.4s, v12.4s, v8.4s
    sqshl   v13.4s, v13.4s, #2
    sqshl   v9.4s, v9.4s, #2
    umull   v15.4s, v7.4h, v2.4h
    sshr    v13.4s, v13.4s, #16
    uzp1    v26.8h, v13.8h, v13.8h
    sshr    v9.4s, v9.4s, #16
    st1     {v26.h}[0], [x11], x4
    umull   v12.4s, v1.4h, v3.4h
    uzp1    v18.8h, v9.8h, v9.8h
    ushr    v15.4s, v15.4s, #16
    st1     {v26.h}[1], [x11], x4
    smlal   v15.4s, v6.4h, v2.4h
    st1     {v26.h}[2], [x11], x4
    ushr    v12.4s, v12.4s, #16
    st1     {v26.h}[3], [x11], x4
    smlal   v12.4s, v0.4h, v3.4h
    st1     {v18.h}[0], [x6], x9
    sqshl   v15.4s, v15.4s, v11.4s
    st1     {v18.h}[1], [x6], x9
    sqshl   v12.4s, v12.4s, v11.4s
    st1     {v18.h}[2], [x6], x9

    win_prev_mul v14, v3

    st1     {v18.h}[3], [x6], x9


    win_prev_mul v8, v2

    ld1     {v3.4s}, [x10], x12
    sqadd   v14.4s, v14.4s, v10.4s

    sqneg   v0.4s, v3.4s
    uzp1    v1.8h, v0.8h, v0.8h
    uzp2    v0.8h, v0.8h, v0.8h
    rev64   v1.8h, v1.8h
    rev64   v0.8h, v0.8h
    sqsub   v13.4s, v15.4s, v14.4s
    uzp1    v7.8h, v3.8h, v3.8h
    uzp2    v6.8h, v3.8h, v3.8h
    rev64   v7.8h, v7.8h
    rev64   v6.8h, v6.8h
    sqadd   v8.4s, v8.4s, v10.4s
    ld2     {v2.4h, v3.4h}, [x8], x12
    sqsub   v9.4s, v12.4s, v8.4s
    rev64   v2.4h, v2.4h
    rev64   v3.4h, v3.4h
    sqshl   v13.4s, v13.4s, #2
    ld1     {v4.4s}, [x1], #16

    sqshl   v9.4s, v9.4s, #2
    sshr    v13.4s, v13.4s, #16
    subs    x5, x5, #8                  // sets the flags consumed by BGT below
    sshr    v9.4s, v9.4s, #16
    uzp1    v26.8h, v13.8h, v13.8h
    uzp1    v18.8h, v9.8h, v9.8h

    bgt     .Lover_lap_add1_loop

    // ---- epilogue: flush the pending group, compute and store the last ----
    st1     {v26.h}[0], [x11], x4
    umull   v15.4s, v7.4h, v2.4h
    st1     {v26.h}[1], [x11], x4
    umull   v12.4s, v1.4h, v3.4h
    st1     {v26.h}[2], [x11], x4
    ushr    v15.4s, v15.4s, #16
    st1     {v26.h}[3], [x11], x4
    ushr    v12.4s, v12.4s, #16

    st1     {v18.h}[0], [x6], x9
    smlal   v15.4s, v6.4h, v2.4h
    st1     {v18.h}[1], [x6], x9
    smlal   v12.4s, v0.4h, v3.4h
    st1     {v18.h}[2], [x6], x9
    sqshl   v15.4s, v15.4s, v11.4s
    st1     {v18.h}[3], [x6], x9
    sqshl   v12.4s, v12.4s, v11.4s


    win_prev_mul v14, v3

    win_prev_mul v8, v2

    sqadd   v14.4s, v14.4s, v10.4s
    sqadd   v8.4s, v8.4s, v10.4s
    sqsub   v13.4s, v15.4s, v14.4s
    sqsub   v9.4s, v12.4s, v8.4s
    sqshl   v13.4s, v13.4s, #2
    sqshl   v9.4s, v9.4s, #2
    sshr    v13.4s, v13.4s, #16
    sshr    v9.4s, v9.4s, #16
    uzp1    v26.8h, v13.8h, v13.8h

    uzp1    v18.8h, v9.8h, v9.8h


    store_h4 v26, x11, x4

    store_h4 v18, x6, x9
    pop_v_regs
    ret