; libFLAC - Free Lossless Audio Codec library
; Copyright (C) 2004,2005,2006,2007 Josh Coalson
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;
; - Redistributions of source code must retain the above copyright
; notice, this list of conditions and the following disclaimer.
;
; - Redistributions in binary form must reproduce the above copyright
; notice, this list of conditions and the following disclaimer in the
; documentation and/or other materials provided with the distribution.
;
; - Neither the name of the Xiph.org Foundation nor the names of its
; contributors may be used to endorse or promote products derived from
; this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.text
	.align 2
.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16

.globl _FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8

_FLAC__lpc_restore_signal_asm_ppc_altivec_16:
;	r3: residual[]
;	r4: data_len
;	r5: qlp_coeff[]
;	r6: order
;	r7: lp_quantization
;	r8: data[]

; see src/libFLAC/lpc.c:FLAC__lpc_restore_signal()
; (a plain-C sketch of the equivalent scalar loop appears after this routine)
; this is a PowerPC/Altivec assembly version which requires bps<=16 (or
; actually bps<=15 for mid-side coding, since that uses an extra bit)

; it should be fast; the inner loop is unrolled (it takes no more than
; 3*(order/4) instructions, all of which are arithmetic), and all of the
; coefficients and all relevant history stay in registers, so the outer loop
; has only one load from memory (the residual)

; I have not yet run this through simg4, so there may be some avoidable stalls,
; and there may be a somewhat more clever way to do the outer loop

; the branch mechanism may prevent dynamic loading; I still need to examine
; this issue, and there may be a more elegant method

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31		; for quadword-aligned stack data

	slwi r6,r6,2		; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8		; r4 = data+data_len

	mfspr r0,256		; cache old vrsave
	addis r31,0,hi16(0xfffffc00)
	ori r31,r31,lo16(0xfffffc00)
	mtspr 256,r31		; declare VRs in vrsave

	cmplw cr0,r8,r4		; i<data_len
	bc 4,0,L1400

	; load coefficients into v0-v7 and initial history into v8-v15
	li r31,0xf
	and r31,r8,r31		; r31: data%4
	li r11,16
	subf r31,r31,r11	; r31: 4-(data%4)
	slwi r31,r31,3		; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v18,-1
	vsro v18,v18,v0		; v18: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2	; v0: reversal permutation vector
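
; the block below loads the (realigned, reversed) coefficients into v0-v7 and
; the initial history into v8-v15, four 32-bit words per vector, one vector at
; a time until order words have been consumed; each early exit also leaves the
; address of the matching entry point into the unrolled loop (L1300-L1307) in
; r31, which L1199 moves to the count register for the bcctr dispatch at L1200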

	add r10,r5,r6
	lvsl v17,0,r5		; v17: coefficient alignment permutation vector
	vperm v17,v17,v17,v0	; v17: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v16,0,r11		; v16: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v17
	lvx v8,0,r11
	addi r11,r11,-16
	lvx v9,0,r11
	vperm v8,v9,v8,v16
	cmplw cr0,r5,r10
	bc 12,0,L1101
	vand v0,v0,v18
	addis r31,0,hi16(L1307)
	ori r31,r31,lo16(L1307)
	b L1199

L1101:
	addi r5,r5,16
	lvx v2,0,r5
	vperm v1,v1,v2,v17
	addi r11,r11,-16
	lvx v10,0,r11
	vperm v9,v10,v9,v16
	cmplw cr0,r5,r10
	bc 12,0,L1102
	vand v1,v1,v18
	addis r31,0,hi16(L1306)
	ori r31,r31,lo16(L1306)
	b L1199

L1102:
	addi r5,r5,16
	lvx v3,0,r5
	vperm v2,v2,v3,v17
	addi r11,r11,-16
	lvx v11,0,r11
	vperm v10,v11,v10,v16
	cmplw cr0,r5,r10
	bc 12,0,L1103
	vand v2,v2,v18
	addis r31,0,hi16(L1305)
	ori r31,r31,lo16(L1305)
	b L1199

L1103:
	addi r5,r5,16
	lvx v4,0,r5
	vperm v3,v3,v4,v17
	addi r11,r11,-16
	lvx v12,0,r11
	vperm v11,v12,v11,v16
	cmplw cr0,r5,r10
	bc 12,0,L1104
	vand v3,v3,v18
	addis r31,0,hi16(L1304)
	ori r31,r31,lo16(L1304)
	b L1199

L1104:
	addi r5,r5,16
	lvx v5,0,r5
	vperm v4,v4,v5,v17
	addi r11,r11,-16
	lvx v13,0,r11
	vperm v12,v13,v12,v16
	cmplw cr0,r5,r10
	bc 12,0,L1105
	vand v4,v4,v18
	addis r31,0,hi16(L1303)
	ori r31,r31,lo16(L1303)
	b L1199

L1105:
	addi r5,r5,16
	lvx v6,0,r5
	vperm v5,v5,v6,v17
	addi r11,r11,-16
	lvx v14,0,r11
	vperm v13,v14,v13,v16
	cmplw cr0,r5,r10
	bc 12,0,L1106
	vand v5,v5,v18
	addis r31,0,hi16(L1302)
	ori r31,r31,lo16(L1302)
	b L1199

L1106:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v6,v6,v7,v17
	addi r11,r11,-16
	lvx v15,0,r11
	vperm v14,v15,v14,v16
	cmplw cr0,r5,r10
	bc 12,0,L1107
	vand v6,v6,v18
	addis r31,0,hi16(L1301)
	ori r31,r31,lo16(L1301)
	b L1199

L1107:
	addi r5,r5,16
	lvx v19,0,r5
	vperm v7,v7,v19,v17
	addi r11,r11,-16
	lvx v19,0,r11
	vperm v15,v19,v15,v16
	vand v7,v7,v18
	addis r31,0,hi16(L1300)
	ori r31,r31,lo16(L1300)

L1199:
	mtctr r31

	; set up invariant vectors
	vspltish v16,0		; v16: zero vector

	li r10,-12
	lvsr v17,r10,r8		; v17: result shift vector
	lvsl v18,r10,r3		; v18: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v19,r10,r9	; v19: lp_quantization vector

L1200:
	vmulosh v20,v0,v8	; v20: sum vector
	bcctr 20,0

L1300:
	vmulosh v21,v7,v15
	vsldoi v15,v15,v14,4	; increment history
	vaddsws v20,v20,v21

L1301:
	vmulosh v21,v6,v14
	vsldoi v14,v14,v13,4
	vaddsws v20,v20,v21

L1302:
	vmulosh v21,v5,v13
	vsldoi v13,v13,v12,4
	vaddsws v20,v20,v21

L1303:
	vmulosh v21,v4,v12
	vsldoi v12,v12,v11,4
	vaddsws v20,v20,v21

L1304:
	vmulosh v21,v3,v11
	vsldoi v11,v11,v10,4
	vaddsws v20,v20,v21

L1305:
	vmulosh v21,v2,v10
	vsldoi v10,v10,v9,4
	vaddsws v20,v20,v21

L1306:
	vmulosh v21,v1,v9
	vsldoi v9,v9,v8,4
	vaddsws v20,v20,v21

L1307:
	vsumsws v20,v20,v16	; v20[3]: sum
	vsraw v20,v20,v19	; v20[3]: sum >> lp_quantization

	lvewx v21,0,r3		; v21[n]: *residual
	vperm v21,v21,v21,v18	; v21[3]: *residual
	vaddsws v20,v21,v20	; v20[3]: *residual + (sum >> lp_quantization)
	vsldoi v18,v18,v18,4	; increment shift vector

	vperm v21,v20,v20,v17	; v21[n]: shift for storage
	vsldoi v17,v17,v17,12	; increment shift vector
	stvewx v21,0,r8

	vsldoi v20,v20,v20,12
	vsldoi v8,v8,v20,4	; insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4		; i<data_len
	bc 12,0,L1200

L1400:
	mtspr 256,r0		; restore old vrsave
	lmw r31,-4(r1)
	blr
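
; for reference, a plain-C sketch of the scalar loop that this routine (and the
; order8 variant below) implements; this is a simplification of what
; src/libFLAC/lpc.c:FLAC__lpc_restore_signal() does, not its exact source;
; FLAC__int32 is the 32-bit signed type from FLAC/ordinals.h, and data[] is
; assumed to be preceded by at least order warm-up samples:
;
;	void lpc_restore_signal_sketch(const FLAC__int32 residual[], unsigned data_len,
;	                               const FLAC__int32 qlp_coeff[], unsigned order,
;	                               int lp_quantization, FLAC__int32 data[])
;	{
;		unsigned i, j;
;		FLAC__int32 sum;
;		const FLAC__int32 *history;
;
;		for(i = 0; i < data_len; i++) {
;			sum = 0;
;			history = data;			/* points just past the newest restored sample */
;			for(j = 0; j < order; j++)	/* dot product with the previous order samples */
;				sum += qlp_coeff[j] * (*(--history));
;			*(data++) = *(residual++) + (sum >> lp_quantization);	/* restore this sample */
;		}
;	}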
_FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8:
;	r3: residual[]
;	r4: data_len
;	r5: qlp_coeff[]
;	r6: order
;	r7: lp_quantization
;	r8: data[]

; see _FLAC__lpc_restore_signal_asm_ppc_altivec_16() above
; this version assumes order<=8; it uses fewer vector registers, which should
; save time in context switches, and has less code, which may improve
; instruction caching

	stmw r31,-4(r1)

	addi r9,r1,-28
	li r31,0xf
	andc r9,r9,r31		; for quadword-aligned stack data

	slwi r6,r6,2		; adjust for word size
	slwi r4,r4,2
	add r4,r4,r8		; r4 = data+data_len

	mfspr r0,256		; cache old vrsave
	addis r31,0,hi16(0xffc00000)
	ori r31,r31,lo16(0xffc00000)
	mtspr 256,r31		; declare VRs in vrsave

	cmplw cr0,r8,r4		; i<data_len
	bc 4,0,L2400

	; load coefficients into v0-v1 and initial history into v2-v3
	li r31,0xf
	and r31,r8,r31		; r31: data%4
	li r11,16
	subf r31,r31,r11	; r31: 4-(data%4)
	slwi r31,r31,3		; convert to bits for vsro
	li r10,-4
	stw r31,-4(r9)
	lvewx v0,r10,r9
	vspltisb v6,-1
	vsro v6,v6,v0		; v6: mask vector

	li r31,0x8
	lvsl v0,0,r31
	vsldoi v0,v0,v0,12
	li r31,0xc
	lvsl v1,0,r31
	vspltisb v2,0
	vspltisb v3,-1
	vmrglw v2,v2,v3
	vsel v0,v1,v0,v2	; v0: reversal permutation vector

	add r10,r5,r6
	lvsl v5,0,r5		; v5: coefficient alignment permutation vector
	vperm v5,v5,v5,v0	; v5: reversal coefficient alignment permutation vector

	mr r11,r8
	lvsl v4,0,r11		; v4: history alignment permutation vector

	lvx v0,0,r5
	addi r5,r5,16
	lvx v1,0,r5
	vperm v0,v0,v1,v5
	lvx v2,0,r11
	addi r11,r11,-16
	lvx v3,0,r11
	vperm v2,v3,v2,v4
	cmplw cr0,r5,r10
	bc 12,0,L2101
	vand v0,v0,v6
	addis r31,0,hi16(L2301)
	ori r31,r31,lo16(L2301)
	b L2199

L2101:
	addi r5,r5,16
	lvx v7,0,r5
	vperm v1,v1,v7,v5
	addi r11,r11,-16
	lvx v7,0,r11
	vperm v3,v7,v3,v4
	vand v1,v1,v6
	addis r31,0,hi16(L2300)
	ori r31,r31,lo16(L2300)

L2199:
	mtctr r31

	; set up invariant vectors
	vspltish v4,0		; v4: zero vector

	li r10,-12
	lvsr v5,r10,r8		; v5: result shift vector
	lvsl v6,r10,r3		; v6: residual shift back vector

	li r10,-4
	stw r7,-4(r9)
	lvewx v7,r10,r9		; v7: lp_quantization vector
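
; main loop: one restored sample per pass, structured exactly like L1200/L13xx
; above: vmulosh/vaddsws accumulate the coefficient*history products, vsumsws
; folds them into a single sum, vsraw applies the lp_quantization shift, the
; residual word is added, and the result is stored through r8 and shifted into
; the history vector for the next sample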
L2200:
	vmulosh v8,v0,v2	; v8: sum vector
	bcctr 20,0

L2300:
	vmulosh v9,v1,v3
	vsldoi v3,v3,v2,4
	vaddsws v8,v8,v9

L2301:
	vsumsws v8,v8,v4	; v8[3]: sum
	vsraw v8,v8,v7		; v8[3]: sum >> lp_quantization

	lvewx v9,0,r3		; v9[n]: *residual
	vperm v9,v9,v9,v6	; v9[3]: *residual
	vaddsws v8,v9,v8	; v8[3]: *residual + (sum >> lp_quantization)
	vsldoi v6,v6,v6,4	; increment shift vector

	vperm v9,v8,v8,v5	; v9[n]: shift for storage
	vsldoi v5,v5,v5,12	; increment shift vector
	stvewx v9,0,r8

	vsldoi v8,v8,v8,12
	vsldoi v2,v2,v8,4	; insert value onto history

	addi r3,r3,4
	addi r8,r8,4
	cmplw cr0,r8,r4		; i<data_len
	bc 12,0,L2200

L2400:
	mtspr 256,r0		; restore old vrsave
	lmw r31,-4(r1)
	blr
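
; caller's view (hypothetical sketch): the prototypes below are assumed to
; mirror FLAC__lpc_restore_signal() in the libFLAC headers (the leading
; underscore on the labels above is just the Darwin C symbol prefix), and the
; order<=8 / bps<=16 conditions restate the comments above; how a decoder
; actually selects these routines is outside the scope of this file:
;
;	void FLAC__lpc_restore_signal_asm_ppc_altivec_16(
;		const FLAC__int32 residual[], unsigned data_len,
;		const FLAC__int32 qlp_coeff[], unsigned order,
;		int lp_quantization, FLAC__int32 data[]);
;	void FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8(
;		const FLAC__int32 residual[], unsigned data_len,
;		const FLAC__int32 qlp_coeff[], unsigned order,
;		int lp_quantization, FLAC__int32 data[]);
;
;	/* when bps <= 16 (see above), a caller might dispatch like this: */
;	if(order <= 8)
;		FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8(residual, data_len, qlp_coeff, order, lp_quantization, data);
;	else
;		FLAC__lpc_restore_signal_asm_ppc_altivec_16(residual, data_len, qlp_coeff, order, lp_quantization, data);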