;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Syntax: NASM/Intel operand order, written against the x86inc.asm macro layer
; (cglobal, DEFINE_ARGS, GET_GOT/GLOBAL/RESTORE_GOT, RET/REP_RET, mN registers
; and 3-operand forms of SSE instructions).
;
; NOTE(review): "mova" is x86inc's aligned move; every pointer accessed with it
; is presumably required to be 16-byte aligned by the callers -- confirm.

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA

; pb_1 is the per-byte constant 1 used by the rounding trick below.
; sh_b* are pshufb control masks: the name suffix spells, in hex, the source
; byte index selected for each destination byte. Masks with a short suffix are
; padded to 16 bytes with index 0.
pb_1:                   times 16 db 1
sh_b12345677:           db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b23456777:           db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b0123456777777777:   db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b1234567777777777:   db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b2345677777777777:   db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b123456789abcdeff:   db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
sh_b23456789abcdefff:   db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
sh_b32104567:           db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b8091a2b345:         db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0
sh_b76543210:           db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
sh_b65432108:           db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0
sh_b54321089:           db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0
sh_b89abcdef:           db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
sh_bfedcba9876543210:   db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

SECTION .text

; ------------------------------------------
; d45_predictor_16x16(dst, stride, above)
;
; Loads 16 bytes from above and forms one filtered row
;   f[i] = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2
; where indices past 15 reuse a[15] (via the sh_b* masks); the filter is the
; avg/xor/sub/avg sequence explained at X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 below.
; Each output row is the previous row shifted left by one byte with the last
; byte replicated. Writes 16 rows of 16 bytes, rows being stride bytes apart.
; The register that carried above is reused as stride3 after the load.
; ------------------------------------------
INIT_XMM ssse3
cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
  GET_GOT     goffsetq

  mova        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, dst8, line
  lea         stride3q, [strideq*3]
  lea         dst8q, [dstq+strideq*8]
  mova        m1, [GLOBAL(sh_b123456789abcdeff)]  ; "shift left by one byte" mask
  pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)]
  pavgb       m3, m2, m0
  pxor        m2, m0
  pshufb      m0, m1
  pand        m2, [GLOBAL(pb_1)]
  psubb       m3, m2
  pavgb       m0, m3

  ; first 4 lines and first half of 3rd 4 lines (per iteration; the high
  ; 8 bytes of row r are also the low 8 bytes of row r+8)
  mov         lined, 2
.loop:
  mova        [dstq], m0
  movhps      [dst8q], m0
  pshufb      m0, m1
  mova        [dstq+strideq], m0
  movhps      [dst8q+strideq], m0
  pshufb      m0, m1
  mova        [dstq+strideq*2], m0
  movhps      [dst8q+strideq*2], m0
  pshufb      m0, m1
  mova        [dstq+stride3q], m0
  movhps      [dst8q+stride3q], m0
  pshufb      m0, m1
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]
  dec         lined
  jnz         .loop

  ; bottom-right 8x8 block
  movhps      [dstq+8], m0
  movhps      [dstq+strideq+8], m0
  movhps      [dstq+strideq*2+8], m0
  movhps      [dstq+stride3q+8], m0
  lea         dstq, [dstq+strideq*4]
  movhps      [dstq+8], m0
  movhps      [dstq+strideq+8], m0
  movhps      [dstq+strideq*2+8], m0
  movhps      [dstq+stride3q+8], m0

  RESTORE_GOT
  RET

; ------------------------------------------
; d45_predictor_32x32(dst, stride, above)
;
; Same scheme as the 16x16 version on 32 bytes loaded from above: m5 holds the
; filtered low 16 bytes, m4 the filtered high 16 bytes (last byte replicated).
; The loop writes rows 0-15 in full plus the left half of rows 16-31 (through
; dst16); the tail writes the right half of rows 16-31.
; ------------------------------------------
INIT_XMM ssse3
cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
  GET_GOT     goffsetq

  mova        m0, [aboveq]
  mova        m4, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, dst16, line
  lea         stride3q, [strideq*3]
  lea         dst16q, [dstq+strideq*8]
  lea         dst16q, [dst16q+strideq*8]
  mova        m1, [GLOBAL(sh_b123456789abcdeff)]
  pshufb      m2, m4, [GLOBAL(sh_b23456789abcdefff)]
  pavgb       m3, m2, m4
  pxor        m2, m4
  palignr     m5, m4, m0, 1
  palignr     m6, m4, m0, 2
  pshufb      m4, m1
  pand        m2, [GLOBAL(pb_1)]
  psubb       m3, m2
  pavgb       m4, m3
  pavgb       m3, m0, m6
  pxor        m0, m6
  pand        m0, [GLOBAL(pb_1)]
  psubb       m3, m0
  pavgb       m5, m3

  ; write 4x4 lines (and the first half of the second 4x4 lines)
  mov         lined, 4
.loop:
  mova        [dstq], m5
  mova        [dstq+16], m4
  mova        [dst16q], m4
  palignr     m3, m4, m5, 1
  pshufb      m4, m1
  mova        [dstq+strideq], m3
  mova        [dstq+strideq+16], m4
  mova        [dst16q+strideq], m4
  palignr     m5, m4, m3, 1
  pshufb      m4, m1
  mova        [dstq+strideq*2], m5
  mova        [dstq+strideq*2+16], m4
  mova        [dst16q+strideq*2], m4
  palignr     m3, m4, m5, 1
  pshufb      m4, m1
  mova        [dstq+stride3q], m3
  mova        [dstq+stride3q+16], m4
  mova        [dst16q+stride3q], m4
  palignr     m5, m4, m3, 1
  pshufb      m4, m1
  lea         dstq, [dstq+strideq*4]
  lea         dst16q, [dst16q+strideq*4]
  dec         lined
  jnz         .loop

  ; write second half of second 4x4 lines
  mova        [dstq+16], m4
  mova        [dstq+strideq+16], m4
  mova        [dstq+strideq*2+16], m4
  mova        [dstq+stride3q+16], m4
  lea         dstq, [dstq+strideq*4]
  mova        [dstq+16], m4
  mova        [dstq+strideq+16], m4
  mova        [dstq+strideq*2+16], m4
  mova        [dstq+stride3q+16], m4
  lea         dstq, [dstq+strideq*4]
  mova        [dstq+16], m4
  mova        [dstq+strideq+16], m4
  mova        [dstq+strideq*2+16], m4
  mova        [dstq+stride3q+16], m4
  lea         dstq, [dstq+strideq*4]
  mova        [dstq+16], m4
  mova        [dstq+strideq+16], m4
  mova        [dstq+strideq*2+16], m4
  mova        [dstq+stride3q+16], m4

  RESTORE_GOT
  RET

; ------------------------------------------
; input: x, y, z, result
;
; trick from pascal
; (x+2y+z+2)>>2 can be calculated as:
; result = avg(x,z)
; result -= xor(x,z) & 1
; result = avg(result,y)
;
; %1 (x) and %2 (y) are only read; %3 (z) is clobbered; %4 receives the result.
; Expands GLOBAL(pb_1), so it is only usable where GLOBAL is valid (every use
; in this file follows a GET_GOT).
; ------------------------------------------
%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
  pavgb       %4, %1, %3
  pxor        %3, %1
  pand        %3, [GLOBAL(pb_1)]
  psubb       %4, %3
  pavgb       %4, %2
%endmacro

; ------------------------------------------
; d63_predictor_4x4(dst, stride, above)
;
; Loads 8 bytes from above. Rows 0 and 2 take the 2-tap average
; avg(a[i], a[i+1]); rows 1 and 3 take the 3-tap filter of a[i], a[i+1],
; a[i+2]. Rows 2/3 are rows 0/1 advanced by one byte. 4 bytes are stored
; per row.
; ------------------------------------------
INIT_XMM ssse3
cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq        m3, [aboveq]
  pshufb      m1, m3, [GLOBAL(sh_b23456777)]
  pshufb      m2, m3, [GLOBAL(sh_b12345677)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
  pavgb       m3, m2

  ; store 4 lines
  movd        [dstq], m3
  movd        [dstq+strideq], m4
  lea         dstq, [dstq+strideq*2]
  psrldq      m3, 1
  psrldq      m4, 1
  movd        [dstq], m3
  movd        [dstq+strideq], m4
  RESTORE_GOT
  RET

; ------------------------------------------
; d63_predictor_8x8(dst, stride, above)
;
; Loads 8 bytes from above and extends them to 16 by replicating byte 7.
; Even rows hold the 2-tap average (m3), odd rows the 3-tap filter (m4);
; every two rows both registers advance by one byte. 8 bytes are stored
; per row.
; ------------------------------------------
INIT_XMM ssse3
cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq        m3, [aboveq]
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  pshufb      m1, m3, [GLOBAL(sh_b2345677777777777)]
  pshufb      m0, m3, [GLOBAL(sh_b0123456777777777)]
  pshufb      m2, m3, [GLOBAL(sh_b1234567777777777)]
  pshufb      m3, [GLOBAL(sh_b0123456777777777)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4
  pavgb       m3, m2

  ; store 4 lines
  movq        [dstq], m3
  movq        [dstq+strideq], m4
  psrldq      m3, 1
  psrldq      m4, 1
  movq        [dstq+strideq*2], m3
  movq        [dstq+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  psrldq      m3, 1
  psrldq      m4, 1

  ; store 4 lines
  movq        [dstq], m3
  movq        [dstq+strideq], m4
  psrldq      m3, 1
  psrldq      m4, 1
  movq        [dstq+strideq*2], m3
  movq        [dstq+stride3q], m4
  RESTORE_GOT
  RET

; ------------------------------------------
; d63_predictor_16x16(dst, stride, above)
;
; Loads 16 bytes from above. m0 = 2-tap average (even rows), m4 = 3-tap
; filter (odd rows). After each pair of rows both are shifted left by one
; byte with the last byte replicated. 4 loop iterations x 4 rows.
; ------------------------------------------
INIT_XMM ssse3
cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
  GET_GOT     goffsetq

  mova        m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, line
  lea         stride3q, [strideq*3]
  mova        m1, [GLOBAL(sh_b123456789abcdeff)]
  pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)]
  pshufb      m3, m0, m1

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4
  pavgb       m0, m3

  mov         lined, 4
.loop:
  mova        [dstq], m0
  mova        [dstq+strideq], m4
  pshufb      m0, m1
  pshufb      m4, m1
  mova        [dstq+strideq*2], m0
  mova        [dstq+stride3q], m4
  pshufb      m0, m1
  pshufb      m4, m1
  lea         dstq, [dstq+strideq*4]
  dec         lined
  jnz         .loop
  RESTORE_GOT
  REP_RET

; ------------------------------------------
; d63_predictor_32x32(dst, stride, above)
;
; Loads 32 bytes from above. m0:m7 = 2-tap average (low:high 16 bytes, even
; rows), m2:m4 = 3-tap filter (low:high, odd rows). After each pair of rows
; the 32-byte rows advance by one byte (palignr for the low half, pshufb
; with last-byte replication for the high half). 8 loop iterations x 4 rows.
; ------------------------------------------
INIT_XMM ssse3
cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
  GET_GOT     goffsetq

  mova        m0, [aboveq]
  mova        m7, [aboveq+16]
  DEFINE_ARGS dst, stride, stride3, line
  mova        m1, [GLOBAL(sh_b123456789abcdeff)]
  lea         stride3q, [strideq*3]
  pshufb      m2, m7, [GLOBAL(sh_b23456789abcdefff)]
  pshufb      m3, m7, m1

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4
  palignr     m6, m7, m0, 1
  palignr     m5, m7, m0, 2
  pavgb       m7, m3

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2
  pavgb       m0, m6

  mov         lined, 8
.loop:
  mova        [dstq], m0
  mova        [dstq+16], m7
  mova        [dstq+strideq], m2
  mova        [dstq+strideq+16], m4
  palignr     m3, m7, m0, 1
  palignr     m5, m4, m2, 1
  pshufb      m7, m1
  pshufb      m4, m1

  mova        [dstq+strideq*2], m3
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q], m5
  mova        [dstq+stride3q+16], m4
  palignr     m0, m7, m3, 1
  palignr     m2, m4, m5, 1
  pshufb      m7, m1
  pshufb      m4, m1
  lea         dstq, [dstq+strideq*4]
  dec         lined
  jnz         .loop
  RESTORE_GOT
  REP_RET

; ------------------------------------------
; d153_predictor_4x4(dst, stride, above, left)
;
; Loads 4 bytes from left and 4 bytes starting at above-1 (the byte before
; above is read as the top-left sample "tl"). Stores 4 bytes per row, from
; the bottom row up; each row is the next one down advanced by 2 bytes.
; ------------------------------------------
INIT_XMM ssse3
cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  movd        m0, [leftq]                     ; l1, l2, l3, l4
  movd        m1, [aboveq-1]                  ; tl, t1, t2, t3
  punpckldq   m0, m1                          ; l1, l2, l3, l4, tl, t1, t2, t3
  pshufb      m0, [GLOBAL(sh_b32104567)]      ; l4, l3, l2, l1, tl, t1, t2, t3
  psrldq      m1, m0, 1                       ; l3, l2, l1, tl, t1, t2, t3
  psrldq      m2, m0, 2                       ; l2, l1, tl, t1, t2, t3
  ; comments below are for a predictor like this
  ; A1 B1 C1 D1
  ; A2 B2 A1 B1
  ; A3 B3 A2 B2
  ; A4 B4 A3 B3
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; 3-tap avg B4 B3 B2 B1 C1 D1
  pavgb       m1, m0                          ; 2-tap avg A4 A3 A2 A1

  punpcklqdq  m3, m1                          ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..

  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  pshufb      m3, [GLOBAL(sh_b8091a2b345)]    ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..
  movd        [dstq+stride3q], m3
  psrldq      m3, 2                           ; A3 B3 A2 B2 A1 B1 C1 D1 ..
  movd        [dstq+strideq*2], m3
  psrldq      m3, 2                           ; A2 B2 A1 B1 C1 D1 ..
  movd        [dstq+strideq], m3
  psrldq      m3, 2                           ; A1 B1 C1 D1 ..
  movd        [dstq], m3
  RESTORE_GOT
  RET

; ------------------------------------------
; d153_predictor_8x8(dst, stride, above, left)
;
; Loads 8 bytes from left and 8 bytes starting at above-1. Stores 8 bytes
; per row; each row is the next one down advanced by 2 bytes (see the
; layout diagram in the body).
; ------------------------------------------
INIT_XMM ssse3
cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  movq        m0, [leftq]                       ; [0- 7] l1-8 [byte]
  movhps      m0, [aboveq-1]                    ; [8-15] tl, t1-7 [byte]
  pshufb      m1, m0, [GLOBAL(sh_b76543210)]    ; l8-1 [byte]
  pshufb      m2, m0, [GLOBAL(sh_b65432108)]    ; l7-1,tl [byte]
  pshufb      m3, m0, [GLOBAL(sh_b54321089)]    ; l6-1,tl,t1 [byte]
  pshufb      m0, [GLOBAL(sh_b89abcdef)]        ; tl,t1-7 [byte]
  psrldq      m4, m0, 1                         ; t1-7 [byte]
  psrldq      m5, m0, 2                         ; t2-7 [byte]
  ; comments below are for a predictor like this
  ; A1 B1 C1 D1 E1 F1 G1 H1
  ; A2 B2 A1 B1 C1 D1 E1 F1
  ; A3 B3 A2 B2 A1 B1 C1 D1
  ; A4 B4 A3 B3 A2 B2 A1 B1
  ; A5 B5 A4 B4 A3 B3 A2 B2
  ; A6 B6 A5 B5 A4 B4 A3 B3
  ; A7 B7 A6 B6 A5 B5 A4 B4
  ; A8 B8 A7 B7 A6 B6 A5 B5
  pavgb       m6, m1, m2                        ; 2-tap avg A8-A1

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7  ; 3-tap avg C-H1

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0  ; 3-tap avg B8-1

  punpcklbw   m6, m0                            ; A-B8, A-B7 ... A-B2, A-B1

  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]

  movhps      [dstq+stride3q], m6               ; A-B4, A-B3, A-B2, A-B1
  palignr     m0, m7, m6, 10                    ; A-B3, A-B2, A-B1, C-H1
  movq        [dstq+strideq*2], m0
  psrldq      m0, 2                             ; A-B2, A-B1, C-H1
  movq        [dstq+strideq], m0
  psrldq      m0, 2                             ; A-H1
  movq        [dstq], m0
  lea         dstq, [dstq+strideq*4]
  movq        [dstq+stride3q], m6               ; A-B8, A-B7, A-B6, A-B5
  psrldq      m6, 2                             ; A-B7, A-B6, A-B5, A-B4
  movq        [dstq+strideq*2], m6
  psrldq      m6, 2                             ; A-B6, A-B5, A-B4, A-B3
  movq        [dstq+strideq], m6
  psrldq      m6, 2                             ; A-B5, A-B4, A-B3, A-B2
  movq        [dstq], m6
  RESTORE_GOT
  RET

; ------------------------------------------
; d153_predictor_16x16(dst, stride, above, left)
;
; Loads 16 bytes from left (aligned) and 16 bytes starting at above-1
; (unaligned). Stores 16 rows of 16 bytes top to bottom; each row is the
; row above it shifted right by 2 bytes with the next interleaved A/B pair
; entering on the left (see the layout diagram in the body).
; ------------------------------------------
INIT_XMM ssse3
cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  mova        m0, [leftq]
  movu        m7, [aboveq-1]
  ; comments below are for a predictor like this
  ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
  ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
  ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
  ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1
  ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1
  ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1
  ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1
  ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1
  ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2
  ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3
  ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4
  ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5
  ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6
  ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7
  ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8
  ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
  pshufb      m6, m7, [GLOBAL(sh_bfedcba9876543210)]
  palignr     m5, m0, m6, 15
  palignr     m3, m0, m6, 14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4  ; 3-tap avg B3-Bg
  pavgb       m5, m0                            ; A1 - Ag

  punpcklbw   m0, m4, m5                        ; A-B8 ... A-B1
  punpckhbw   m4, m5                            ; A-B9 ... A-Bg

  pshufb      m3, m7, [GLOBAL(sh_b123456789abcdeff)]
  pshufb      m5, m7, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1  ; 3-tap avg C1-P1

  pshufb      m6, m0, [GLOBAL(sh_bfedcba9876543210)]
  DEFINE_ARGS dst, stride, stride3
  lea         stride3q, [strideq*3]
  palignr     m2, m1, m6, 14
  mova        [dstq], m2
  palignr     m2, m1, m6, 12
  mova        [dstq+strideq], m2
  palignr     m2, m1, m6, 10
  mova        [dstq+strideq*2], m2
  palignr     m2, m1, m6, 8
  mova        [dstq+stride3q], m2
  lea         dstq, [dstq+strideq*4]
  palignr     m2, m1, m6, 6
  mova        [dstq], m2
  palignr     m2, m1, m6, 4
  mova        [dstq+strideq], m2
  palignr     m2, m1, m6, 2
  mova        [dstq+strideq*2], m2
  pshufb      m4, [GLOBAL(sh_bfedcba9876543210)]
  mova        [dstq+stride3q], m6
  lea         dstq, [dstq+strideq*4]

  palignr     m2, m6, m4, 14
  mova        [dstq], m2
  palignr     m2, m6, m4, 12
  mova        [dstq+strideq], m2
  palignr     m2, m6, m4, 10
  mova        [dstq+strideq*2], m2
  palignr     m2, m6, m4, 8
  mova        [dstq+stride3q], m2
  lea         dstq, [dstq+strideq*4]
  palignr     m2, m6, m4, 6
  mova        [dstq], m2
  palignr     m2, m6, m4, 4
  mova        [dstq+strideq], m2
  palignr     m2, m6, m4, 2
  mova        [dstq+strideq*2], m2
  mova        [dstq+stride3q], m4
  RESTORE_GOT
  RET

; ------------------------------------------
; d153_predictor_32x32(dst, stride, above, left)
;
; Loads 32 bytes from left (aligned) and 32 bytes starting at above-1
; (unaligned). Stores 32 rows of 32 bytes top to bottom in four groups of
; 8 rows; within a group each row is a 2-byte-step palignr window over a
; chain of 16-byte vectors, and the last row of a group is the unshifted
; pair. left is re-read before the lower 16 rows because m0 is reused.
; ------------------------------------------
INIT_XMM ssse3
cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  mova        m0, [leftq]
  movu        m7, [aboveq-1]
  movu        m1, [aboveq+15]

  pshufb      m4, m1, [GLOBAL(sh_b123456789abcdeff)]
  pshufb      m6, m1, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2  ; 3-tap avg above [high]

  palignr     m3, m1, m7, 1
  palignr     m5, m1, m7, 2

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1  ; 3-tap avg above [low]

  pshufb      m7, [GLOBAL(sh_bfedcba9876543210)]
  palignr     m5, m0, m7, 15
  palignr     m3, m0, m7, 14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4  ; 3-tap avg B3-Bg
  pavgb       m5, m0                            ; A1 - Ag
  punpcklbw   m6, m4, m5                        ; A-B8 ... A-B1
  punpckhbw   m4, m5                            ; A-B9 ... A-Bg
  pshufb      m6, [GLOBAL(sh_bfedcba9876543210)]
  pshufb      m4, [GLOBAL(sh_bfedcba9876543210)]

  DEFINE_ARGS dst, stride, stride3, left, line
  lea         stride3q, [strideq*3]

  ; rows 0-7: windows over m6 : m1 : m2
  palignr     m5, m2, m1, 14
  palignr     m7, m1, m6, 14
  mova        [dstq], m7
  mova        [dstq+16], m5
  palignr     m5, m2, m1, 12
  palignr     m7, m1, m6, 12
  mova        [dstq+strideq], m7
  mova        [dstq+strideq+16], m5
  palignr     m5, m2, m1, 10
  palignr     m7, m1, m6, 10
  mova        [dstq+strideq*2], m7
  mova        [dstq+strideq*2+16], m5
  palignr     m5, m2, m1, 8
  palignr     m7, m1, m6, 8
  mova        [dstq+stride3q], m7
  mova        [dstq+stride3q+16], m5
  lea         dstq, [dstq+strideq*4]
  palignr     m5, m2, m1, 6
  palignr     m7, m1, m6, 6
  mova        [dstq], m7
  mova        [dstq+16], m5
  palignr     m5, m2, m1, 4
  palignr     m7, m1, m6, 4
  mova        [dstq+strideq], m7
  mova        [dstq+strideq+16], m5
  palignr     m5, m2, m1, 2
  palignr     m7, m1, m6, 2
  mova        [dstq+strideq*2], m7
  mova        [dstq+strideq*2+16], m5
  mova        [dstq+stride3q], m6
  mova        [dstq+stride3q+16], m1
  lea         dstq, [dstq+strideq*4]

  ; rows 8-15: windows over m4 : m6 : m1
  palignr     m5, m1, m6, 14
  palignr     m3, m6, m4, 14
  mova        [dstq], m3
  mova        [dstq+16], m5
  palignr     m5, m1, m6, 12
  palignr     m3, m6, m4, 12
  mova        [dstq+strideq], m3
  mova        [dstq+strideq+16], m5
  palignr     m5, m1, m6, 10
  palignr     m3, m6, m4, 10
  mova        [dstq+strideq*2], m3
  mova        [dstq+strideq*2+16], m5
  palignr     m5, m1, m6, 8
  palignr     m3, m6, m4, 8
  mova        [dstq+stride3q], m3
  mova        [dstq+stride3q+16], m5
  lea         dstq, [dstq+strideq*4]
  palignr     m5, m1, m6, 6
  palignr     m3, m6, m4, 6
  mova        [dstq], m3
  mova        [dstq+16], m5
  palignr     m5, m1, m6, 4
  palignr     m3, m6, m4, 4
  mova        [dstq+strideq], m3
  mova        [dstq+strideq+16], m5
  palignr     m5, m1, m6, 2
  palignr     m3, m6, m4, 2
  mova        [dstq+strideq*2], m3
  mova        [dstq+strideq*2+16], m5
  mova        [dstq+stride3q], m4
  mova        [dstq+stride3q+16], m6
  lea         dstq, [dstq+strideq*4]

  mova        m7, [leftq]
  mova        m3, [leftq+16]
  palignr     m5, m3, m7, 15
  palignr     m0, m3, m7, 14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2  ; 3-tap avg Bh -
  pavgb       m5, m3                            ; Ah -
  punpcklbw   m3, m2, m5                        ; A-B8 ... A-B1
  punpckhbw   m2, m5                            ; A-B9 ... A-Bg
  pshufb      m3, [GLOBAL(sh_bfedcba9876543210)]
  pshufb      m2, [GLOBAL(sh_bfedcba9876543210)]

  ; rows 16-23: windows over m3 : m4 : m6
  palignr     m7, m6, m4, 14
  palignr     m0, m4, m3, 14
  mova        [dstq], m0
  mova        [dstq+16], m7
  palignr     m7, m6, m4, 12
  palignr     m0, m4, m3, 12
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m7
  palignr     m7, m6, m4, 10
  palignr     m0, m4, m3, 10
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m7
  palignr     m7, m6, m4, 8
  palignr     m0, m4, m3, 8
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m7
  lea         dstq, [dstq+strideq*4]
  palignr     m7, m6, m4, 6
  palignr     m0, m4, m3, 6
  mova        [dstq], m0
  mova        [dstq+16], m7
  palignr     m7, m6, m4, 4
  palignr     m0, m4, m3, 4
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m7
  palignr     m7, m6, m4, 2
  palignr     m0, m4, m3, 2
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q], m3
  mova        [dstq+stride3q+16], m4
  lea         dstq, [dstq+strideq*4]

  ; rows 24-31: windows over m2 : m3 : m4
  palignr     m7, m4, m3, 14
  palignr     m0, m3, m2, 14
  mova        [dstq], m0
  mova        [dstq+16], m7
  palignr     m7, m4, m3, 12
  palignr     m0, m3, m2, 12
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m7
  palignr     m7, m4, m3, 10
  palignr     m0, m3, m2, 10
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m7
  palignr     m7, m4, m3, 8
  palignr     m0, m3, m2, 8
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m7
  lea         dstq, [dstq+strideq*4]
  palignr     m7, m4, m3, 6
  palignr     m0, m3, m2, 6
  mova        [dstq], m0
  mova        [dstq+16], m7
  palignr     m7, m4, m3, 4
  palignr     m0, m3, m2, 4
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m7
  palignr     m7, m4, m3, 2
  palignr     m0, m3, m2, 2
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q], m2
  mova        [dstq+stride3q+16], m3

  RESTORE_GOT
  RET

; ------------------------------------------
; d207_predictor_8x8(dst, stride, <unused>, left)
;
; Loads 8 bytes from left. The third argument's incoming value is never read;
; its register is named stride3 and overwritten with stride*3. Builds the
; byte-interleaved sequence of 2-tap averages and 3-tap filtered values of
; left (last sample replicated); each row is the previous row advanced by
; 2 bytes. 8 bytes are stored per row.
; ------------------------------------------
INIT_XMM ssse3
cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
  GET_GOT     goffsetq
  movq        m3, [leftq]                     ; abcdefgh [byte]
  lea         stride3q, [strideq*3]

  pshufb      m1, m3, [GLOBAL(sh_b2345677777777777)]
  pshufb      m0, m3, [GLOBAL(sh_b0123456777777777)]
  pshufb      m2, m3, [GLOBAL(sh_b1234567777777777)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3
  pavgb       m0, m2
  punpcklbw   m0, m3                          ; interleaved output

  movq        [dstq], m0
  psrldq      m0, 2
  movq        [dstq+strideq], m0
  psrldq      m0, 2
  movq        [dstq+strideq*2], m0
  psrldq      m0, 2
  movq        [dstq+stride3q], m0
  lea         dstq, [dstq+strideq*4]
  pshufhw     m0, m0, q0000                   ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh
  psrldq      m0, 2
  movq        [dstq], m0
  psrldq      m0, 2
  movq        [dstq+strideq], m0
  psrldq      m0, 2
  movq        [dstq+strideq*2], m0
  psrldq      m0, 2
  movq        [dstq+stride3q], m0
  RESTORE_GOT
  RET

; ------------------------------------------
; d207_predictor_16x16(dst, stride, <unused>, left)
;
; Loads 16 bytes from left; the third argument's register is reused as
; stride3. m1:m4 hold the 32-byte interleave of 2-tap averages and 3-tap
; filtered values. Rows 0-7 are 2-byte-step windows over m1:m4; rows 8-15
; (the loop) start from m4 and shift left by 2 bytes per row, replicating
; the last byte.
; ------------------------------------------
INIT_XMM ssse3
cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
  GET_GOT     goffsetq
  lea         stride3q, [strideq*3]
  mova        m0, [leftq]                     ; abcdefghijklmnop [byte]
  pshufb      m1, m0, [GLOBAL(sh_b123456789abcdeff)]  ; bcdefghijklmnopp
  pshufb      m2, m0, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3
  pavgb       m1, m0                          ; ab, bc, cd .. no, op, pp [byte]

  punpckhbw   m4, m1, m3                      ; interleaved output (high half)
  punpcklbw   m1, m3                          ; interleaved output (low half)
  mova        [dstq], m1
  palignr     m3, m4, m1, 2
  mova        [dstq+strideq], m3
  palignr     m3, m4, m1, 4
  mova        [dstq+strideq*2], m3
  palignr     m3, m4, m1, 6
  mova        [dstq+stride3q], m3
  lea         dstq, [dstq+strideq*4]
  palignr     m3, m4, m1, 8
  mova        [dstq], m3
  palignr     m3, m4, m1, 10
  mova        [dstq+strideq], m3
  palignr     m3, m4, m1, 12
  mova        [dstq+strideq*2], m3
  palignr     m3, m4, m1, 14
  mova        [dstq+stride3q], m3
  DEFINE_ARGS dst, stride, stride3, line
  mov         lined, 2
  mova        m0, [GLOBAL(sh_b23456789abcdefff)]
.loop:
  lea         dstq, [dstq+strideq*4]
  mova        [dstq], m4
  pshufb      m4, m0
  mova        [dstq+strideq], m4
  pshufb      m4, m0
  mova        [dstq+strideq*2], m4
  pshufb      m4, m0
  mova        [dstq+stride3q], m4
  pshufb      m4, m0
  dec         lined
  jnz         .loop
  RESTORE_GOT
  REP_RET

; ------------------------------------------
; d207_predictor_32x32(dst, stride, <unused>, left)
;
; Loads 32 bytes from left; the third argument's register is reused as
; stride3. m1, m6, m2, m7 hold the four 16-byte pieces of the interleaved
; 2-tap/3-tap sequence, in that order. Each row is the previous row advanced
; by 2 bytes; because the right half of row r equals the left half of row
; r+8, most vectors are stored twice (through dst and dst8).
; ------------------------------------------
INIT_XMM ssse3
cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
  GET_GOT     goffsetq
  lea         stride3q, [strideq*3]
  mova        m1, [leftq]                     ;  0-15 [byte]
  mova        m2, [leftq+16]                  ; 16-31 [byte]
  pshufb      m0, m2, [GLOBAL(sh_b23456789abcdefff)]
  pshufb      m4, m2, [GLOBAL(sh_b123456789abcdeff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3
  palignr     m6, m2, m1, 1
  palignr     m5, m2, m1, 2
  pavgb       m2, m4                          ; high 16px even lines

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0
  pavgb       m1, m6                          ; low 16px even lines

  punpckhbw   m6, m1, m0                      ; interleaved output 2
  punpcklbw   m1, m0                          ; interleaved output 1

  punpckhbw   m7, m2, m3                      ; interleaved output 4
  punpcklbw   m2, m3                          ; interleaved output 3

  ; output 1st 8 lines (and half of 2nd 8 lines)
  DEFINE_ARGS dst, stride, stride3, dst8
  lea         dst8q, [dstq+strideq*8]
  mova        [dstq], m1
  mova        [dstq+16], m6
  mova        [dst8q], m6
  palignr     m0, m6, m1, 2
  palignr     m4, m2, m6, 2
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m4
  mova        [dst8q+strideq], m4
  palignr     m0, m6, m1, 4
  palignr     m4, m2, m6, 4
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m4
  mova        [dst8q+strideq*2], m4
  palignr     m0, m6, m1, 6
  palignr     m4, m2, m6, 6
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m4
  mova        [dst8q+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]
  palignr     m0, m6, m1, 8
  palignr     m4, m2, m6, 8
  mova        [dstq], m0
  mova        [dstq+16], m4
  mova        [dst8q], m4
  palignr     m0, m6, m1, 10
  palignr     m4, m2, m6, 10
  mova        [dstq+strideq], m0
  mova        [dstq+strideq+16], m4
  mova        [dst8q+strideq], m4
  palignr     m0, m6, m1, 12
  palignr     m4, m2, m6, 12
  mova        [dstq+strideq*2], m0
  mova        [dstq+strideq*2+16], m4
  mova        [dst8q+strideq*2], m4
  palignr     m0, m6, m1, 14
  palignr     m4, m2, m6, 14
  mova        [dstq+stride3q], m0
  mova        [dstq+stride3q+16], m4
  mova        [dst8q+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]

  ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines
  mova        [dstq+16], m2
  mova        [dst8q], m2
  palignr     m4, m7, m2, 2
  mova        [dstq+strideq+16], m4
  mova        [dst8q+strideq], m4
  palignr     m4, m7, m2, 4
  mova        [dstq+strideq*2+16], m4
  mova        [dst8q+strideq*2], m4
  palignr     m4, m7, m2, 6
  mova        [dstq+stride3q+16], m4
  mova        [dst8q+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]
  palignr     m4, m7, m2, 8
  mova        [dstq+16], m4
  mova        [dst8q], m4
  palignr     m4, m7, m2, 10
  mova        [dstq+strideq+16], m4
  mova        [dst8q+strideq], m4
  palignr     m4, m7, m2, 12
  mova        [dstq+strideq*2+16], m4
  mova        [dst8q+strideq*2], m4
  palignr     m4, m7, m2, 14
  mova        [dstq+stride3q+16], m4
  mova        [dst8q+stride3q], m4
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]

  ; output 2nd half of 3rd 8 lines and half of 4th 8 lines
  mova        m0, [GLOBAL(sh_b23456789abcdefff)]
  mova        [dstq+16], m7
  mova        [dst8q], m7
  pshufb      m7, m0
  mova        [dstq+strideq+16], m7
  mova        [dst8q+strideq], m7
  pshufb      m7, m0
  mova        [dstq+strideq*2+16], m7
  mova        [dst8q+strideq*2], m7
  pshufb      m7, m0
  mova        [dstq+stride3q+16], m7
  mova        [dst8q+stride3q], m7
  pshufb      m7, m0
  lea         dstq, [dstq+strideq*4]
  lea         dst8q, [dst8q+strideq*4]
  mova        [dstq+16], m7
  mova        [dst8q], m7
  pshufb      m7, m0
  mova        [dstq+strideq+16], m7
  mova        [dst8q+strideq], m7
  pshufb      m7, m0
  mova        [dstq+strideq*2+16], m7
  mova        [dst8q+strideq*2], m7
  pshufb      m7, m0
  mova        [dstq+stride3q+16], m7
  mova        [dst8q+stride3q], m7
  pshufb      m7, m0
  lea         dstq, [dstq+strideq*4]

  ; output last half of 4th 8 lines
  mova        [dstq+16], m7
  mova        [dstq+strideq+16], m7
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q+16], m7
  lea         dstq, [dstq+strideq*4]
  mova        [dstq+16], m7
  mova        [dstq+strideq+16], m7
  mova        [dstq+strideq*2+16], m7
  mova        [dstq+stride3q+16], m7

  ; done!
  RESTORE_GOT
  RET