;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA

; Byte value 1 in all 16 lanes.  ANDed with (x ^ z) to isolate the low bit in
; the rounding correction of the 3-tap filter.
pb_1: times 16 db 1

; pshufb control vectors.  The hex digits after "sh_b" list, lowest
; destination byte first, the source byte index copied into that position.
; Labels with fewer than 16 digits are padded with index 0 up to 16 bytes, so
; every entry below is exactly 16 bytes long.
sh_b12345677:         db 1, 2, 3, 4, 5, 6, 7, 7
                      times 8 db 0
sh_b23456777:         db 2, 3, 4, 5, 6, 7, 7, 7
                      times 8 db 0
sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6
                      times 9 db 7
sh_b1234567777777777: db 1, 2, 3, 4, 5, 6
                      times 10 db 7
sh_b2345677777777777: db 2, 3, 4, 5, 6
                      times 11 db 7
sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
sh_b32104567:         db 3, 2, 1, 0, 4, 5, 6, 7
                      times 8 db 0
sh_b8091a2b345:       db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5
                      times 6 db 0
sh_b76543210:         db 7, 6, 5, 4, 3, 2, 1, 0
                      times 8 db 0
sh_b65432108:         db 6, 5, 4, 3, 2, 1, 0, 8
                      times 8 db 0
sh_b54321089:         db 5, 4, 3, 2, 1, 0, 8, 9
                      times 8 db 0
sh_b89abcdef:         db 8, 9, 10, 11, 12, 13, 14, 15
                      times 8 db 0
; Full 16-byte reversal.
sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

SECTION .text

; d45_predictor_16x16 -- register args: dst, stride, above
;
; Filters the 16 bytes a[0..15] at above into
;   P[i] = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2
; with a[16] and a[17] taken as a[15] (the shuffle masks repeat byte 15).
; Byte c of row r in the 16x16 block at dst is P[min(r + c, 15)].
; above is loaded with mova and full rows are stored with mova.
INIT_XMM ssse3
cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset
  GET_GOT     goffsetq

  mova                   m0, [aboveq]                            ; x = a[i]
  ; stride3 is declared in the slot that held above, so above has to be
  ; consumed before stride3q is written.
  DEFINE_ARGS dst, stride, stride3, dst8, line
  mova                   m1, [GLOBAL(sh_b123456789abcdeff)]      ; shift-by-one mask
  lea                 dst8q, [dstq+strideq*8]                    ; dst8 = row 8
  lea              stride3q, [strideq*3]

  ; (x + 2y + z + 2) >> 2  ==  avg(avg(x, z) - ((x ^ z) & 1), y)
  pshufb                 m2, m0, [GLOBAL(sh_b23456789abcdefff)]  ; z = a[i+2]
  pavgb                  m3, m2, m0                              ; avg(z, x)
  pxor                   m2, m0                                  ; z ^ x
  pand                   m2, [GLOBAL(pb_1)]
  psubb                  m3, m2
  pshufb                 m0, m1                                  ; y = a[i+1]
  pavgb                  m0, m3                                  ; m0 = P

  ; Two passes of four rows: rows 0-7 in full, plus the left 8 bytes of rows
  ; 8-15 (the left half of row r+8 is the right half of row r).
  mov                 lined, 2
.row_loop:
  mova   [dstq            ], m0
  movhps [dst8q           ], m0
  pshufb                 m0, m1
  mova   [dstq +strideq   ], m0
  movhps [dst8q+strideq   ], m0
  pshufb                 m0, m1
  mova   [dstq +strideq*2 ], m0
  movhps [dst8q+strideq*2 ], m0
  pshufb                 m0, m1
  mova   [dstq +stride3q  ], m0
  movhps [dst8q+stride3q  ], m0
  pshufb                 m0, m1
  lea                  dstq, [dstq +strideq*4]
  lea                 dst8q, [dst8q+strideq*4]
  dec                 lined
  jnz .row_loop

  ; Right 8 bytes of rows 8-15.  dstq now addresses row 8 and, after eight
  ; shifts, the upper half of m0 is eight copies of P[15].
  movhps [dstq          +8], m0
  movhps [dstq+strideq  +8], m0
  movhps [dstq+strideq*2+8], m0
  movhps [dstq+stride3q +8], m0
  lea                  dstq, [dstq+strideq*4]
  movhps [dstq          +8], m0
  movhps [dstq+strideq  +8], m0
  movhps [dstq+strideq*2+8], m0
  movhps [dstq+stride3q +8], m0

  RESTORE_GOT
  RET

; d45_predictor_32x32 -- register args: dst, stride, above
;
; Same filter as the 16x16 variant over the 32 bytes a[0..31] at above:
;   P[i] = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2, a[32] = a[33] = a[31].
; Byte c of row r in the 32x32 block at dst is P[min(r + c, 31)].
; m5 carries the low 16 bytes of the current row, m4 the high 16 bytes.
INIT_XMM ssse3
cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
  GET_GOT     goffsetq

  mova                   m0, [aboveq]                            ; a[0..15]
  mova                   m4, [aboveq+16]                         ; a[16..31]
  ; stride3 is declared in the slot that held above; both loads are done.
  DEFINE_ARGS dst, stride, stride3, dst16, line
  mova                   m1, [GLOBAL(sh_b123456789abcdeff)]      ; shift-by-one mask
  lea              stride3q, [strideq*3]
  lea                dst16q, [dstq  +strideq*8]
  lea                dst16q, [dst16q+strideq*8]                  ; dst16 = row 16

  ; Neighbours of the low half; must be taken before m4 is shuffled.
  palignr                m5, m4, m0, 1                           ; y_lo = a[1..16]
  palignr                m6, m4, m0, 2                           ; z_lo = a[2..17]

  ; High half: m4 = avg(avg(x, z) - ((x ^ z) & 1), y)
  pshufb                 m2, m4, [GLOBAL(sh_b23456789abcdefff)]  ; z_hi
  pavgb                  m3, m2, m4
  pxor                   m2, m4
  pand                   m2, [GLOBAL(pb_1)]
  psubb                  m3, m2
  pshufb                 m4, m1                                  ; y_hi
  pavgb                  m4, m3                                  ; m4 = P[16..31]

  ; Low half: m5 = avg(avg(x, z) - ((x ^ z) & 1), y)
  pavgb                  m3, m0, m6
  pxor                   m0, m6
  pand                   m0, [GLOBAL(pb_1)]
  psubb                  m3, m0
  pavgb                  m5, m3                                  ; m5 = P[0..15]

  ; Four passes of four rows: rows 0-15 in full, plus the left 16 bytes of
  ; rows 16-31 (identical to the right 16 bytes of rows 0-15).  The low-half
  ; register alternates between m5 and m3.
  mov                  lined, 4
.row_loop:
  mova [dstq               ], m5
  mova [dstq            +16], m4
  mova [dst16q             ], m4
  palignr                 m3, m4, m5, 1
  pshufb                  m4, m1
  mova [dstq  +strideq     ], m3
  mova [dstq  +strideq  +16], m4
  mova [dst16q+strideq     ], m4
  palignr                 m5, m4, m3, 1
  pshufb                  m4, m1
  mova [dstq  +strideq*2   ], m5
  mova [dstq  +strideq*2+16], m4
  mova [dst16q+strideq*2   ], m4
  palignr                 m3, m4, m5, 1
  pshufb                  m4, m1
  mova [dstq  +stride3q    ], m3
  mova [dstq  +stride3q +16], m4
  mova [dst16q+stride3q    ], m4
  palignr                 m5, m4, m3, 1
  pshufb                  m4, m1
  lea                  dstq, [dstq  +strideq*4]
  lea                dst16q, [dst16q+strideq*4]
  dec                 lined
  jnz .row_loop

  ; Right 16 bytes of rows 16-31.  dstq addresses row 16 and m4 has been
  ; shifted 16 times, so every byte of it is P[31].
%rep 3
  mova [dstq            +16], m4
  mova [dstq  +strideq  +16], m4
  mova [dstq  +strideq*2+16], m4
  mova [dstq  +stride3q +16], m4
  lea                  dstq, [dstq  +strideq*4]
%endrep
  mova [dstq            +16], m4
  mova [dstq  +strideq  +16], m4
  mova [dstq  +strideq*2+16], m4
  mova [dstq  +stride3q +16], m4

  RESTORE_GOT
  RET

; ------------------------------------------
; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 x, y, z, result
;
; Per byte, result = (x + 2*y + z + 2) >> 2, evaluated without widening
; ("trick from pascal"):
;   result  = avg(x, z)          ; pavgb
;   result -= xor(x, z) & 1
;   result  = avg(result, y)
;
; Operands:
;   %1 = x       read only
;   %2 = y       read only
;   %3 = z       CLOBBERED: left holding (x ^ z) & 1
;   %4 = result  written first, so it must not name the same register as
;                x, y or z
; pb_1 is fetched through GLOBAL(), so GLOBAL() must be usable wherever the
; macro is expanded.
; ------------------------------------------
%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
  pavgb               %4, %1, %3
  pxor                %3, %1
  pand                %3, [GLOBAL(pb_1)]
  psubb               %4, %3
  pavgb               %4, %2
%endmacro

; d63_predictor_4x4 -- register args: dst, stride, above
;
; Loads the 8 bytes a[0..7] at above and forms
;   E[i] = avg(a[i], a[i+1])                        (2-tap, pavgb)
;   O[i] = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2      (3-tap)
; Rows 0 and 2 are E shifted by 0 and 1 bytes; rows 1 and 3 are O shifted by
; 0 and 1 bytes.  Four bytes are stored per row.
INIT_XMM ssse3
cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq                m3, [aboveq]                      ; a[i]
  pshufb              m2, m3, [GLOBAL(sh_b12345677)]    ; a[i+1]
  pshufb              m1, m3, [GLOBAL(sh_b23456777)]    ; a[i+2]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4          ; m4 = O (m1 clobbered)
  pavgb               m3, m2                            ; m3 = E

  ; rows 0 and 1
  movd    [dstq        ], m3
  movd    [dstq+strideq], m4
  ; rows 2 and 3: same vectors, one byte further along
  psrldq              m3, 1
  psrldq              m4, 1
  lea               dstq, [dstq+strideq*2]
  movd    [dstq        ], m3
  movd    [dstq+strideq], m4
  RESTORE_GOT
  RET

; d63_predictor_8x8 -- register args: dst, stride, above
;
; Loads the 8 bytes a[0..7] at above, extends them with copies of a[7], and
; forms
;   E[i] = avg(a[i], a[i+1])                        (2-tap, pavgb)
;   O[i] = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2      (3-tap)
; Row 2k is E shifted by k bytes, row 2k+1 is O shifted by k bytes
; (k = 0..3).  Eight bytes are stored per row.
INIT_XMM ssse3
cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq                m3, [aboveq]
  ; stride3 is declared in the slot that held above; above is consumed.
  DEFINE_ARGS dst, stride, stride3
  lea           stride3q, [strideq*3]

  ; Replicate a[7] into bytes 8-15 once and derive both neighbour vectors
  ; from that single copy.  The neighbour masks only select indices 1..7,
  ; so they see the same bytes as they would in the raw load.  The filter
  ; macro leaves its first operand untouched, which lets m3 serve both as
  ; the 3-tap x input and as the 2-tap accumulator.
  pshufb              m3, [GLOBAL(sh_b0123456777777777)]      ; a[i]
  pshufb              m2, m3, [GLOBAL(sh_b1234567777777777)]  ; a[i+1]
  pshufb              m1, m3, [GLOBAL(sh_b2345677777777777)]  ; a[i+2]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4                ; m4 = O (m1 clobbered)
  pavgb               m3, m2                                  ; m3 = E

  ; rows 0-3
  movq    [dstq        ], m3
  movq    [dstq+strideq], m4
  psrldq              m3, 1
  psrldq              m4, 1
  movq  [dstq+strideq*2], m3
  movq  [dstq+stride3q ], m4
  lea               dstq, [dstq+strideq*4]
  psrldq              m3, 1
  psrldq              m4, 1

  ; rows 4-7
  movq    [dstq        ], m3
  movq    [dstq+strideq], m4
  psrldq              m3, 1
  psrldq              m4, 1
  movq  [dstq+strideq*2], m3
  movq  [dstq+stride3q ], m4
  RESTORE_GOT
  RET

; d63_predictor_16x16 -- register args: dst, stride, above
;
; From the 16 bytes a[0..15] at above (indices past 15 reuse a[15]):
;   E[i] = avg(a[i], a[i+1])                        (2-tap, pavgb)
;   O[i] = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2      (3-tap)
; Row 2k is E advanced by k bytes, row 2k+1 is O advanced by k bytes, the
; last byte being repeated as the vectors advance.
INIT_XMM ssse3
cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
  GET_GOT     goffsetq

  mova                m0, [aboveq]                            ; a[i]
  ; stride3 is declared in the slot that held above; above is consumed.
  DEFINE_ARGS dst, stride, stride3, line
  mova                m1, [GLOBAL(sh_b123456789abcdeff)]      ; shift-by-one mask
  lea           stride3q, [strideq*3]
  pshufb              m3, m0, m1                              ; a[i+1]
  pshufb              m2, m0, [GLOBAL(sh_b23456789abcdefff)]  ; a[i+2]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4                ; m4 = O (m2 clobbered)
  pavgb               m0, m3                                  ; m0 = E

  ; Four rows per pass: E, O, then both advanced by one byte.
  mov              lined, 4
.row_loop:
  mova  [dstq          ], m0
  pshufb              m0, m1
  mova  [dstq+strideq  ], m4
  pshufb              m4, m1
  mova  [dstq+strideq*2], m0
  pshufb              m0, m1
  mova  [dstq+stride3q ], m4
  pshufb              m4, m1
  lea               dstq, [dstq+strideq*4]
  dec              lined
  jnz .row_loop
  RESTORE_GOT
  REP_RET

; d63_predictor_32x32 -- register args: dst, stride, above
;
; From the 32 bytes a[0..31] at above (indices past 31 reuse a[31]):
;   E[i] = avg(a[i], a[i+1])                        (2-tap, pavgb)
;   O[i] = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2      (3-tap)
; Row 2k is E advanced by k bytes, row 2k+1 is O advanced by k bytes.
; Register roles on loop entry: m0/m7 = E low/high, m2/m4 = O low/high,
; m1 = shift-by-one shuffle mask.
INIT_XMM ssse3
cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
  GET_GOT     goffsetq

  mova                   m0, [aboveq]                            ; a[0..15]
  mova                   m7, [aboveq+16]                         ; a[16..31]
  ; stride3 is declared in the slot that held above; both loads are done.
  DEFINE_ARGS dst, stride, stride3, line
  lea              stride3q, [strideq*3]
  mova                   m1, [GLOBAL(sh_b123456789abcdeff)]
  palignr                m6, m7, m0, 1                           ; a[1..16]
  palignr                m5, m7, m0, 2                           ; a[2..17]
  pshufb                 m3, m7, m1                              ; a[17..31], a[31]
  pshufb                 m2, m7, [GLOBAL(sh_b23456789abcdefff)]  ; a[18..31], a[31] x2

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4                   ; m4 = O high (m2 clobbered)
  pavgb                  m7, m3                                  ; m7 = E high

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2                   ; m2 = O low (m5 clobbered)
  pavgb                  m0, m6                                  ; m0 = E low

  ; Eight passes of four rows.  After each row is stored, its low half is
  ; advanced one byte by pulling in the first byte of the high half
  ; (palignr), then the high half is advanced with the shuffle mask.  The
  ; low halves alternate m0 <-> m3 (E) and m2 <-> m5 (O).
  mov                 lined, 8
.row_loop:
  mova  [dstq             ], m0
  mova  [dstq          +16], m7
  palignr                m3, m7, m0, 1
  pshufb                 m7, m1
  mova  [dstq+strideq     ], m2
  mova  [dstq+strideq  +16], m4
  palignr                m5, m4, m2, 1
  pshufb                 m4, m1

  mova  [dstq+strideq*2   ], m3
  mova  [dstq+strideq*2+16], m7
  palignr                m0, m7, m3, 1
  pshufb                 m7, m1
  mova  [dstq+stride3q    ], m5
  mova  [dstq+stride3q +16], m4
  palignr                m2, m4, m5, 1
  pshufb                 m4, m1

  lea                  dstq, [dstq+strideq*4]
  dec                 lined
  jnz .row_loop
  RESTORE_GOT
  REP_RET

; d153_predictor_4x4 -- register args: dst, stride, above, left
;
; Reads 4 bytes at left (l1..l4) and 4 bytes starting one byte before above
; (tl, t1, t2, t3).  With the edge laid out as l4 l3 l2 l1 tl t1 t2 t3, A is
; the 2-tap average and B..D the 3-tap average along that edge.  Output:
;   row 0: A1 B1 C1 D1
;   row 1: A2 B2 A1 B1
;   row 2: A3 B3 A2 B2
;   row 3: A4 B4 A3 B3
INIT_XMM ssse3
cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  movd                m0, [leftq]               ; l1, l2, l3, l4
  movd                m1, [aboveq-1]            ; tl, t1, t2, t3
  ; Both pointers are consumed; stride3 takes the slot that held above.
  DEFINE_ARGS dst, stride, stride3
  lea           stride3q, [strideq*3]

  punpckldq           m0, m1                    ; l1, l2, l3, l4, tl, t1, t2, t3
  pshufb              m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3
  psrldq              m1, m0, 1                 ; l3, l2, l1, tl, t1, t2, t3
  psrldq              m2, m0, 2                 ; l2, l1, tl, t1, t2, t3

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3  ; 3-tap: B4 B3 B2 B1 C1 D1
  pavgb               m1, m0                    ; 2-tap: A4 A3 A2 A1

  punpcklqdq          m3, m1                    ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 ..
  pshufb              m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 ..

  ; Bottom row first; each row above it starts two bytes further along.
  movd  [dstq+stride3q ], m3
  psrldq              m3, 2                     ; A3 B3 A2 B2 A1 B1 C1 D1 ..
  movd  [dstq+strideq*2], m3
  psrldq              m3, 2                     ; A2 B2 A1 B1 C1 D1 ..
  movd  [dstq+strideq  ], m3
  psrldq              m3, 2                     ; A1 B1 C1 D1 ..
  movd  [dstq          ], m3
  RESTORE_GOT
  RET

; d153_predictor_8x8 -- register args: dst, stride, above, left
;
; Reads 8 bytes at left (l1..l8) and 8 bytes starting one byte before above
; (tl, t1..t7).  A = 2-tap average, B = 3-tap average down the left edge;
; C..H = 3-tap average along the top edge.  Output layout:
;   A1 B1 C1 D1 E1 F1 G1 H1
;   A2 B2 A1 B1 C1 D1 E1 F1
;   A3 B3 A2 B2 A1 B1 C1 D1
;   A4 B4 A3 B3 A2 B2 A1 B1
;   A5 B5 A4 B4 A3 B3 A2 B2
;   A6 B6 A5 B5 A4 B4 A3 B3
;   A7 B7 A6 B6 A5 B5 A4 B4
;   A8 B8 A7 B7 A6 B6 A5 B5
INIT_XMM ssse3
cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  movq                m0, [leftq]                     ; bytes 0-7:  l1-8
  movhps              m0, [aboveq-1]                  ; bytes 8-15: tl, t1-7
  ; Both pointers are consumed; stride3 takes the slot that held above.
  DEFINE_ARGS dst, stride, stride3
  lea           stride3q, [strideq*3]

  ; Only the low 8 bytes of each shuffled vector are meaningful.
  pshufb              m1, m0, [GLOBAL(sh_b76543210)]  ; l8-1        [byte]
  pshufb              m2, m0, [GLOBAL(sh_b65432108)]  ; l7-1, tl    [byte]
  pshufb              m3, m0, [GLOBAL(sh_b54321089)]  ; l6-1, tl, t1 [byte]
  pshufb              m0, [GLOBAL(sh_b89abcdef)]      ; tl, t1-7    [byte]
  psrldq              m4, m0, 1                       ; t1-7
  psrldq              m5, m0, 2                       ; t2-7

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7  ; 3-tap avg C-H1
  ; m0 is free from here on and is reused as the output of the next filter.
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0  ; 3-tap avg B8-1
  pavgb               m6, m1, m2                ; 2-tap avg A8-A1

  punpcklbw           m6, m0                    ; A-B8, A-B7 ... A-B2, A-B1

  ; rows 3..0: the A/B pairs run out and C1-H1 slide in from m7
  movhps [dstq+stride3q], m6                    ; A-B4, A-B3, A-B2, A-B1
  palignr             m0, m7, m6, 10            ; A-B3, A-B2, A-B1, C-H1
  movq  [dstq+strideq*2], m0
  psrldq              m0, 2                     ; A-B2, A-B1, C-H1
  movq  [dstq+strideq  ], m0
  psrldq              m0, 2                     ; A-H1
  movq  [dstq          ], m0

  ; rows 7..4: A/B pairs only
  lea               dstq, [dstq+strideq*4]
  movq  [dstq+stride3q ], m6                    ; A-B8, A-B7, A-B6, A-B5
  psrldq              m6, 2                     ; A-B7, A-B6, A-B5, A-B4
  movq  [dstq+strideq*2], m6
  psrldq              m6, 2                     ; A-B6, A-B5, A-B4, A-B3
  movq  [dstq+strideq  ], m6
  psrldq              m6, 2                     ; A-B5, A-B4, A-B3, A-B2
  movq  [dstq          ], m6
  RESTORE_GOT
  RET

; d153_predictor_16x16 -- register args: dst, stride, above, left
;
; Reads 16 bytes at left (aligned load) and 16 bytes starting one byte before
; above (unaligned load: tl, t1..t15).  A = 2-tap average and B = 3-tap
; average down the left edge (A1/B1 involve tl and t1); C1..P1 = 3-tap
; average along the top edge.  Row 0 is A1 B1 C1 .. P1; every following row
; is the previous row shifted right by two bytes with the next A/B pair
; inserted at the front:
;   A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1
;   A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1
;   A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1
;   ...
;   Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9
INIT_XMM ssse3
cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  mova                m0, [leftq]                       ; l1 .. l16
  movu                m7, [aboveq-1]                    ; tl, t1 .. t15

  ; Left edge.  Reversing the top row puts tl in byte 15 and t1 in byte 14,
  ; so palignr can prepend them to the left column.
  pshufb              m6, m7, [GLOBAL(sh_bfedcba9876543210)]
  palignr             m5, m0, m6, 15                    ; tl, l1 .. l15
  palignr             m3, m0, m6, 14                    ; t1, tl, l1 .. l14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4          ; 3-tap avg B1 - Bg
  pavgb               m5, m0                            ; 2-tap avg A1 - Ag

  ; Interleave; the byte order is reversed below to give A-B8 .. A-B1 and
  ; A-Bg .. A-B9.
  punpcklbw           m0, m4, m5                        ; pairs 1 .. 8
  punpckhbw           m4, m5                            ; pairs 9 .. g

  ; Top edge.  (An earlier shuffle into m1 was removed here: m1 is first
  ; written by the filter below and was never read before that.)
  pshufb              m3, m7, [GLOBAL(sh_b123456789abcdeff)]
  pshufb              m5, m7, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1          ; 3-tap avg C1-P1

  pshufb              m6, m0, [GLOBAL(sh_bfedcba9876543210)] ; A-B8 .. A-B1
  ; stride3 is declared in the slot that held above; above is consumed.
  DEFINE_ARGS dst, stride, stride3
  lea           stride3q, [strideq*3]

  ; rows 0-7: window over m6:m1, moving two bytes towards m6 per row
  palignr             m2, m1, m6, 14
  mova  [dstq          ], m2
  palignr             m2, m1, m6, 12
  mova  [dstq+strideq  ], m2
  palignr             m2, m1, m6, 10
  mova  [dstq+strideq*2], m2
  palignr             m2, m1, m6, 8
  mova  [dstq+stride3q ], m2
  lea               dstq, [dstq+strideq*4]
  palignr             m2, m1, m6, 6
  mova  [dstq          ], m2
  palignr             m2, m1, m6, 4
  mova  [dstq+strideq  ], m2
  palignr             m2, m1, m6, 2
  mova  [dstq+strideq*2], m2
  pshufb              m4, [GLOBAL(sh_bfedcba9876543210)] ; A-Bg .. A-B9
  mova  [dstq+stride3q ], m6
  lea               dstq, [dstq+strideq*4]

  ; rows 8-15: window over m4:m6
  palignr             m2, m6, m4, 14
  mova  [dstq          ], m2
  palignr             m2, m6, m4, 12
  mova  [dstq+strideq  ], m2
  palignr             m2, m6, m4, 10
  mova  [dstq+strideq*2], m2
  palignr             m2, m6, m4, 8
  mova  [dstq+stride3q ], m2
  lea               dstq, [dstq+strideq*4]
  palignr             m2, m6, m4, 6
  mova  [dstq          ], m2
  palignr             m2, m6, m4, 4
  mova  [dstq+strideq  ], m2
  palignr             m2, m6, m4, 2
  mova  [dstq+strideq*2], m2
  mova  [dstq+stride3q ], m4
  RESTORE_GOT
  RET

; ------------------------------------------
; Store helpers for d153_predictor_32x32.
;
; D153_32_ROW row, hi, mid, lo, tmp_hi, tmp_lo, shift
;   Writes one 32-byte row taken from the 48-byte sequence lo:mid:hi (lo
;   holds the lowest bytes), starting `shift` bytes into lo.  `row` is an
;   address expression without brackets.  Clobbers tmp_hi and tmp_lo.
; ------------------------------------------
%macro D153_32_ROW 7
  palignr                %5, %2, %3, %7
  palignr                %6, %3, %4, %7
  mova  [%1   ], %6
  mova  [%1+16], %5
%endmacro

; D153_32_ROWS8 hi, mid, lo, tmp_hi, tmp_lo
;   Writes eight rows through dstq/strideq/stride3q with shifts 14, 12, ..,
;   2, 0, i.e. each row starts two bytes earlier in lo:mid:hi than the row
;   above it.  Advances dstq by four rows once, between rows 3 and 4; the
;   caller advances it past rows 4-7.  Clobbers tmp_hi and tmp_lo.
%macro D153_32_ROWS8 5
  D153_32_ROW dstq,           %1, %2, %3, %4, %5, 14
  D153_32_ROW dstq+strideq,   %1, %2, %3, %4, %5, 12
  D153_32_ROW dstq+strideq*2, %1, %2, %3, %4, %5, 10
  D153_32_ROW dstq+stride3q,  %1, %2, %3, %4, %5, 8
  lea                  dstq, [dstq+strideq*4]
  D153_32_ROW dstq,           %1, %2, %3, %4, %5, 6
  D153_32_ROW dstq+strideq,   %1, %2, %3, %4, %5, 4
  D153_32_ROW dstq+strideq*2, %1, %2, %3, %4, %5, 2
  mova  [dstq+stride3q    ], %3
  mova  [dstq+stride3q+16 ], %2
%endmacro

; d153_predictor_32x32 -- register args: dst, stride, above, left
;
; Reads 32 bytes at left (aligned loads) and 32 bytes starting one byte
; before above (unaligned loads).  The top edge is 3-tap filtered; the left
; edge yields interleaved 2-tap/3-tap pairs (A/B), reversed so that the pair
; for the lowest row comes first.  Row 0 is the first A/B pair followed by
; the filtered top edge; each following row is the previous one shifted right
; by two bytes with the next A/B pair inserted at the front.
INIT_XMM ssse3
cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset
  GET_GOT     goffsetq
  mova                  m0, [leftq]                       ; left[0..15]
  movu                  m7, [aboveq-1]                    ; tl, t1 .. t15
  movu                  m1, [aboveq+15]                   ; t16 .. t31

  pshufb                m4, m1, [GLOBAL(sh_b123456789abcdeff)]
  pshufb                m6, m1, [GLOBAL(sh_b23456789abcdefff)]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2            ; m2 = 3-tap avg above [high]

  palignr               m3, m1, m7, 1
  palignr               m5, m1, m7, 2

  ; The raw high top bytes in m1 are no longer needed; m1 becomes the output.
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1            ; m1 = 3-tap avg above [low]

  ; Reversing the low top bytes puts tl in byte 15 and t1 in byte 14, ready
  ; to be prepended to the left column.
  pshufb                m7, [GLOBAL(sh_bfedcba9876543210)]
  palignr               m5, m0, m7, 15
  palignr               m3, m0, m7, 14

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4            ; 3-tap avg, left rows 0-15
  pavgb                 m5, m0                            ; 2-tap avg, left rows 0-15
  punpcklbw             m6, m4, m5                        ; pairs for rows 0-7
  punpckhbw             m4, m5                            ; pairs for rows 8-15
  pshufb                m6, [GLOBAL(sh_bfedcba9876543210)]
  pshufb                m4, [GLOBAL(sh_bfedcba9876543210)]

  ; above is consumed, so its slot becomes stride3; left is still needed.
  ; No name is declared for the fifth slot: it held goffset, so writing it
  ; would corrupt the GOT pointer in builds where GET_GOT is live.
  DEFINE_ARGS dst, stride, stride3, left
  lea             stride3q, [strideq*3]

  ; rows 0-7: m6 : m1 : m2
  D153_32_ROWS8 m2, m1, m6, m5, m7
  lea                  dstq, [dstq+strideq*4]

  ; rows 8-15: m4 : m6 : m1
  D153_32_ROWS8 m1, m6, m4, m5, m3
  lea                  dstq, [dstq+strideq*4]

  ; Left edge, rows 16-31.
  mova                   m7, [leftq]
  mova                   m3, [leftq+16]
  palignr                m5, m3, m7, 15                   ; left[15..30]
  palignr                m0, m3, m7, 14                   ; left[14..29]

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2            ; 3-tap avg, left rows 16-31
  pavgb                  m5, m3                           ; 2-tap avg, left rows 16-31
  punpcklbw              m3, m2, m5                       ; pairs for rows 16-23
  punpckhbw              m2, m5                           ; pairs for rows 24-31
  pshufb                 m3, [GLOBAL(sh_bfedcba9876543210)]
  pshufb                 m2, [GLOBAL(sh_bfedcba9876543210)]

  ; rows 16-23: m3 : m4 : m6
  D153_32_ROWS8 m6, m4, m3, m7, m0
  lea                  dstq, [dstq+strideq*4]

  ; rows 24-31: m2 : m3 : m4
  D153_32_ROWS8 m4, m3, m2, m7, m0

  RESTORE_GOT
  RET

; d207_predictor_8x8 -- register args: dst, stride, (third slot), left
;
; The third argument slot is named stride3 and is overwritten immediately;
; its incoming value is never read.  From the 8 bytes l[0..7] at left
; (indices past 7 reuse l[7]):
;   E[i] = avg(l[i], l[i+1])                        (2-tap, pavgb)
;   O[i] = (l[i] + 2*l[i+1] + l[i+2] + 2) >> 2      (3-tap)
; and V = E0 O0 E1 O1 .. E7 O7.  Row r is the 8 bytes of V starting at byte
; 2r; once V runs out the rows are filled with l[7].
INIT_XMM ssse3
cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset
  GET_GOT     goffsetq
  lea           stride3q, [strideq*3]
  movq                m3, [leftq]            ; abcdefgh [byte]

  pshufb              m0, m3, [GLOBAL(sh_b0123456777777777)]  ; l[i]
  pshufb              m2, m3, [GLOBAL(sh_b1234567777777777)]  ; l[i+1]
  pshufb              m1, m3, [GLOBAL(sh_b2345677777777777)]  ; l[i+2]

  ; The raw load in m3 is dead now, so m3 receives the 3-tap result.
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3                ; m3 = O
  pavgb               m0, m2                                  ; m0 = E
  punpcklbw           m0, m3                                  ; m0 = V

  ; rows 0-3
  movq  [dstq          ], m0
  psrldq              m0, 2
  movq  [dstq+strideq  ], m0
  psrldq              m0, 2
  movq  [dstq+strideq*2], m0
  psrldq              m0, 2
  movq  [dstq+stride3q ], m0
  lea               dstq, [dstq+strideq*4]

  ; psrldq shifts in zeros, so before going further fill the upper half with
  ; copies of its lowest word (the final E/O pair, both equal to h).
  pshufhw             m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh

  ; rows 4-7
  psrldq              m0, 2
  movq  [dstq          ], m0
  psrldq              m0, 2
  movq  [dstq+strideq  ], m0
  psrldq              m0, 2
  movq  [dstq+strideq*2], m0
  psrldq              m0, 2
  movq  [dstq+stride3q ], m0
  RESTORE_GOT
  RET

; d207_predictor_16x16 -- register args: dst, stride, (third slot), left
;
; The third argument slot is named stride3 and is overwritten immediately.
; From the 16 bytes l[0..15] at left (indices past 15 reuse l[15]):
;   E[i] = avg(l[i], l[i+1])                        (2-tap, pavgb)
;   O[i] = (l[i] + 2*l[i+1] + l[i+2] + 2) >> 2      (3-tap)
; and V = E0 O0 E1 O1 .. E15 O15 (32 bytes, m1 = low half, m4 = high half).
; Row r is the 16 bytes of V starting at byte 2r, padded with the last pair.
INIT_XMM ssse3
cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset
  GET_GOT     goffsetq
  lea           stride3q, [strideq*3]
  mova                m0, [leftq]            ; abcdefghijklmnop [byte]
  pshufb              m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp
  pshufb              m2, m0, [GLOBAL(sh_b23456789abcdefff)] ; cdefghijklmnoppp

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3               ; m3 = O (m2 clobbered)
  pavgb               m1, m0                 ; E: ab, bc, cd .. no, op, pp [byte]

  punpckhbw           m4, m1, m3             ; V, bytes 16-31
  punpcklbw           m1, m3                 ; V, bytes  0-15

  ; rows 0-7: 16-byte window over m4:m1; m3 is reused as scratch
  mova  [dstq          ], m1
  palignr             m3, m4, m1, 2
  mova  [dstq+strideq  ], m3
  palignr             m3, m4, m1, 4
  mova  [dstq+strideq*2], m3
  palignr             m3, m4, m1, 6
  mova  [dstq+stride3q ], m3
  lea               dstq, [dstq+strideq*4]
  palignr             m3, m4, m1, 8
  mova  [dstq          ], m3
  palignr             m3, m4, m1, 10
  mova  [dstq+strideq  ], m3
  palignr             m3, m4, m1, 12
  mova  [dstq+strideq*2], m3
  palignr             m3, m4, m1, 14
  mova  [dstq+stride3q ], m3

  ; rows 8-15: keep advancing the high half by one pair per row, repeating
  ; its last byte.  left is consumed, so its slot is reused as the counter.
  DEFINE_ARGS dst, stride, stride3, line
  mov              lined, 2
  mova                m0, [GLOBAL(sh_b23456789abcdefff)]     ; shift-by-two mask
.row_loop:
  lea               dstq, [dstq+strideq*4]
  mova  [dstq          ], m4
  pshufb              m4, m0
  mova  [dstq+strideq  ], m4
  pshufb              m4, m0
  mova  [dstq+strideq*2], m4
  pshufb              m4, m0
  mova  [dstq+stride3q ], m4
  pshufb              m4, m0
  dec              lined
  jnz .row_loop
  RESTORE_GOT
  REP_RET

; ------------------------------------------
; Store helpers for d207_predictor_32x32.  They use the register roles set
; up in that function: V = m1 : m6 : m2 : m7 (64 interleaved bytes, lowest
; first); m0 and m4 are scratch (m0 holds a shuffle mask for ROW_HI).
; In each macro %1 is the row address in the dstq walk and %2 the matching
; row address in the dst8q walk (eight rows further down); both are address
; expressions without brackets.
; ------------------------------------------

; D207_32_ROW_LO dst_row, dst8_row, shift
;   dst_row[0..31] = V[shift .. shift+31]; dst8_row[0..15] = V[shift+16 ..].
%macro D207_32_ROW_LO 3
  palignr             m0, m6, m1, %3
  palignr             m4, m2, m6, %3
  mova  [%1   ], m0
  mova  [%1+16], m4
  mova  [%2   ], m4
%endmacro

; D207_32_ROW_MID dst_row, dst8_row, shift
;   dst_row[16..31] = dst8_row[0..15] = 16 bytes of m7:m2 starting at shift.
%macro D207_32_ROW_MID 3
  palignr             m4, m7, m2, %3
  mova  [%1+16], m4
  mova  [%2   ], m4
%endmacro

; D207_32_ROW_HI dst_row, dst8_row
;   dst_row[16..31] = dst8_row[0..15] = m7, then m7 is advanced by two bytes
;   (last byte repeated) using the mask in m0.
%macro D207_32_ROW_HI 2
  mova  [%1+16], m7
  mova  [%2   ], m7
  pshufb              m7, m0
%endmacro

; d207_predictor_32x32 -- register args: dst, stride, (third slot), left
;
; The third argument slot is named stride3 and is overwritten immediately.
; From the 32 bytes l[0..31] at left (indices past 31 reuse l[31]):
;   E[i] = avg(l[i], l[i+1])                        (2-tap, pavgb)
;   O[i] = (l[i] + 2*l[i+1] + l[i+2] + 2) >> 2      (3-tap)
; and V = E0 O0 E1 O1 .. E31 O31.  Row r is the 32 bytes of V starting at
; byte 2r, padded with the last pair.  Because row r+8 is row r advanced by
; 16 bytes, each right-half store is paired with a left-half store eight
; rows further down.
INIT_XMM ssse3
cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
  GET_GOT     goffsetq
  lea           stride3q, [strideq*3]
  mova                m1, [leftq]              ;  0-15 [byte]
  mova                m2, [leftq+16]           ; 16-31 [byte]
  pshufb              m4, m2, [GLOBAL(sh_b123456789abcdeff)]  ; l[i+1], high
  pshufb              m0, m2, [GLOBAL(sh_b23456789abcdefff)]  ; l[i+2], high
  palignr             m6, m2, m1, 1                           ; l[i+1], low
  palignr             m5, m2, m1, 2                           ; l[i+2], low

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3                ; m3 = O high (m0 clobbered)
  pavgb               m2, m4                                  ; m2 = E high

  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0                ; m0 = O low (m5 clobbered)
  pavgb               m1, m6                                  ; m1 = E low

  punpckhbw           m6, m1, m0               ; V[16..31]
  punpcklbw           m1, m0                   ; V[ 0..15]

  punpckhbw           m7, m2, m3               ; V[48..63]
  punpcklbw           m2, m3                   ; V[32..47]

  ; left is consumed; its slot becomes the second row pointer.
  DEFINE_ARGS dst, stride, stride3, dst8
  lea              dst8q, [dstq+strideq*8]

  ; rows 0-7 in full, plus the left half of rows 8-15
  mova  [dstq              ], m1
  mova  [dstq           +16], m6
  mova  [dst8q             ], m6
  D207_32_ROW_LO dstq +strideq,   dst8q+strideq,   2
  D207_32_ROW_LO dstq +strideq*2, dst8q+strideq*2, 4
  D207_32_ROW_LO dstq +stride3q,  dst8q+stride3q,  6
  lea               dstq, [dstq +strideq*4]
  lea              dst8q, [dst8q+strideq*4]
  D207_32_ROW_LO dstq,            dst8q,           8
  D207_32_ROW_LO dstq +strideq,   dst8q+strideq,   10
  D207_32_ROW_LO dstq +strideq*2, dst8q+strideq*2, 12
  D207_32_ROW_LO dstq +stride3q,  dst8q+stride3q,  14
  lea               dstq, [dstq+strideq*4]
  lea              dst8q, [dst8q+strideq*4]

  ; right half of rows 8-15 and left half of rows 16-23
  mova  [dstq           +16], m2
  mova  [dst8q             ], m2
  D207_32_ROW_MID dstq +strideq,   dst8q+strideq,   2
  D207_32_ROW_MID dstq +strideq*2, dst8q+strideq*2, 4
  D207_32_ROW_MID dstq +stride3q,  dst8q+stride3q,  6
  lea               dstq, [dstq+strideq*4]
  lea              dst8q, [dst8q+strideq*4]
  D207_32_ROW_MID dstq,            dst8q,           8
  D207_32_ROW_MID dstq +strideq,   dst8q+strideq,   10
  D207_32_ROW_MID dstq +strideq*2, dst8q+strideq*2, 12
  D207_32_ROW_MID dstq +stride3q,  dst8q+stride3q,  14
  lea               dstq, [dstq+strideq*4]
  lea              dst8q, [dst8q+strideq*4]

  ; right half of rows 16-23 and left half of rows 24-31
  mova                m0, [GLOBAL(sh_b23456789abcdefff)]
  D207_32_ROW_HI dstq,            dst8q
  D207_32_ROW_HI dstq +strideq,   dst8q+strideq
  D207_32_ROW_HI dstq +strideq*2, dst8q+strideq*2
  D207_32_ROW_HI dstq +stride3q,  dst8q+stride3q
  lea               dstq, [dstq+strideq*4]
  lea              dst8q, [dst8q+strideq*4]
  D207_32_ROW_HI dstq,            dst8q
  D207_32_ROW_HI dstq +strideq,   dst8q+strideq
  D207_32_ROW_HI dstq +strideq*2, dst8q+strideq*2
  D207_32_ROW_HI dstq +stride3q,  dst8q+stride3q
  lea               dstq, [dstq+strideq*4]

  ; right half of rows 24-31: m7 has been advanced by 16 bytes in total, so
  ; every byte of it is the last byte of V
  mova  [dstq           +16], m7
  mova  [dstq +strideq  +16], m7
  mova  [dstq +strideq*2+16], m7
  mova  [dstq +stride3q +16], m7
  lea               dstq, [dstq+strideq*4]
  mova  [dstq           +16], m7
  mova  [dstq +strideq  +16], m7
  mova  [dstq +strideq*2+16], m7
  mova  [dstq +stride3q +16], m7

  ; done!
  RESTORE_GOT
  RET
