1;
2;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12    EXPORT  |vp8_intra4x4_predict_armv6|    ; make the entry point visible to the linker
13
14    ARM                                      ; assemble what follows as ARM (not Thumb) code
15    REQUIRE8                                 ; declares that this file needs 8-byte stack alignment
16    PRESERVE8                                ; declares that this file keeps 8-byte stack alignment
17
18    AREA ||.text||, CODE, READONLY, ALIGN=2  ; read-only code section, 2^2 = 4-byte aligned
19
20
21;void vp8_intra4x4_predict_armv6(unsigned char *Above, unsigned char *yleft,
22;                                int left_stride, B_PREDICTION_MODE b_mode,
23;                                unsigned char *dst, int dst_stride,
24;                                unsigned char top_left)
25
; Entry point and mode dispatch for the 4x4 intra predictors in this file.
; b_mode 0..9 selects one of the ten predictors in the order of the branch
; table below; each predictor writes four 4-byte rows to dst (rows are
; dst_stride bytes apart) and returns. Any other b_mode value returns
; immediately without writing to dst.
;
; The left neighbours are read as yleft[0], yleft[left_stride],
; yleft[2*left_stride], yleft[3*left_stride].
;
26; r0: *Above
27; r1: *yleft
28; r2: left_stride
29; r3: b_mode
30; sp + #40: dst          (offsets are relative to sp after the push below)
31; sp + #44: dst_stride
32; sp + #48: top_left
33|vp8_intra4x4_predict_armv6| PROC
34    push        {r4-r12, lr}             ; 10 registers = 40 bytes
35
    ; In ARM state, reading pc yields the address of the current instruction
    ; plus 8, which here is the first entry of the branch table. Adding
    ; b_mode * 4 therefore lands on table entry b_mode.
    ;
    ; The bounds check must be unsigned (lo): with a signed check (lt) a
    ; negative b_mode would pass and branch to an address before the table.
    ; With lo, negative values compare as large unsigned numbers and fall
    ; through to the default return.
36    cmp         r3, #10
37    addlo       pc, pc, r3, lsl #2       ; position independent switch
38    pop         {r4-r12, pc}             ; default: b_mode out of range, just return
39    b           b_dc_pred                ; b_mode 0
40    b           b_tm_pred                ; b_mode 1
41    b           b_ve_pred                ; b_mode 2
42    b           b_he_pred                ; b_mode 3
43    b           b_ld_pred                ; b_mode 4
44    b           b_rd_pred                ; b_mode 5
45    b           b_vr_pred                ; b_mode 6
46    b           b_vl_pred                ; b_mode 7
47    b           b_hd_pred                ; b_mode 8
48    b           b_hu_pred                ; b_mode 9
49
50b_dc_pred
    ; DC prediction: every output byte is
    ;   (Above[0] + .. + Above[3] + Left[0] + .. + Left[3] + 4) >> 3
    ; In: r0 = Above, r1 = yleft, r2 = left_stride; dst/dst_stride on the stack.
51    ; load values
52    ldr         r8, [r0]                 ; Above
53    ldrb        r4, [r1], r2             ; Left[0]
54    mov         r9, #0
55    ldrb        r5, [r1], r2             ; Left[1]
56    ldrb        r6, [r1], r2             ; Left[2]
57    usad8       r12, r8, r9              ; sum of |Above[i] - 0| = sum of the four Above bytes
58    ldrb        r7, [r1]                 ; Left[3]
59
60    ; calculate dc
61    add         r4, r4, r5
62    add         r4, r4, r6
63    add         r4, r4, r7
64    add         r4, r4, r12
65    add         r4, r4, #4               ; rounding term
66    ldr         r0, [sp, #44]           ; dst_stride
67    mov         r12, r4, asr #3         ; (expected_dc + 4) >> 3
68
    ; The result is at most (8*255 + 4) >> 3 = 255, so it fits one byte and
    ; can be replicated into all four bytes with shifted adds.
69    add         r12, r12, r12, lsl #8    ; dc in bytes 0-1
70    ldr         r3, [sp, #40]           ; dst
71    add         r12, r12, r12, lsl #16   ; dc in bytes 0-3
72
73    ; store values
74    str         r12, [r3], r0            ; row 0, then dst += dst_stride
75    str         r12, [r3], r0            ; row 1
76    str         r12, [r3], r0            ; row 2
77    str         r12, [r3]                ; row 3
78
79    pop        {r4-r12, pc}
80
81b_tm_pred
    ; TM prediction:
    ;   dst[r][c] = clamp(Left[r] + Above[c] - top_left, 0, 255)
    ; In: r0 = Above, r1 = yleft, r2 = left_stride; the rest on the stack.
    ;
    ; Above[c] - top_left is computed once as signed 16-bit lanes
    ; (r10 = columns 2 and 0, r11 = columns 3 and 1) and reused for all rows.
    ; The store of each row is interleaved with the arithmetic of the next.
    ; Lane comments list the most significant lane first.
82    ldr         r8, [r0]                ; Above
83    ldrb        r9, [sp, #48]           ; top_left
84    ldrb        r4, [r1], r2            ; Left[0]
85    ldrb        r5, [r1], r2            ; Left[1]
86    ldrb        r6, [r1], r2            ; Left[2]
87    ldrb        r7, [r1]                ; Left[3]
88    ldr         r0, [sp, #44]           ; dst_stride
89    ldr         r3, [sp, #40]           ; dst
90
91    add         r9, r9, r9, lsl #16     ; [tl|tl]
92    uxtb16      r10, r8                 ; a[2|0]
93    uxtb16      r11, r8, ror #8         ; a[3|1]
94    ssub16      r10, r10, r9            ; a[2|0] - [tl|tl]
95    ssub16      r11, r11, r9            ; a[3|1] - [tl|tl]
96
97    add         r4, r4, r4, lsl #16     ; l[0|0]
98    add         r5, r5, r5, lsl #16     ; l[1|1]
99    add         r6, r6, r6, lsl #16     ; l[2|2]
100    add         r7, r7, r7, lsl #16     ; l[3|3]
101
    ; r1 and r2 are free to use as scratch from here: all Left[] loads are done.
    ; row 0
102    sadd16      r1, r4, r10             ; l[0|0] + a[2|0] - [tl|tl]
103    sadd16      r2, r4, r11             ; l[0|0] + a[3|1] - [tl|tl]
104    usat16      r1, #8, r1              ; clamp each lane to 0..255
105    usat16      r2, #8, r2
106
    ; row 1 (sums only; clamped after row 0 has been stored)
107    sadd16      r4, r5, r10             ; l[1|1] + a[2|0] - [tl|tl]
108    sadd16      r5, r5, r11             ; l[1|1] + a[3|1] - [tl|tl]
109
110    add         r12, r1, r2, lsl #8     ; [3|2|1|0]
111    str         r12, [r3], r0           ; store row 0
112
113    usat16      r4, #8, r4
114    usat16      r5, #8, r5
115
    ; row 2
116    sadd16      r1, r6, r10             ; l[2|2] + a[2|0] - [tl|tl]
117    sadd16      r2, r6, r11             ; l[2|2] + a[3|1] - [tl|tl]
118
119    add         r12, r4, r5, lsl #8     ; [3|2|1|0]
120    str         r12, [r3], r0           ; store row 1
121
122    usat16      r1, #8, r1
123    usat16      r2, #8, r2
124
    ; row 3
125    sadd16      r4, r7, r10             ; l[3|3] + a[2|0] - [tl|tl]
126    sadd16      r5, r7, r11             ; l[3|3] + a[3|1] - [tl|tl]
127
128    add         r12, r1, r2, lsl #8     ; [3|2|1|0]
129
130    usat16      r4, #8, r4
131    usat16      r5, #8, r5
132
133    str         r12, [r3], r0           ; store row 2
134
135    add         r12, r4, r5, lsl #8     ; [3|2|1|0]
136    str         r12, [r3]               ; store row 3
137
138    pop        {r4-r12, pc}
139
140b_ve_pred
    ; VE prediction: all four rows receive the same four bytes,
    ;   out[c] = (a[c-1] + 2*a[c] + a[c+1] + 2) >> 2,  c = 0..3,
    ; where a[-1] = top_left. Reads Above[0..4].
    ; In: r0 = Above. r1 (yleft) is not read here.
    ;
    ; Even columns are accumulated in r9, odd columns in r4, as two 16-bit
    ; lanes each. Lane comments list the most significant lane first.
141    ldr         r8, [r0]                ; a[3|2|1|0]
142    ldr         r11, c00FF00FF
143    ldrb        r9, [sp, #48]           ; top_left
144    ldrb        r10, [r0, #4]           ; a[4]
145
146    ldr         r0, c00020002           ; r0 no longer needed as Above pointer
147
148    uxtb16      r4, r8                  ; a[2|0]
149    uxtb16      r5, r8, ror #8          ; a[3|1]
150    ldr         r2, [sp, #44]           ; dst_stride
151    pkhbt       r9, r9, r5, lsl #16     ; a[1|-1]
152
153    add         r9, r9, r4, lsl #1      ;[a[1]+2*a[2]       | tl+2*a[0]       ]
154    uxtab16     r9, r9, r5              ;[a[1]+2*a[2]+a[3]  | tl+2*a[0]+a[1]  ]
155    ldr         r3, [sp, #40]           ; dst
156    uxtab16     r9, r9, r0              ;[a[1]+2*a[2]+a[3]+2| tl+2*a[0]+a[1]+2]
157
158    add         r0, r0, r10, lsl #16    ;[a[4]+2            |                 2]
159    add         r0, r0, r4, asr #16     ;[a[4]+2            |            a[2]+2]
160    add         r0, r0, r5, lsl #1      ;[a[4]+2*a[3]+2     |     a[2]+2*a[1]+2]
161    uadd16      r4, r4, r0              ;[a[4]+2*a[3]+a[2]+2|a[2]+2*a[1]+a[0]+2]
162
    ; Shift the whole register right by 2, then mask each halfword to 8 bits;
    ; the mask also discards the bits that crossed from the upper lane.
163    and         r9, r11, r9, asr #2     ; out[2|0]
164    and         r4, r11, r4, asr #2     ; out[3|1]
165    add         r9, r9, r4, lsl #8      ; out[3|2|1|0]
166
167    ; store values
168    str         r9, [r3], r2
169    str         r9, [r3], r2
170    str         r9, [r3], r2
171    str         r9, [r3]
172
173    pop        {r4-r12, pc}
174
175
176b_he_pred
    ; HE prediction: row r is filled with a single value,
    ;   row0 = (tl   + 2*l[0] + l[1] + 2) >> 2
    ;   row1 = (l[0] + 2*l[1] + l[2] + 2) >> 2
    ;   row2 = (l[1] + 2*l[2] + l[3] + 2) >> 2
    ;   row3 = (l[2] + 2*l[3] + l[3] + 2) >> 2
    ; In: r1 = yleft, r2 = left_stride. r0 (Above) is not read here.
177    ldrb        r4, [r1], r2            ; Left[0]
178    ldrb        r8, [sp, #48]           ; top_left
179    ldrb        r5, [r1], r2            ; Left[1]
180    ldrb        r6, [r1], r2            ; Left[2]
181    ldrb        r7, [r1]                ; Left[3]
182
183    add         r8, r8, r4              ; tl   + l[0]
184    add         r9, r4, r5              ; l[0] + l[1]
185    add         r10, r5, r6             ; l[1] + l[2]
186    add         r11, r6, r7             ; l[2] + l[3]
187
188    mov         r0, #2<<14              ; rounding term 2, pre-shifted like the sums below
189
190    add         r8, r8, r9              ; tl + 2*l[0] + l[1]
191    add         r4, r9, r10             ; l[0] + 2*l[1] + l[2]
192    add         r5, r10, r11            ; l[1] + 2*l[2] + l[3]
193    add         r6, r11, r7, lsl #1     ; l[2] + 2*l[3] + l[3]
194
195
    ; (sum + 2) << 14 places (sum + 2) >> 2 in bits 16 and up.
196    add         r8, r0, r8, lsl #14     ; (tl + 2*l[0] + l[1] + 2)>>2 in top half
197    add         r9, r0, r4, lsl #14     ; (l[0] + 2*l[1] + l[2] + 2)>>2 in top half
198    add         r10,r0, r5, lsl #14     ; (l[1] + 2*l[2] + l[3] + 2)>>2 in top half
199    add         r11,r0, r6, lsl #14     ; (l[2] + 2*l[3] + l[3] + 2)>>2 in top half
200
    ; Copy the top halfword into the bottom halfword (drops the fraction bits).
201    pkhtb       r8, r8, r8, asr #16     ; l[-|0|-|0]
202    pkhtb       r9, r9, r9, asr #16     ; l[-|1|-|1]
203    pkhtb       r10, r10, r10, asr #16  ; l[-|2|-|2]
204    pkhtb       r11, r11, r11, asr #16  ; l[-|3|-|3]
205
206    ldr         r0, [sp, #44]           ; dst_stride
207    ldr         r3, [sp, #40]           ; dst
208
209    add         r8, r8, r8, lsl #8      ; l[0|0|0|0]
210    add         r9, r9, r9, lsl #8      ; l[1|1|1|1]
211    add         r10, r10, r10, lsl #8   ; l[2|2|2|2]
212    add         r11, r11, r11, lsl #8   ; l[3|3|3|3]
213
214    ; store values
215    str         r8, [r3], r0
216    str         r9, [r3], r0
217    str         r10, [r3], r0
218    str         r11, [r3]
219
220    pop        {r4-r12, pc}
221
222b_ld_pred
    ; LD prediction, computed from Above[0..7] only:
    ;   o[i] = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2   for i = 0..5
    ;   o[6] = (a[6] + 2*a[7]   + a[7]   + 2) >> 2
    ; Row r of dst is o[r], o[r+1], o[r+2], o[r+3].
    ; In: r0 = Above. r1/r2 (yleft, left_stride) are not read here.
    ;
    ; lr is used as a scratch register; the return address is restored from
    ; the stack by the final pop. Lane comments list the most significant
    ; lane first.
223    ldr         r4, [r0]                ; Above[0-3]
224    ldr         r12, c00020002
225    ldr         r5, [r0, #4]            ; Above[4-7]
226    ldr         lr,  c00FF00FF
227
228    uxtb16      r6, r4                  ; a[2|0]
229    uxtb16      r7, r4, ror #8          ; a[3|1]
230    uxtb16      r8, r5                  ; a[6|4]
231    uxtb16      r9, r5, ror #8          ; a[7|5]
232    pkhtb       r10, r6, r8             ; a[2|4]
233    pkhtb       r11, r7, r9             ; a[3|5]
234
235    add         r4, r6, r7, lsl #1      ; [a2+2*a3      |      a0+2*a1]
236    add         r4, r4, r10, ror #16    ; [a2+2*a3+a4   |   a0+2*a1+a2]
237    uxtab16     r4, r4, r12             ; [a2+2*a3+a4+2 | a0+2*a1+a2+2]
238
    ; ror #15 swaps the halfwords of a[2|4] and doubles them in one step.
239    add         r5, r7, r10, ror #15    ; [a3+2*a4      |      a1+2*a2]
240    add         r5, r5, r11, ror #16    ; [a3+2*a4+a5   |   a1+2*a2+a3]
241    uxtab16     r5, r5, r12             ; [a3+2*a4+a5+2 | a1+2*a2+a3+2]
242
243    pkhtb       r7, r9, r8, asr #16     ; a[7|6]
244    add         r6, r8, r9, lsl #1      ; [a6+2*a7      |      a4+2*a5]
245    uadd16      r6, r6, r7              ; [a6+2*a7+a7   |   a4+2*a5+a6]
246    uxtab16     r6, r6, r12             ; [a6+2*a7+a7+2 | a4+2*a5+a6+2]
247
    ; o[5] is computed on its own in the low halfword of r7.
248    uxth        r7, r9                  ; [                         a5]
249    add         r7, r7, r8, asr #15     ; [                    a5+2*a6]  (the a4 byte shifts out to 0)
250    add         r7, r7, r9, asr #16     ; [                 a5+2*a6+a7]
251    uxtah       r7, r7, r12             ; [               a5+2*a6+a7+2]
252
253    ldr         r0, [sp, #44]           ; dst_stride
254    ldr         r3, [sp, #40]           ; dst
255
256    ; scale down
257    and         r4, lr, r4, asr #2      ; o[2|0]
258    and         r5, lr, r5, asr #2      ; o[3|1]
259    and         r6, lr, r6, asr #2      ; o[6|4]
260    mov         r7, r7, asr #2          ; o[5]
261
262    add         r8, r4, r5, lsl #8      ; [3|2|1|0]
263    str         r8, [r3], r0
264
    ; Each following row drops the lowest byte of the previous row and
    ; inserts the next o[] value as the top byte.
265    mov         r9, r8, lsr #8
266    add         r9, r9, r6, lsl #24     ; [4|3|2|1]
267    str         r9, [r3], r0
268
269    mov         r10, r9, lsr #8
270    add         r10, r10, r7, lsl #24   ; [5|4|3|2]
271    str         r10, [r3], r0
272
273    mov         r6, r6, lsr #16         ; o[6]
274    mov         r11, r10, lsr #8
275    add         r11, r11, r6, lsl #24   ; [6|5|4|3]
276    str         r11, [r3]
277
278    pop        {r4-r12, pc}
279
280b_rd_pred
    ; RD prediction. The nine edge pixels are treated as one array
    ;   pp[0..8] = l[3], l[2], l[1], l[0], tl, a[0], a[1], a[2], a[3]
    ; and filtered as
    ;   o[i] = (pp[i] + 2*pp[i+1] + pp[i+2] + 2) >> 2,  i = 0..6.
    ; Row r of dst is o[3-r], o[4-r], o[5-r], o[6-r].
    ; In: r0 = Above, r1 = yleft, r2 = left_stride; the rest on the stack.
    ;
    ; lr is used as a scratch register; the return address is restored from
    ; the stack by the final pop. Lane comments list the most significant
    ; lane first.
281    ldrb        r7, [r1], r2            ; l[0] = pp[3]
282    ldr         lr, [r0]                ; Above = pp[8|7|6|5]
283    ldrb        r8, [sp, #48]           ; tl   = pp[4]
284    ldrb        r6, [r1], r2            ; l[1] = pp[2]
285    ldrb        r5, [r1], r2            ; l[2] = pp[1]
286    ldrb        r4, [r1], r2            ; l[3] = pp[0]  (post-increment unused: r1 is not read again)
287
288
289    uxtb16      r9, lr                  ; p[7|5]
290    uxtb16      r10, lr, ror #8         ; p[8|6]
291    add         r4, r4, r6, lsl #16     ; p[2|0]
292    add         r5, r5, r7, lsl #16     ; p[3|1]
293    add         r6, r6, r8, lsl #16     ; p[4|2]
294    pkhbt       r7, r7, r9, lsl #16     ; p[5|3]
295    pkhbt       r8, r8, r10, lsl #16    ; p[6|4]
296
297    ldr         r12, c00020002
298    ldr         lr,  c00FF00FF
299
300    add         r4, r4, r5, lsl #1      ; [p2+2*p3      |      p0+2*p1]
301    add         r4, r4, r6              ; [p2+2*p3+p4   |   p0+2*p1+p2]
302    uxtab16     r4, r4, r12             ; [p2+2*p3+p4+2 | p0+2*p1+p2+2]
303
304    add         r5, r5, r6, lsl #1      ; [p3+2*p4      |      p1+2*p2]
305    add         r5, r5, r7              ; [p3+2*p4+p5   |   p1+2*p2+p3]
306    uxtab16     r5, r5, r12             ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
307
308    add         r6, r7, r8, lsl #1      ; [p5+2*p6      |      p3+2*p4]
309    add         r6, r6, r9              ; [p5+2*p6+p7   |   p3+2*p4+p5]
310    uxtab16     r6, r6, r12             ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
311
312    add         r7, r8, r9, lsl #1      ; [p6+2*p7      |      p4+2*p5]
313    add         r7, r7, r10             ; [p6+2*p7+p8   |   p4+2*p5+p6]
314    uxtab16     r7, r7, r12             ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
315
316    ldr         r0, [sp, #44]           ; dst_stride
317    ldr         r3, [sp, #40]           ; dst
318
319    ; scale down
320    and         r7, lr, r7, asr #2      ; o[6|4]
321    and         r6, lr, r6, asr #2      ; o[5|3]
322    and         r5, lr, r5, asr #2      ; o[3|1]
323    and         r4, lr, r4, asr #2      ; o[2|0]
324
325    add         r8, r6, r7, lsl #8      ; [6|5|4|3]
326    str         r8, [r3], r0
327
    ; Each following row shifts the previous row up one byte and inserts the
    ; next lower o[] value as byte 0.
328    mov         r9, r8, lsl #8          ; [5|4|3|-]
329    uxtab       r9, r9, r4, ror #16     ; [5|4|3|2]
330    str         r9, [r3], r0
331
332    mov         r10, r9, lsl #8         ; [4|3|2|-]
333    uxtab       r10, r10, r5            ; [4|3|2|1]
334    str         r10, [r3], r0
335
336    mov         r11, r10, lsl #8        ; [3|2|1|-]
337    uxtab       r11, r11, r4            ; [3|2|1|0]
338    str         r11, [r3]
339
340    pop        {r4-r12, pc}
341
342b_vr_pred
    ; VR prediction. Edge pixels as one array:
    ;   pp[0..8] = l[3], l[2], l[1], l[0], tl, a[0], a[1], a[2], a[3]
    ; Lettered intermediate values used in the comments below:
    ;   A = (p1+2*p2+p3+2)>>2   B = (p3+2*p4+p5+2)>>2
    ;   C = (p2+2*p3+p4+2)>>2   D = (p4+2*p5+p6+2)>>2
    ;   E = (p4+p5+1)>>1        F = (p6+p7+1)>>1
    ;   G = (p3+2*p4+p5+2)>>2   H = (p5+2*p6+p7+2)>>2
    ;   I = (p5+p6+1)>>1        J = (p7+p8+1)>>1
    ;   K = (p4+2*p5+p6+2)>>2   L = (p6+2*p7+p8+2)>>2
    ; Rows written (byte 0 first):
    ;   row0 = E I F J,  row1 = G K H L,  row2 = C E I F,  row3 = A B D H
    ; In: r0 = Above, r1 = yleft, r2 = left_stride; the rest on the stack.
    ;
    ; lr is used as a scratch register; the return address is restored from
    ; the stack by the final pop. Lane comments list the most significant
    ; lane first.
343    ldrb        r7, [r1], r2            ; l[0] = pp[3]
344    ldr         lr, [r0]                ; Above = pp[8|7|6|5]
345    ldrb        r8, [sp, #48]           ; tl   = pp[4]
346    ldrb        r6, [r1], r2            ; l[1] = pp[2]
347    ldrb        r5, [r1], r2            ; l[2] = pp[1]
348    ldrb        r4, [r1]                ; l[3] = pp[0]  (value unused: r4 is overwritten below before being read)
349
350    add         r5, r5, r7, lsl #16     ; p[3|1]
351    add         r6, r6, r8, lsl #16     ; p[4|2]
352    uxtb16      r9, lr                  ; p[7|5]
353    uxtb16      r10, lr, ror #8         ; p[8|6]
354    pkhbt       r7, r7, r9, lsl #16     ; p[5|3]
355    pkhbt       r8, r8, r10, lsl #16    ; p[6|4]
356
357    ldr         r4,  c00010001
358    ldr         r12, c00020002
359    ldr         lr,  c00FF00FF
360
361    add         r5, r5, r6, lsl #1      ; [p3+2*p4      |      p1+2*p2]
362    add         r5, r5, r7              ; [p3+2*p4+p5   |   p1+2*p2+p3]
363    uxtab16     r5, r5, r12             ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
364
365    add         r6, r6, r7, lsl #1      ; [p4+2*p5      |      p2+2*p3]
366    add         r6, r6, r8              ; [p4+2*p5+p6   |   p2+2*p3+p4]
367    uxtab16     r6, r6, r12             ; [p4+2*p5+p6+2 | p2+2*p3+p4+2]
368
    ; Two-tap averages: add the pair, then a halving add with 1 gives (x+y+1)>>1.
369    uadd16      r11, r8, r9             ; [p6+p7        |        p4+p5]
370    uhadd16     r11, r11, r4            ; [(p6+p7+1)>>1 | (p4+p5+1)>>1]
371                                        ; [F|E]
372
373    add         r7, r7, r8, lsl #1      ; [p5+2*p6      |      p3+2*p4]
374    add         r7, r7, r9              ; [p5+2*p6+p7   |   p3+2*p4+p5]
375    uxtab16     r7, r7, r12             ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
376
    ; r2 (left_stride) is free to reuse: all Left[] loads are done.
377    uadd16      r2, r9, r10             ; [p7+p8        |        p5+p6]
378    uhadd16     r2, r2, r4              ; [(p7+p8+1)>>1 | (p5+p6+1)>>1]
379                                        ; [J|I]
380
381    add         r8, r8, r9, lsl #1      ; [p6+2*p7      |      p4+2*p5]
382    add         r8, r8, r10             ; [p6+2*p7+p8   |   p4+2*p5+p6]
383    uxtab16     r8, r8, r12             ; [p6+2*p7+p8+2 | p4+2*p5+p6+2]
384
385    ldr         r0, [sp, #44]           ; dst_stride
386    ldr         r3, [sp, #40]           ; dst
387
388    ; scale down
389    and         r5, lr, r5, asr #2      ; [B|A]
390    and         r6, lr, r6, asr #2      ; [D|C]
391    and         r7, lr, r7, asr #2      ; [H|G]
392    and         r8, lr, r8, asr #2      ; [L|K]
393
394    add         r12, r11, r2, lsl #8    ; [J|F|I|E]
395    str         r12, [r3], r0           ; row 0
396
397    add         r12, r7, r8, lsl #8     ; [L|H|K|G]
398    str         r12, [r3], r0           ; row 1
399
400    pkhbt       r2, r6, r2, lsl #16     ; [-|I|-|C]
401    add         r2, r2, r11, lsl #8     ; [F|I|E|C]
402
403    pkhtb       r12, r6, r5             ; [-|D|-|A]
404    pkhtb       r10, r7, r5, asr #16    ; [-|H|-|B]
405    str         r2, [r3], r0            ; row 2
406    add         r12, r12, r10, lsl #8   ; [H|D|B|A]
407    str         r12, [r3]               ; row 3
408
409    pop        {r4-r12, pc}
410
411b_vl_pred
    ; VL prediction, computed from Above[0..7] only (p[i] = Above[i]).
    ; Lettered intermediate values used in the comments below:
    ;   A = (p0+p1+1)>>1        B = (p2+p3+1)>>1
    ;   C = (p0+2*p1+p2+2)>>2   D = (p2+2*p3+p4+2)>>2
    ;   E = (p1+p2+1)>>1        F = (p3+p4+1)>>1
    ;   G = (p1+2*p2+p3+2)>>2   H = (p3+2*p4+p5+2)>>2
    ;   I = (p4+2*p5+p6+2)>>2   J = (p5+2*p6+p7+2)>>2
    ; Rows written (byte 0 first):
    ;   row0 = A E B F,  row1 = C G D H,  row2 = E B F I,  row3 = G D H J
    ; In: r0 = Above. r1 (yleft) is not read here.
    ;
    ; lr is used as a scratch register; the return address is restored from
    ; the stack by the final pop. Lane comments list the most significant
    ; lane first.
412    ldr         r4, [r0]                ; [3|2|1|0] = Above[0-3]
413    ldr         r12, c00020002
414    ldr         r5, [r0, #4]            ; [7|6|5|4] = Above[4-7]
415    ldr         lr,  c00FF00FF
416    ldr         r2,  c00010001
417
418    mov         r0, r4, lsr #16         ; [-|-|3|2]
419    add         r0, r0, r5, lsl #16     ; [5|4|3|2]
420    uxtb16      r6, r4                  ; [2|0]
421    uxtb16      r7, r4, ror #8          ; [3|1]
422    uxtb16      r8, r0                  ; [4|2]
423    uxtb16      r9, r0, ror #8          ; [5|3]
424    uxtb16      r10, r5                 ; [6|4]
425    uxtb16      r11, r5, ror #8         ; [7|5]
426
    ; Two-tap averages: add the pair, then a halving add with 1 gives (x+y+1)>>1.
427    uadd16      r4, r6, r7              ; [p2+p3        |        p0+p1]
428    uhadd16     r4, r4, r2              ; [(p2+p3+1)>>1 | (p0+p1+1)>>1]
429                                        ; [B|A]
430
431    add         r5, r6, r7, lsl #1      ; [p2+2*p3      |      p0+2*p1]
432    add         r5, r5, r8              ; [p2+2*p3+p4   |   p0+2*p1+p2]
433    uxtab16     r5, r5, r12             ; [p2+2*p3+p4+2 | p0+2*p1+p2+2]
434
435    uadd16      r6, r7, r8              ; [p3+p4        |        p1+p2]
436    uhadd16     r6, r6, r2              ; [(p3+p4+1)>>1 | (p1+p2+1)>>1]
437                                        ; [F|E]
438
439    add         r7, r7, r8, lsl #1      ; [p3+2*p4      |      p1+2*p2]
440    add         r7, r7, r9              ; [p3+2*p4+p5   |   p1+2*p2+p3]
441    uxtab16     r7, r7, r12             ; [p3+2*p4+p5+2 | p1+2*p2+p3+2]
442
443    add         r8, r8, r9, lsl #1      ; [p4+2*p5      |      p2+2*p3]
444    add         r8, r8, r10             ; [p4+2*p5+p6   |   p2+2*p3+p4]
445    uxtab16     r8, r8, r12             ; [p4+2*p5+p6+2 | p2+2*p3+p4+2]
446
447    add         r9, r9, r10, lsl #1     ; [p5+2*p6      |      p3+2*p4]
448    add         r9, r9, r11             ; [p5+2*p6+p7   |   p3+2*p4+p5]
449    uxtab16     r9, r9, r12             ; [p5+2*p6+p7+2 | p3+2*p4+p5+2]
450
451    ldr         r0, [sp, #44]           ; dst_stride
452    ldr         r3, [sp, #40]           ; dst
453
454    ; scale down
455    and         r5, lr, r5, asr #2      ; [D|C]
456    and         r7, lr, r7, asr #2      ; [H|G]
457    and         r8, lr, r8, asr #2      ; [I|D]
458    and         r9, lr, r9, asr #2      ; [J|H]
459
460    add         r10, r4, r6, lsl #8     ; [F|B|E|A]
461    str         r10, [r3], r0           ; row 0
462
463    add         r5, r5, r7, lsl #8      ; [H|D|G|C]
464    str         r5, [r3], r0            ; row 1
465
466    pkhtb       r12, r8, r4, asr #16    ; [-|I|-|B]
467    pkhtb       r10, r9, r8             ; [-|J|-|D]
468
469    add         r12, r6, r12, lsl #8    ; [I|F|B|E]
470    str         r12, [r3], r0           ; row 2
471
472    add         r10, r7, r10, lsl #8    ; [J|H|D|G]
473    str         r10, [r3]               ; row 3
474
475    pop        {r4-r12, pc}
476
477b_hd_pred
    ; HD prediction. Edge pixels as one array:
    ;   pp[0..8] = l[3], l[2], l[1], l[0], tl, a[0], a[1], a[2], a[3]
    ; Lettered intermediate values used in the comments below:
    ;   A = (p0+p1+1)>>1        B = (p1+p2+1)>>1
    ;   C = (p0+2*p1+p2+2)>>2   D = (p1+2*p2+p3+2)>>2
    ;   E = (p2+p3+1)>>1        F = (p3+p4+1)>>1
    ;   G = (p2+2*p3+p4+2)>>2   H = (p3+2*p4+p5+2)>>2
    ;   I = (p4+2*p5+p6+2)>>2   J = (p5+2*p6+p7+2)>>2
    ; Rows written (byte 0 first):
    ;   row0 = F H I J,  row1 = E G F H,  row2 = B D E G,  row3 = A C B D
    ; In: r0 = Above, r1 = yleft, r2 = left_stride; the rest on the stack.
    ;
    ; lr is used as a scratch register (and later holds dst_stride); the
    ; return address is restored from the stack by the final pop. Lane
    ; comments list the most significant lane first.
478    ldrb        r7, [r1], r2            ; l[0] = pp[3]
479    ldr         lr, [r0]                ; Above = pp[8|7|6|5]
480    ldrb        r8, [sp, #48]           ; tl   = pp[4]
481    ldrb        r6, [r1], r2            ; l[1] = pp[2]
482    ldrb        r5, [r1], r2            ; l[2] = pp[1]
483    ldrb        r4, [r1]                ; l[3] = pp[0]
484
485    uxtb16      r9, lr                  ; p[7|5]
486    uxtb16      r10, lr, ror #8         ; p[8|6]
487
488    add         r4, r4, r5, lsl #16     ; p[1|0]
489    add         r5, r5, r6, lsl #16     ; p[2|1]
490    add         r6, r6, r7, lsl #16     ; p[3|2]
491    add         r7, r7, r8, lsl #16     ; p[4|3]
492
493    ldr         r12, c00020002
494    ldr         lr,  c00FF00FF
495    ldr         r2,  c00010001          ; r2 (left_stride) is free: all Left[] loads are done
496
497    pkhtb       r8, r7, r9              ; p[4|5]
498    pkhtb       r1, r9, r10             ; p[7|6]
499    pkhbt       r10, r8, r10, lsl #16   ; p[6|5]
500
    ; Two-tap averages: add the pair, then a halving add with 1 gives (x+y+1)>>1.
501    uadd16      r11, r4, r5             ; [p1+p2        |        p0+p1]
502    uhadd16     r11, r11, r2            ; [(p1+p2+1)>>1 | (p0+p1+1)>>1]
503                                        ; [B|A]
504
505    add         r4, r4, r5, lsl #1      ; [p1+2*p2      |      p0+2*p1]
506    add         r4, r4, r6              ; [p1+2*p2+p3   |   p0+2*p1+p2]
507    uxtab16     r4, r4, r12             ; [p1+2*p2+p3+2 | p0+2*p1+p2+2]
508
509    uadd16      r0, r6, r7              ; [p3+p4        |        p2+p3]
510    uhadd16     r0, r0, r2              ; [(p3+p4+1)>>1 | (p2+p3+1)>>1]
511                                        ; [F|E]
512
513    add         r5, r6, r7, lsl #1      ; [p3+2*p4      |      p2+2*p3]
514    add         r5, r5, r8, ror #16     ; [p3+2*p4+p5   |   p2+2*p3+p4]
515    uxtab16     r5, r5, r12             ; [p3+2*p4+p5+2 | p2+2*p3+p4+2]
516
517    add         r6, r12, r8, ror #16    ; [p5+2         |         p4+2]
518    add         r6, r6, r10, lsl #1     ; [p5+2+2*p6    |    p4+2+2*p5]
519    uxtab16     r6, r6, r1              ; [p5+2+2*p6+p7 | p4+2+2*p5+p6]
520
521    ; scale down
522    and         r4, lr, r4, asr #2      ; [D|C]
523    and         r5, lr, r5, asr #2      ; [H|G]
524    and         r6, lr, r6, asr #2      ; [J|I]
525
526    ldr         lr, [sp, #44]           ; dst_stride
527    ldr         r3, [sp, #40]           ; dst
528
529    pkhtb       r2, r0, r6              ; [-|F|-|I]
530    pkhtb       r12, r6, r5, asr #16    ; [-|J|-|H]
531    add         r12, r12, r2, lsl #8    ; [F|J|I|H]
532    add         r2, r0, r5, lsl #8      ; [H|F|G|E]
533    mov         r12, r12, ror #24       ; [J|I|H|F]
534    str         r12, [r3], lr           ; row 0
535
536    mov         r7, r11, asr #16        ; [-|-|-|B]
537    str         r2, [r3], lr            ; row 1
538    add         r7, r7, r0, lsl #16     ; [-|E|-|B]
539    add         r7, r7, r4, asr #8      ; [-|E|D|B]
540    add         r7, r7, r5, lsl #24     ; [G|E|D|B]
541    str         r7, [r3], lr            ; row 2
542
543    add         r5, r11, r4, lsl #8     ; [D|B|C|A]
544    str         r5, [r3]                ; row 3
545
546    pop        {r4-r12, pc}
547
548
549
550b_hu_pred
    ; HU prediction, computed from the left column only (p[i] = Left[i]).
    ; Lettered intermediate values used in the comments below:
    ;   A = (p0+p1+1)>>1        B = (p1+p2+1)>>1
    ;   C = (p0+2*p1+p2+2)>>2   D = (p1+2*p2+p3+2)>>2
    ;   E = (p2+p3+1)>>1        F = (p2+3*p3+2)>>2
    ;   G = p3
    ; Rows written (byte 0 first):
    ;   row0 = A C B D,  row1 = B D E F,  row2 = E F G G,  row3 = G G G G
    ; In: r1 = yleft, r2 = left_stride. r0 (Above) is not read here.
    ;
    ; r2 changes role twice: left_stride for the loads, then the 0x00010001
    ; constant, then dst_stride. lr is used as a scratch register; the return
    ; address is restored from the stack by the final pop. Lane comments list
    ; the most significant lane first.
551    ldrb        r4, [r1], r2            ; Left[0]
552    ldr         r12, c00020002
553    ldrb        r5, [r1], r2            ; Left[1]
554    ldr         lr,  c00FF00FF
555    ldrb        r6, [r1], r2            ; Left[2]
556    ldr         r2,  c00010001          ; last use of left_stride was the load above
557    ldrb        r7, [r1]                ; Left[3]
558
559    add         r4, r4, r5, lsl #16     ; [1|0]
560    add         r5, r5, r6, lsl #16     ; [2|1]
561    add         r9, r6, r7, lsl #16     ; [3|2]
562
    ; Two-tap averages: add the pair, then a halving add with 1 gives (x+y+1)>>1.
563    uadd16      r8, r4, r5              ; [p1+p2        |        p0+p1]
564    uhadd16     r8, r8, r2              ; [(p1+p2+1)>>1 | (p0+p1+1)>>1]
565                                        ; [B|A]
566
567    add         r4, r4, r5, lsl #1      ; [p1+2*p2      |      p0+2*p1]
568    add         r4, r4, r9              ; [p1+2*p2+p3   |   p0+2*p1+p2]
569    uxtab16     r4, r4, r12             ; [p1+2*p2+p3+2 | p0+2*p1+p2+2]
570    ldr         r2, [sp, #44]           ; dst_stride
571    ldr         r3, [sp, #40]           ; dst
572    and         r4, lr, r4, asr #2      ; [D|C]
573
    ; E and F are computed as plain scalars.
574    add         r10, r6, r7             ; [p2+p3]
575    add         r11, r10, r7, lsl #1    ; [p2+3*p3]
576    add         r10, r10, #1
577    add         r11, r11, #2
578    mov         r10, r10, asr #1        ; [E]
579    mov         r11, r11, asr #2        ; [F]
580
581    add         r9, r7, r9, asr #8      ; [-|-|G|G]
582    add         r0, r8, r4, lsl #8      ; [D|B|C|A]
583    add         r7, r9, r9, lsl #16     ; [G|G|G|G]
584
585    str         r0, [r3], r2            ; row 0
586
587    mov         r1, r8, asr #16         ; [-|-|-|B]
588    add         r1, r1, r4, asr #8      ; [-|-|D|B]
589    add         r1, r1, r10, lsl #16    ; [-|E|D|B]
590    add         r1, r1, r11, lsl #24    ; [F|E|D|B]
591    str         r1, [r3], r2            ; row 1
592
593    add         r10, r11, lsl #8        ; [-|-|F|E]
594    add         r10, r10, r9, lsl #16   ; [G|G|F|E]
595    str         r10, [r3], r2           ; row 2
596
597    str         r7, [r3]                ; row 3
598
599    pop        {r4-r12, pc}
600
601    ENDP
602
603; constants
; Literal words placed after the procedure in the same code area; the
; predictors load them with label-relative ldr instructions.
604c00010001
605    DCD         0x00010001              ; 1 in each halfword: rounding term for the uhadd16 averages
606c00020002
607    DCD         0x00020002              ; 2 in each halfword: rounding term before the >>2 scaling
608c00FF00FF
609    DCD         0x00FF00FF              ; mask keeping the low byte of each halfword
610
611    END
612