1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license and patent
5;  grant that can be found in the LICENSE file in the root of the source
6;  tree. All contributing project authors may be found in the AUTHORS
7;  file in the root of the source tree.
8;
9
10    EXPORT |vp8_dequant_idct_add_v6|       ; make the entry point visible outside this object
11
12    AREA |.text|, CODE, READONLY            ; what follows is placed in the read-only code area |.text|
13;void vp8_dequant_idct_add_v6(short *input, short *dq,
14;                             unsigned char *dest, int stride)
15; r0 = input  : 16 halfwords; multiplied by dq and transformed in place, all zero on exit
16; r1 = dq     : 16 halfwords; only read
17; r2 = dest   : 4 rows of 4 bytes; each row is read, summed with the result and rewritten
18; r3 = stride : byte step from one dest row to the next (spilled, because r3 is reused)
19; Notation: {hi : lo} = bits 31:16 / 15:0 of a register; lo is taken to be the lower address (little-endian)
20|vp8_dequant_idct_add_v6| PROC
21    stmdb   sp!, {r4-r11, lr}       ; save r4-r11 and the return address
22; ---- step 1: input[i] = low 16 bits of input[i] * dq[i], i = 0..15, in place ----
23    ldr     r4, [r0]                ; r4 = {input[1] : input[0]}
24    ldr     r5, [r1], #4            ; r5 = {dq[1] : dq[0]}, r1 += 4
25
26    sub     sp, sp, #4
27    str     r3, [sp]                ; park stride at [sp]; r3 holds a constant later on
28
29    mov     r12, #4                 ; 4 passes, 4 coefficients each
30
31vp8_dequant_add_loop
32    smulbb  r6, r4, r5              ; lo(input) * lo(dq)
33    smultt  r7, r4, r5              ; hi(input) * hi(dq)
34
35    ldr     r4, [r0, #4]            ; fetch the next input pair before r0 moves
36    ldr     r5, [r1], #4            ; next dq pair
37
38    strh    r6, [r0], #2            ; store the low halfword of each product,
39    strh    r7, [r0], #2            ; lower address first
40
41    smulbb  r6, r4, r5              ; second pair of this pass
42    smultt  r7, r4, r5
43
44    subs    r12, r12, #1            ; Z is set on the last pass
45
46    ldrne   r4, [r0, #4]            ; preload for the next pass; skipped on the last
47    ldrne   r5, [r1], #4            ; one, so nothing past the 16th halfword is read
48
49    strh    r6, [r0], #2
50    strh    r7, [r0], #2
51
52    bne     vp8_dequant_add_loop    ; tests the flags left by the subs above
53
54    sub     r0, r0, #32             ; rewind to input[0] (4 passes x 8 bytes)
55    mov     r1, r0                  ; r1 = store pointer, same buffer
56
57; ---- step 2 (short_idct4x4llm_v6_dual): first pass, combines the words at byte offsets 0/8/16/24, two halfword lanes at a time ----
58    ldr     r3, cospi8sqrt2minus1   ; r3 = 0x4E7B
59    ldr     r4, sinpi8sqrt2         ; r4 = 0x8A8C
60    ldr     r6, [r0, #8]            ; r6 = word at offset 8 for the first pass
61    mov     r5, #2                  ; 2 passes, 2 lanes each
62vp8_dequant_idct_loop1_v6
63    ldr     r12, [r0, #24]          ; r12 = word at offset 24
64    ldr     r14, [r0, #16]          ; r14 = word at offset 16
65    smulwt  r9, r3, r6              ; smulwt/smulwb: (word * halfword) >> 16, one lane per instruction
66    smulwb  r7, r3, r6
67    smulwt  r10, r4, r6
68    smulwb  r8, r4, r6
69    pkhbt   r7, r7, r9, lsl #16     ; r7 = r3-scaled [8], both lanes packed
70    smulwt  r11, r3, r12
71    pkhbt   r8, r8, r10, lsl #16    ; r8 = r4-scaled [8]
72    uadd16  r6, r6, r7              ; r6 = [8] + r3-scaled [8]
73    smulwt  r7, r4, r12
74    smulwb  r9, r3, r12
75    smulwb  r10, r4, r12
76    subs    r5, r5, #1              ; flags are consumed by the ldrne/bne at the loop end
77    pkhbt   r9, r9, r11, lsl #16    ; r9 = r3-scaled [24]
78    ldr     r11, [r0], #4           ; r11 = word at offset 0; r0 -> next lane pair
79    pkhbt   r10, r10, r7, lsl #16   ; r10 = r4-scaled [24]
80    uadd16  r7, r12, r9             ; r7 = [24] + r3-scaled [24]
81    usub16  r7, r8, r7              ; c1 = r4-scaled [8] - r7
82    uadd16  r6, r6, r10             ; d1 = r6 + r4-scaled [24]
83    uadd16  r10, r11, r14           ; a1 = [0] + [16]
84    usub16  r8, r11, r14            ; b1 = [0] - [16]
85    uadd16  r9, r10, r6             ; a1 + d1
86    usub16  r10, r10, r6            ; a1 - d1
87    uadd16  r6, r8, r7              ; b1 + c1
88    usub16  r7, r8, r7              ; b1 - c1
89    str     r6, [r1, #8]            ; offset 8  <- b1 + c1
90    ldrne   r6, [r0, #8]            ; preload [8] of the next lane pair (first pass only)
91    str     r7, [r1, #16]           ; offset 16 <- b1 - c1
92    str     r10, [r1, #24]          ; offset 24 <- a1 - d1
93    str     r9, [r1], #4            ; offset 0  <- a1 + d1; r1 -> next lane pair
94    bne     vp8_dequant_idct_loop1_v6
95; ---- step 3: second pass along each 8-byte row, then (x + 4) >> 3, add to dest, clamp to 0..255 ----
96    mov     r5, #2                  ; 2 passes, each finishing two 8-byte rows (A and B)
97    sub     r0, r1, #8              ; r1 moved 8 bytes in the loop above; r0 = start of the buffer
98vp8_dequant_idct_loop2_v6
99    ldr     r6, [r0], #4            ; r6 = {A1 : A0}
100    ldr     r7, [r0], #4           ; r7 = {A3 : A2}
101    ldr     r8, [r0], #4           ; r8 = {B1 : B0}
102    ldr     r9, [r0], #4           ; r9 = {B3 : B2}
103    smulwt  r1, r3, r6             ; r3-scaled A1
104    smulwt  r12, r4, r6            ; r4-scaled A1
105    smulwt  lr, r3, r8             ; r3-scaled B1
106    smulwt  r10, r4, r8            ; r4-scaled B1
107    pkhbt   r11, r8, r6, lsl #16   ; r11 = {A0 : B0}
108    pkhbt   r1, lr, r1, lsl #16    ; r1  = r3-scaled {A1 : B1}
109    pkhbt   r12, r10, r12, lsl #16 ; r12 = r4-scaled {A1 : B1}
110    pkhtb   r6, r6, r8, asr #16    ; r6  = {A1 : B1}
111    uadd16  r6, r1, r6             ; r6  = {A1 : B1} + r3-scaled {A1 : B1}
112    pkhbt   lr, r9, r7, lsl #16    ; lr  = {A2 : B2}
113    uadd16  r10, r11, lr           ; a1 = element 0 + element 2
114    usub16  lr, r11, lr            ; b1 = element 0 - element 2 (lr is r14)
115    pkhtb   r8, r7, r9, asr #16    ; r8  = {A3 : B3}
116    subs    r5, r5, #1             ; flags are consumed by the bne at the loop end
117    smulwt  r1, r3, r8
118    smulwb  r7, r3, r8
119    smulwt  r11, r4, r8
120    smulwb  r9, r4, r8
121    pkhbt   r1, r7, r1, lsl #16    ; r1  = r3-scaled {A3 : B3}
122    uadd16  r8, r1, r8             ; r8  = {A3 : B3} + r3-scaled {A3 : B3}
123    pkhbt   r11, r9, r11, lsl #16  ; r11 = r4-scaled {A3 : B3}
124    usub16  r1, r12, r8            ; c1 = r4-scaled element 1 - r8
125    uadd16  r8, r11, r6            ; d1 = r4-scaled element 3 + r6
126    ldr     r9, c0x00040004        ; 4 in each lane, added ahead of the >> 3 below
127    ldr     r12, [sp]               ; r12 = stride parked in the prologue
128    uadd16  r6, r10, r8            ; out0 = a1 + d1
129    usub16  r7, r10, r8            ; out3 = a1 - d1
130    uadd16  r7, r7, r9             ; out3 += 4
131    uadd16  r6, r6, r9             ; out0 += 4
132    uadd16  r10, r14, r1           ; out1 = b1 + c1
133    usub16  r1, r14, r1            ; out2 = b1 - c1
134    uadd16  r10, r10, r9           ; out1 += 4
135    uadd16  r1, r1, r9             ; out2 += 4
136    ldr     r11, [r2]               ; four dest bytes of row A
137    mov     r8, r7, asr #3         ; hi lane of r8 = out3.A >> 3
138    pkhtb   r9, r8, r10, asr #19   ; r9 = {out3.A >> 3 : out1.A >> 3}
139    mov     r8, r1, asr #3         ; hi lane of r8 = out2.A >> 3
140    pkhtb   r8, r8, r6, asr #19    ; r8 = {out2.A >> 3 : out0.A >> 3}
141    uxtb16  lr, r11, ror #8        ; lr = {dest byte 3 : dest byte 1}, zero-extended
142    qadd16  r9, r9, lr             ; add the dest bytes lane by lane
143    uxtb16  lr, r11                ; lr = {dest byte 2 : dest byte 0}
144    qadd16  r8, r8, lr
145    usat16  r9, #8, r9             ; clamp both lanes to 0..255
146    usat16  r8, #8, r8
147    orr     r9, r8, r9, lsl #8     ; interleave back into bytes 0,1,2,3 of one word
148    ldr     r11, [r2, r12]          ; four dest bytes one stride further (row B)
149    mov     r7, r7, lsl #16        ; move the lo (row B) lanes up to the hi half
150    mov     r1, r1, lsl #16
151    mov     r10, r10, lsl #16
152    mov     r6, r6, lsl #16
153    mov     r7, r7, asr #3         ; same >> 3 and packing as for row A
154    pkhtb   r7, r7, r10, asr #19   ; r7 = {out3.B >> 3 : out1.B >> 3}
155    mov     r1, r1, asr #3
156    pkhtb   r1, r1, r6, asr #19    ; r1 = {out2.B >> 3 : out0.B >> 3}
157    uxtb16  r8, r11, ror #8        ; r8 = {dest byte 3 : dest byte 1}
158    qadd16  r7, r7, r8
159    uxtb16  r8, r11                ; r8 = {dest byte 2 : dest byte 0}
160    qadd16  r1, r1, r8
161    usat16  r7, #8, r7             ; clamp both lanes to 0..255
162    usat16  r1, #8, r1
163    orr     r1, r1, r7, lsl #8     ; interleave back into one word
164    str     r9, [r2], r12           ; write row A, dest += stride
165    str     r1, [r2], r12           ; write row B, dest += stride
166    bne     vp8_dequant_idct_loop2_v6
167
168; ---- step 4 (memset): clear the 32-byte input buffer and return ----
169    sub     r0, r0, #32            ; the loop above advanced r0 by 32; back to input[0]
170    add     sp, sp, #4             ; release the stride slot
171
172    mov     r12, #0
173    str     r12, [r0]              ; eight word stores = 16 halfwords of zero
174    str     r12, [r0, #4]
175    str     r12, [r0, #8]
176    str     r12, [r0, #12]
177    str     r12, [r0, #16]
178    str     r12, [r0, #20]
179    str     r12, [r0, #24]
180    str     r12, [r0, #28]
181
182    ldmia   sp!, {r4 - r11, pc}    ; restore r4-r11 and return by loading pc
183    ENDP    ; |vp8_dequant_idct_add_v6|
184
185; Constant Pool: 32-bit words loaded by label with ldr; the first two are the word operand of smulwt/smulwb, which keep (word * halfword) >> 16
186cospi8sqrt2minus1 DCD 0x00004E7B   ; 20091; 20091/65536 ~= 0.3066 ~= sqrt(2)*cos(pi/8) - 1
187sinpi8sqrt2       DCD 0x00008A8C   ; 35468; 35468/65536 ~= 0.5412 ~= sqrt(2)*sin(pi/8)
188c0x00040004       DCD 0x00040004   ; 4 in both halfword lanes; added with uadd16 before the >> 3
189
190    END                             ; end of this source file
191