;
;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT |vp8_short_fdct4x4_armv6|

    ARM
    REQUIRE8
    PRESERVE8

    AREA    |.text|, CODE, READONLY
; Matches the C prototype:
; void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
|vp8_short_fdct4x4_armv6| PROC
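
    ; Register usage on entry (AAPCS): r0 = input (short *), r1 = output
    ; (short *), r2 = pitch in bytes, so consecutive input rows are
    ; pitch/2 shorts apart.
    ;
    ; For orientation, a sketch of the scalar math this routine carries
    ; out, per the reference vp8_short_fdct4x4_c (two passes; i0..i3 are
    ; one row/column, o0..o3 its outputs):
    ;
    ;   row pass (PART 1):            column pass (PART 2):
    ;     a1 = (i0 + i3) * 8            a1 = i0 + i3
    ;     b1 = (i1 + i2) * 8            b1 = i1 + i2
    ;     c1 = (i1 - i2) * 8            c1 = i1 - i2
    ;     d1 = (i0 - i3) * 8            d1 = i0 - i3
    ;     o0 = a1 + b1                  o0 = (a1 + b1 + 7) >> 4
    ;     o2 = a1 - b1                  o2 = (a1 - b1 + 7) >> 4
    ;     o1 = (c1*2217 + d1*5352      o1 = ((c1*2217 + d1*5352
    ;           + 14500) >> 12               + 12000) >> 16) + (d1 != 0)
    ;     o3 = (d1*2217 - c1*5352      o3 = (d1*2217 - c1*5352
    ;           +  7500) >> 12               + 51000) >> 16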

    stmfd       sp!, {r4 - r12, lr}

    ; PART 1

    ; coeffs 0-3
    ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]

    ldr         r10, c7500          ; rounding for o3
    ldr         r11, c14500         ; rounding for o1
    ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4]
    ldr         lr, c0x00080008     ; [8 | 8]
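
    ; The packed constants decode as (worked out for reference):
    ;   2217 * 4 = 8868  = 0x22A4 (top halfword of 0x22a453a0)
    ;   5352 * 4 = 21408 = 0x53A0 (bottom halfword)
    ; 0x00080008 packs the row pass's *8 scale into both halfwords, so a
    ; single SMUAD/SMUSD applies it to two 16-bit terms at once.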
    ror         r5, r5, #16         ; [i2 | i3]

    qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without *8
    qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without *8

    add         r0, r0, r2          ; update input pointer

    qadd16      r7, r7, r7          ; 2*[c1|d1], so smlad/smlsdx with the
                                    ; 2217*4 and 5352*4 constants yield the
                                    ; full c1/d1 products without overflow
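
    ; Spelling out the scaling: the halfwords of r7 hold 2*(i1-i2) and
    ; 2*(i0-i3); multiplying by 2217*4 and 5352*4 gives 8*(i1-i2)*2217
    ; etc., i.e. exactly c1*2217 and d1*5352 with c1/d1 as in the C
    ; (which fold in the *8). Assuming 9-bit prediction residuals as
    ; input, each product stays below 2^25, leaving ample headroom in
    ; the 32-bit accumulation.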

    smuad       r4, r6, lr          ; o0 = (i0+i3)*8 + (i1+i2)*8
    smusd       r5, r6, lr          ; o2 = (i0+i3)*8 - (i1+i2)*8

    smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500)
    smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500)
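
    ; Dual 16x16 MAC semantics used above (ARMv6 DSP extensions):
    ;   SMUAD  rd, rn, rm     : rd = rn.bot*rm.bot + rn.top*rm.top
    ;   SMUSD  rd, rn, rm     : rd = rn.bot*rm.bot - rn.top*rm.top
    ;   SMLAD  rd, rn, rm, ra : rd = ra + rn.bot*rm.bot + rn.top*rm.top
    ;   SMLSDX rd, rn, rm, ra : rd = ra + rn.bot*rm.top - rn.top*rm.bot
    ; The X (swapped) form pairs d1 with 2217 and c1 with 5352 for o3.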

    ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6]

    pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
    pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]
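
    ; PKHBT with LSL #4 performs the final >>12 for free: the top
    ; halfword of (x << 4) is x >> 12, and o1/o3 are known to fit in
    ; 16 bits after that shift. The bottom halfword keeps o0/o2, which
    ; the row pass already produced as 16-bit values.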

    str         r6, [r1, #4]

    ; coeffs 4-7
    ror         r9, r9, #16         ; [i6 | i7]

    qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without *8
    qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without *8

    add         r0, r0, r2          ; update input pointer

    qadd16      r7, r7, r7          ; 2*[c1|d1], so smlad/smlsdx with the
                                    ; 2217*4 and 5352*4 constants yield the
                                    ; full c1/d1 products without overflow

    smuad       r9, r6, lr          ; o4 = (i4+i7)*8 + (i5+i6)*8
    smusd       r8, r6, lr          ; o6 = (i4+i7)*8 - (i5+i6)*8

    smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500)
    smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500)

    ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10]

    pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
    pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]

    str         r6, [r1, #12]

    ; coeffs 8-11
    ror         r5, r5, #16         ; [i10 | i11]

    qadd16      r6, r4, r5          ; [i9+i10 | i8+i11] = [b1 | a1] without *8
    qsub16      r7, r4, r5          ; [i9-i10 | i8-i11] = [c1 | d1] without *8

    add         r0, r0, r2          ; update input pointer

    qadd16      r7, r7, r7          ; 2*[c1|d1], so smlad/smlsdx with the
                                    ; 2217*4 and 5352*4 constants yield the
                                    ; full c1/d1 products without overflow

    smuad       r2, r6, lr          ; o8 = (i8+i11)*8 + (i9+i10)*8
    smusd       r8, r6, lr          ; o10 = (i8+i11)*8 - (i9+i10)*8

    smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500)
    smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500)

    ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14]

    pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
    pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]

    str         r6, [r1, #20]

    ; coeffs 12-15
    ror         r5, r5, #16         ; [i14 | i15]

    qadd16      r6, r4, r5          ; [i13+i14 | i12+i15] = [b1 | a1] without *8
    qsub16      r7, r4, r5          ; [i13-i14 | i12-i15] = [c1 | d1] without *8

    qadd16      r7, r7, r7          ; 2*[c1|d1], so smlad/smlsdx with the
                                    ; 2217*4 and 5352*4 constants yield the
                                    ; full c1/d1 products without overflow

    smuad       r4, r6, lr          ; o12 = (i12+i15)*8 + (i13+i14)*8
    smusd       r5, r6, lr          ; o14 = (i12+i15)*8 - (i13+i14)*8

    smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500)
    smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500)

    pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
    pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]

    str         r6, [r1, #28]
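
    ; State at the end of PART 1: r3 = [o1|o0], r9 = [o5|o4],
    ; r2 = [o9|o8] and r0 = [o13|o12] stay in registers as the first two
    ; columns' inputs; [o3|o2], [o7|o6], [o11|o10] and [o15|o14] sit at
    ; output offsets 4, 12, 20 and 28 and are reloaded mid-PART 2.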


    ; PART 2 -------------------------------------------------
    ldr         r11, c12000         ; rounding, column-pass o1 terms
    ldr         r10, c51000         ; rounding, column-pass o3 terms
    ldr         lr, c0x00070007     ; [7 | 7], (a1 +/- b1 + 7) >> 4 rounding
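
    ; Constant decode, for reference: 0x08a914e8 loaded below packs
    ; [2217 | 5352] (0x08A9 = 2217, 0x14E8 = 5352), this time unscaled
    ; because PART 2 multiplies one halfword at a time with SMULxy /
    ; SMLAxy. Adding [7 | 7] to a1 up front lets a single pack-and-shift
    ; produce both (a1+b1+7)>>4 and (a1-b1+7)>>4.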

    qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
    qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
    qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
    qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]

    qadd16      r4, r4, lr          ; a1 + 7

    add         r0, r11, #0x10000   ; 12000 + (1 << 16): the extra bit
                                    ; becomes "+1 after >>16", i.e. the
                                    ; C term "+ (d1 != 0)"

    qadd16      r2, r4, r5          ; a1 + b1 + 7
    qsub16      r3, r4, r5          ; a1 - b1 + 7

    ldr         r12, c0x08a914e8    ; [2217 | 5352]

    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
    asr         r2, r2, #4          ; scale top halfword
    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
    asr         r3, r3, #4          ; scale top halfword
    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
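
    ; Worked through for one register: with r2 = [t | b] halfwords,
    ;   r8 = r2 lsl #16           -> [b | 0]
    ;   r2 = r2 asr #4            -> top halfword now t >> 4
    ;   r8 asr #20                -> b >> 4, sign-extended, in the low half
    ;   pkhtb r4, r2, r8, asr #20 -> r4 = [t >> 4 | b >> 4]
    ; which is exactly the per-halfword (a1 +/- b1 + 7) >> 4.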

    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
    str         r4, [r1, #0]        ; [     o1 |      o0]
    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
    str         r5, [r1, #16]       ; [     o9 |      o8]

    smlabb      r8, r7, r12, r2     ; bottom: c1*2217 + d1*5352
    smlatb      r9, r7, r12, r3     ; top:    c1*2217 + d1*5352

    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
    smultb      r3, r6, r12         ; [c1*5352 | ------ ]

    lsls        r6, r7, #16         ; Z = (bottom halfword of d1 == 0)
    addeq       r8, r8, r11         ; c1_b*2217 + d1_b*5352 + 12000 (d1 == 0)
    addne       r8, r8, r0          ; c1_b*2217 + d1_b*5352 + 12000 + (1<<16)
    asrs        r6, r7, #16         ; Z = (top halfword of d1 == 0)
    addeq       r9, r9, r11         ; c1_t*2217 + d1_t*5352 + 12000 (d1 == 0)
    addne       r9, r9, r0          ; c1_t*2217 + d1_t*5352 + 12000 + (1<<16)
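
    ; The eq/ne pair picks 12000 or 12000 + (1 << 16) per halfword, so
    ; after the final >>16 each output carries the reference C's
    ; "+ (d1 != 0)" bias without a separate branch.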

    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000

    pkhtb       r9, r9, r8, asr #16 ; [o5 | o4], both results >> 16

    sub         r4, r4, r2          ; d1_b*2217 - c1_b*5352 + 51000
    sub         r5, r5, r3          ; d1_t*2217 - c1_t*5352 + 51000
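
    ; r4/r5 now hold d1*2217 - c1*5352 + 51000 for the two columns
    ; (block outputs o12 and o13); pkhtb ... asr #16 below applies the
    ; final >>16 while repacking them into one word.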

    ldr         r3, [r1, #4]        ; [i3 | i2]

    pkhtb       r5, r5, r4, asr #16 ; [o13 | o12]

    str         r9, [r1, #8]        ; [o5 | o4]

    ldr         r9, [r1, #12]       ; [i7 | i6]
    ldr         r8, [r1, #28]       ; [i15 | i14]
    ldr         r2, [r1, #20]       ; [i11 | i10]
    str         r5, [r1, #24]       ; [o13 | o12]
    qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
    qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]

    qadd16      r4, r4, lr          ; a1 + 7

    qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
    qadd16      r2, r4, r5          ; a1 + b1 + 7
    qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
    qsub16      r3, r4, r5          ; a1 - b1 + 7

    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
    asr         r2, r2, #4          ; scale top halfword
    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
    asr         r3, r3, #4          ; scale top halfword
    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword

    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
    str         r4, [r1, #4]        ; [     o3 |      o2]
    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
    str         r5, [r1, #20]       ; [    o11 |     o10]

    smlabb      r8, r7, r12, r2     ; bottom: c1*2217 + d1*5352
    smlatb      r9, r7, r12, r3     ; top:    c1*2217 + d1*5352

    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
    smultb      r3, r6, r12         ; [c1*5352 | ------ ]

    lsls        r6, r7, #16         ; Z = (bottom halfword of d1 == 0)
    addeq       r8, r8, r11         ; c1_b*2217 + d1_b*5352 + 12000 (d1 == 0)
    addne       r8, r8, r0          ; c1_b*2217 + d1_b*5352 + 12000 + (1<<16)

    asrs        r6, r7, #16         ; Z = (top halfword of d1 == 0)
    addeq       r9, r9, r11         ; c1_t*2217 + d1_t*5352 + 12000 (d1 == 0)
    addne       r9, r9, r0          ; c1_t*2217 + d1_t*5352 + 12000 + (1<<16)

    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000

    pkhtb       r9, r9, r8, asr #16 ; [o7 | o6], both results >> 16

    sub         r4, r4, r2          ; d1_b*2217 - c1_b*5352 + 51000
    sub         r5, r5, r3          ; d1_t*2217 - c1_t*5352 + 51000

    str         r9, [r1, #12]       ; [o7 | o6]
    pkhtb       r5, r5, r4, asr #16 ; [o15 | o14]

    str         r5, [r1, #28]       ; [o15 | o14]

    ldmfd       sp!, {r4 - r12, pc}

    ENDP

; Used constants
c7500
    DCD     7500                    ; row-pass rounding, o3 terms
c14500
    DCD     14500                   ; row-pass rounding, o1 terms
c0x22a453a0
    DCD     0x22a453a0              ; [2217*4 | 5352*4]
c0x00080008
    DCD     0x00080008              ; [8 | 8]
c12000
    DCD     12000                   ; column-pass rounding, o1 terms
c51000
    DCD     51000                   ; column-pass rounding, o3 terms
c0x00070007
    DCD     0x00070007              ; [7 | 7]
c0x08a914e8
    DCD     0x08a914e8              ; [2217 | 5352]

    END
