1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12    EXPORT  |vp8_filter_block2d_bil_first_pass_armv6|   ; entry point: first filter pass (8-bit in, 16-bit transposed out)
13    EXPORT  |vp8_filter_block2d_bil_second_pass_armv6|  ; entry point: second filter pass (16-bit transposed in, 8-bit out)
14
15    AREA    |.text|, CODE, READONLY  ; name this block of code
16
17;-------------------------------------
18; r0    unsigned char  *src_ptr,
19; r1    unsigned short *dst_ptr,
20; r2    unsigned int    src_pitch,
21; r3    unsigned int    height,
22; stack unsigned int    width,
23; stack const short    *vp8_filter
24;-------------------------------------
25; The output is stored transposed in the output array to make it easy for second pass filtering.
;
; Operation (as implemented below):
;   For each of `height` source rows and each x in [0, width) this computes
;       (src[x] * c0 + src[x + 1] * c1 + 0x40) >> 7, saturated to unsigned 16 bits,
;   where c0 / c1 are the low / high halfwords of the 32-bit word loaded from
;   vp8_filter (i.e. vp8_filter[0] / vp8_filter[1] on a little-endian target;
;   NOTE(review): confirm the endianness assumption).
;   If that 32-bit word equals 128 the multiply is bypassed and every source
;   byte is copied to a halfword unchanged.
;
; Output layout:
;   Results are 16-bit. Consecutive x positions of one source row are stored
;   (height * 2 + 2) bytes apart; each new source row starts 2 bytes after the
;   previous one. All stores stay within the first width * (height * 2 + 2)
;   bytes of dst_ptr.
;
; Caller requirements that follow from the loop structure:
;   - The row is processed in width >> 2 groups of 4 pixels; a remainder of
;     width modulo 4 is not processed.
;   - Both loops test their counter at the bottom, so width >> 2 and height
;     must be non-zero (a zero counter would wrap around).
;   - The filtering path reads src[width] of every row (one byte past the
;     block width); the bypass path reads exactly `width` bytes per row.
;
; Register roles after the prologue:
;   r0  current source pointer          r1  current destination pointer
;   r2  src_pitch - width               r3  destination stride in bytes
;   r4  width                           r5  packed filter coefficients
;   r6-r10  source bytes / results      r11 destination base of current row
;   r12 rows remaining                  lr  groups of 4 remaining in the row
26|vp8_filter_block2d_bil_first_pass_armv6| PROC
27    stmdb   sp!, {r4 - r11, lr}             ; 9 registers (36 bytes) pushed: stack args start at [sp, #36]
28
29    ldr     r11, [sp, #40]                  ; vp8_filter address
30    ldr     r4, [sp, #36]                   ; width
31
32    mov     r12, r3                         ; outer-loop counter (rows remaining)
33
34    add     r7, r2, r4                      ; preload next row: r7 = src_pitch + width
35    pld     [r0, r7]                        ; cache hint only
36
37    sub     r2, r2, r4                      ; src increment for height loop (r0 has already advanced by width at row end)
38
39    ldr     r5, [r11]                       ; load up filter coefficients (both 16-bit taps in one word)
40
41    mov     r3, r3, lsl #1                  ; height*2
42    add     r3, r3, #2                      ; plus 2 to make output buffer 4-byte aligned since height is actually (height+1)
43
44    mov     r11, r1                         ; save dst_ptr for each row
45
46    cmp     r5, #128                        ; if filter coef word = 128 (low halfword 128, high halfword 0), then skip the filter
47    beq     bil_null_1st_filter
48
49|bil_height_loop_1st_v6|
50    ldrb    r6, [r0]                        ; load source data
51    ldrb    r7, [r0, #1]
52    ldrb    r8, [r0, #2]
53    mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter
54
55|bil_width_loop_1st_v6|
56    ldrb    r9, [r0, #3]
57    ldrb    r10, [r0, #4]                   ; fifth byte: second tap input for the 4th output
58
; Packing order matters: each pkhbt reads the still-unpacked byte in the
; next register before that register is itself repacked.
59    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
60    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
61
62    smuad   r6, r6, r5                      ; apply the filter
63    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
64    smuad   r7, r7, r5
65    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
66
67    smuad   r8, r8, r5
68    smuad   r9, r9, r5
69
70    add     r0, r0, #4                      ; advance to the next group of 4 pixels
71    subs    lr, lr, #1                      ; sets the flags consumed by ldrneb / bne below
72
73    add     r6, r6, #0x40                   ; round_shift_and_clamp
74    add     r7, r7, #0x40
75    usat    r6, #16, r6, asr #7
76    usat    r7, #16, r7, asr #7
77
78    strh    r6, [r1], r3                    ; result is transposed and stored
79
80    add     r8, r8, #0x40                   ; round_shift_and_clamp
81    strh    r7, [r1], r3
82    add     r9, r9, #0x40
83    usat    r8, #16, r8, asr #7
84    usat    r9, #16, r9, asr #7
85
86    strh    r8, [r1], r3                    ; result is transposed and stored
87
; The conditional loads rely on the Z flag from the subs above; none of the
; instructions in between has an S suffix, so N/Z/C/V are still valid here.
88    ldrneb  r6, [r0]                        ; load source data for the next group (skipped after the last group)
89    strh    r9, [r1], r3
90
91    ldrneb  r7, [r0, #1]
92    ldrneb  r8, [r0, #2]
93
94    bne     bil_width_loop_1st_v6
95
96    add     r0, r0, r2                      ; move to next input row
97    subs    r12, r12, #1                    ; flags consumed by the bne at the end of the row
98
99    add     r9, r2, r4, lsl #1              ; adding back block width: r9 = src_pitch + width
100    pld     [r0, r9]                        ; preload next row
101
102    add     r11, r11, #2                    ; move over to next column
103    mov     r1, r11
104
105    bne     bil_height_loop_1st_v6
106
107    ldmia   sp!, {r4 - r11, pc}             ; restore saved registers and return
108
; Bypass path: copy each source byte to a halfword using the same transposed
; layout as the filtering path.
109|bil_null_1st_filter|
110|bil_height_loop_null_1st|
111    mov     lr, r4, lsr #2                  ; loop counter
112
113|bil_width_loop_null_1st|
114    ldrb    r6, [r0]                        ; load data
115    ldrb    r7, [r0, #1]
116    ldrb    r8, [r0, #2]
117    ldrb    r9, [r0, #3]
118
119    strh    r6, [r1], r3                    ; store it to the transposed output buffer
120    add     r0, r0, #4
121    strh    r7, [r1], r3
122    subs    lr, lr, #1                      ; flags consumed by the bne below (strh does not alter them)
123    strh    r8, [r1], r3
124    strh    r9, [r1], r3
125
126    bne     bil_width_loop_null_1st
127
128    subs    r12, r12, #1
129    add     r0, r0, r2                      ; move to next input line
130    add     r11, r11, #2                    ; move over to next column
131    mov     r1, r11
132
133    bne     bil_height_loop_null_1st
134
135    ldmia   sp!, {r4 - r11, pc}             ; restore saved registers and return
136
137    ENDP  ; |vp8_filter_block2d_bil_first_pass_armv6|
138
139
140;---------------------------------
141; r0    unsigned short *src_ptr,
142; r1    unsigned char  *dst_ptr,
143; r2    int             dst_pitch,
144; r3    unsigned int    height,
145; stack unsigned int    width,
146; stack const short    *vp8_filter
147;---------------------------------
; Second filter pass over 16-bit, transposed input data.
;
; Operation (as implemented below):
;   The outer loop runs `width` times, once per source row. For row index i
;   and each y in [0, height) it computes
;       (row[y] * c0 + row[y + 1] * c1 + 0x40) >> 7, saturated to unsigned 8 bits,
;   and stores the byte at dst_ptr[y * dst_pitch + i], which transposes the
;   data back. c0 / c1 are the low / high halfwords of the 32-bit word loaded
;   from vp8_filter (vp8_filter[0] / vp8_filter[1] on a little-endian target;
;   NOTE(review): confirm the endianness assumption).
;   If that 32-bit word equals 128 the multiply is bypassed and only the low
;   byte of each source halfword is stored (no rounding, no saturation).
;
; Source addressing:
;   Each group of 4 outputs advances src by 8 bytes and each row adds 4 more,
;   i.e. height * 2 + 4 bytes per row when height is a multiple of 4.
;   NOTE(review): this equals the first pass's stride (h * 2 + 2) only if the
;   first pass was run with h = height + 1; confirm against the caller.
;   The source is read with 32-bit loads; presumably src_ptr is expected to
;   be 4-byte aligned (TODO confirm).
;
; Caller requirements that follow from the loop structure:
;   - Each row is processed in height >> 2 groups of 4 outputs; a remainder
;     of height modulo 4 is not processed.
;   - Both loops test their counter at the bottom, so height >> 2 and width
;     must be non-zero (a zero counter would wrap around).
;   - The filtering path reads halfword row[height] of every row.
;
; Register roles after the prologue:
;   r0  current source pointer          r1  current destination pointer
;   r2  dst_pitch                       r3  height
;   r4  width                           r5  packed filter coefficients
;   r6-r10  source halfwords / results  r11 destination base of current row
;   r12 rows remaining                  lr  groups of 4 remaining in the row
148|vp8_filter_block2d_bil_second_pass_armv6| PROC
149    stmdb   sp!, {r4 - r11, lr}             ; 9 registers (36 bytes) pushed: stack args start at [sp, #36]
150
151    ldr     r11, [sp, #40]                  ; vp8_filter address
152    ldr     r4, [sp, #36]                   ; width
153
154    ldr     r5, [r11]                       ; load up filter coefficients
155    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
156    mov     r11, r1                         ; save dst_ptr base for the current row
157
158    cmp     r5, #128                        ; if filter coef word = 128 (low halfword 128, high halfword 0), then skip the filter
159    beq     bil_null_2nd_filter
160
161|bil_height_loop_2nd|
162    ldr     r6, [r0]                        ; load the data: src[0] (low half), src[1] (high half)
163    ldr     r8, [r0, #4]                    ; src[2] (low half), src[3] (high half)
164    ldrh    r10, [r0, #8]                   ; src[4]
165    mov     lr, r3, lsr #2                  ; loop counter
166
167|bil_width_loop_2nd|
; The odd-indexed pairs must be packed before smuad overwrites r6 / r8.
; pkhtb puts the earlier sample in the top half, hence smuadx (operand
; halves crossed) for r7 / r9.
168    pkhtb   r7, r6, r8                      ; src[1] | src[2]
169    pkhtb   r9, r8, r10                     ; src[3] | src[4]
170
171    smuad   r6, r6, r5                      ; apply filter
172    smuad   r8, r8, r5                      ; apply filter
173
174    subs    lr, lr, #1                      ; sets the flags consumed by ldrne / ldrneh / bne below
175
176    smuadx  r7, r7, r5                      ; apply filter
177    smuadx  r9, r9, r5                      ; apply filter
178
179    add     r0, r0, #8                      ; advance past the 4 halfwords just consumed
180
181    add     r6, r6, #0x40                   ; round_shift_and_clamp
182    add     r7, r7, #0x40
183    usat    r6, #8, r6, asr #7
184    usat    r7, #8, r7, asr #7
185    strb    r6, [r1], r2                    ; the result is transposed back and stored
186
187    add     r8, r8, #0x40                   ; round_shift_and_clamp
188    strb    r7, [r1], r2
189    add     r9, r9, #0x40
190    usat    r8, #8, r8, asr #7
191    usat    r9, #8, r9, asr #7
192    strb    r8, [r1], r2                    ; the result is transposed back and stored
193
194    ldrne   r6, [r0]                        ; load data for the next group (skipped after the last group)
195    strb    r9, [r1], r2
196    ldrne   r8, [r0, #4]
197    ldrneh  r10, [r0, #8]
198
199    bne     bil_width_loop_2nd
200
201    subs    r12, r12, #1
202    add     r0, r0, #4                      ; update src for next row
203    add     r11, r11, #1                    ; next output column
204    mov     r1, r11
205
206    bne     bil_height_loop_2nd
207    ldmia   sp!, {r4 - r11, pc}             ; restore saved registers and return
208
; Bypass path: store the low byte of each source halfword using the same
; addressing as the filtering path.
209|bil_null_2nd_filter|
210|bil_height_loop_null_2nd|
211    mov     lr, r3, lsr #2                  ; loop counter: groups of 4 outputs
212
213|bil_width_loop_null_2nd|
214    ldr     r6, [r0], #4                    ; load data
215    subs    lr, lr, #1                      ; flags consumed by the bne below
216    ldr     r8, [r0], #4
217
218    strb    r6, [r1], r2                    ; store data
219    mov     r7, r6, lsr #16                 ; second halfword of the first word
220    strb    r7, [r1], r2
221    mov     r9, r8, lsr #16                 ; second halfword of the second word
222    strb    r8, [r1], r2
223    strb    r9, [r1], r2
224
225    bne     bil_width_loop_null_2nd
226
227    subs    r12, r12, #1
228    add     r0, r0, #4                      ; update src for next row
229    add     r11, r11, #1                    ; next output column
230    mov     r1, r11
231
232    bne     bil_height_loop_null_2nd
233
234    ldmia   sp!, {r4 - r11, pc}             ; restore saved registers and return
235    ENDP  ; |vp8_filter_block2d_bil_second_pass_armv6|
236
237    END                                     ; end of this assembly source file
238