1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12    EXPORT  |vp8_filter_block2d_first_pass_armv6|          ; 6-tap pass, bytes in, transposed halfwords out
13    EXPORT  |vp8_filter_block2d_first_pass_16x16_armv6|    ; same, preload offsets sized for 16-wide rows
14    EXPORT  |vp8_filter_block2d_first_pass_8x8_armv6|      ; same, preload offsets sized for 8-wide rows
15    EXPORT  |vp8_filter_block2d_second_pass_armv6|         ; 6-tap pass, halfwords in, bytes out
16    EXPORT  |vp8_filter4_block2d_second_pass_armv6|        ; as above but only the 4 middle taps are applied
17    EXPORT  |vp8_filter_block2d_first_pass_only_armv6|     ; 6-tap along a row, bytes in, bytes out
18    EXPORT  |vp8_filter_block2d_second_pass_only_armv6|    ; 6-tap down a column, bytes in, bytes out
19
20    AREA    |.text|, CODE, READONLY  ; name this block of code
21;-------------------------------------
22; r0    unsigned char *src_ptr
23; r1    short         *output_ptr
24; r2    unsigned int src_pixels_per_line
25; r3    unsigned int output_width
26; stack unsigned int output_height
27; stack const short *vp8_filter
28;-------------------------------------
29; Filter the input and put the result in the output array.  Apply the 6 tap FIR
30; filter with the output being a 2 byte value and the input being a 1 byte value.
;
; Each inner iteration produces two adjacent outputs.  Output x reads the six
; source bytes src[x-2] .. src[x+3]; the sum is rounded (+0x40), shifted right
; by 7 and saturated to 0..255 before being stored as a halfword.
;
; Consecutive results from one source row are stored (2 * output_width + 16)
; bytes apart, and every new source row starts 2 bytes further into the output,
; i.e. the output is written transposed.
;
; Register use inside the loops:
;   r4, r5, r6  three words read from vp8_filter, each holding two 16-bit taps;
;               the bottom halfword multiplies the earlier of the two pixels
;   r7          combined counter: rows left in bits 16 and up, pixel pairs left
;               in the low byte (tested with #0xff, so the pair count must fit
;               in 8 bits)
;   r12         byte step between consecutive stores
;   [sp]        output address at which the current row started
31|vp8_filter_block2d_first_pass_armv6| PROC
32    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes, so the stack
                                              ; arguments are now at sp+36 and sp+40
33
34    ldr     r11, [sp, #40]                  ; vp8_filter address
35    ldr     r7, [sp, #36]                   ; output height
36
37    sub     r2, r2, r3                      ; inside loop increments input array,
38                                            ; so the height loop only needs to add
39                                            ; r2 - width to the input pointer
40
41    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
42    add     r12, r3, #16                    ; output step = 2 * width + 16 bytes
43    sub     sp, sp, #4                      ; one word of scratch for the row's dst
44
45    ldr     r4, [r11]                       ; load up packed filter coefficients
46    ldr     r5, [r11, #4]
47    ldr     r6, [r11, #8]
48
49    str     r1, [sp]                        ; push destination to stack
50    mov     r7, r7, lsl #16                 ; height is top part of counter
51
52; six tap filter
53|height_loop_1st_6|
54    ldrb    r8, [r0, #-2]                   ; r8  = src[-2]
55    ldrb    r9, [r0, #-1]                   ; r9  = src[-1]
56    ldrb    r10, [r0], #2                   ; r10 = src[0], then r0 = &src[2]
57    orr     r7, r7, r3, lsr #2              ; low part of counter = width / 2 pairs
58
; On entry to each iteration: r8 = src[x-2], r9 = src[x-1], r10 = src[x],
; r0 = &src[x+2].  lr accumulates output x, r8 (later r11) output x+1.
59|width_loop_1st_6|
60    ldrb    r11, [r0, #-1]                  ; src[x+1]
61
62    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
63    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
64
65    ldrb    r9, [r0]                        ; src[x+2]
66
67    smuad   lr, lr, r4                      ; first tap pair, output x
68    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
69    smuad   r8, r8, r4                      ; first tap pair, output x+1
70    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
71
72    smlad   lr, r10, r5, lr                 ; second tap pair, output x
73    ldrb    r10, [r0, #1]                   ; src[x+3]
74    smlad   r8, r11, r5, r8                 ; second tap pair, output x+1
75    ldrb    r11, [r0, #2]                   ; src[x+4]
76
77    sub     r7, r7, #1                      ; one pixel pair done
78
79    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
80    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
81
82    smlad   lr, r9, r6, lr                  ; third tap pair, output x
83    smlad   r11, r10, r6, r8                ; third tap pair, output x+1
84
85    ands    r10, r7, #0xff                  ; Z set when the row is finished; these
                                              ; flags drive the ldrneb/bne below
86
87    add     lr, lr, #0x40                   ; round_shift_and_clamp
88    ldrneb  r8, [r0, #-2]                   ; load data for next loop
89    usat    lr, #8, lr, asr #7
90    add     r11, r11, #0x40
91    ldrneb  r9, [r0, #-1]
92    usat    r11, #8, r11, asr #7
93
94    strh    lr, [r1], r12                   ; result is transposed and stored, which
95                                            ; will make second pass filtering easier.
96    ldrneb  r10, [r0], #2                   ; also advances r0 to the next pair
97    strh    r11, [r1], r12
98
99    bne     width_loop_1st_6
100
101    ldr     r1, [sp]                        ; load and update dst address
102    subs    r7, r7, #0x10000                ; one row done; Z set after the last row
103    add     r0, r0, r2                      ; move to next input line
104
105    add     r1, r1, #2                      ; move over to next column
106    str     r1, [sp]
107
108    bne     height_loop_1st_6               ; flags still come from the subs above
109
110    add     sp, sp, #4                      ; drop the scratch word
111    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
112
113    ENDP
114
115; --------------------------
116; 16x16 version
117; -----------------------------
; Reads its arguments from the same places as
; vp8_filter_block2d_first_pass_armv6: r0 = source bytes, r1 = halfword output,
; r2 = source line stride, r3 = output width, [sp] = output height,
; [sp, #4] = filter taps.  The filter loop is instruction-for-instruction the
; same; the only additions are the pld cache preloads.
;
; The preload constants (18 here, 34 = 16 + 18 in the row epilogue) are written
; for a 16 pixel wide block; with any other width the preload address would
; simply point at a different place.
118|vp8_filter_block2d_first_pass_16x16_armv6| PROC
119    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed
120
121    ldr     r11, [sp, #40]                  ; vp8_filter address
122    ldr     r7, [sp, #36]                   ; output height
123
124    add     r4, r2, #18                     ; preload next row
125    pld     [r0, r4]                        ; address = src + line stride + 18
126
127    sub     r2, r2, r3                      ; inside loop increments input array,
128                                            ; so the height loop only needs to add
129                                            ; r2 - width to the input pointer
130
131    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
132    add     r12, r3, #16                    ; output step = 2 * width + 16 bytes
133    sub     sp, sp, #4                      ; one word of scratch for the row's dst
134
135    ldr     r4, [r11]                       ; load up packed filter coefficients
136    ldr     r5, [r11, #4]
137    ldr     r6, [r11, #8]
138
139    str     r1, [sp]                        ; push destination to stack
140    mov     r7, r7, lsl #16                 ; height is top part of counter
141
142; six tap filter
143|height_loop_1st_16_6|
144    ldrb    r8, [r0, #-2]                   ; r8  = src[-2]
145    ldrb    r9, [r0, #-1]                   ; r9  = src[-1]
146    ldrb    r10, [r0], #2                   ; r10 = src[0], then r0 = &src[2]
147    orr     r7, r7, r3, lsr #2              ; low part of counter = width / 2 pairs
148
; On entry to each iteration: r8 = src[x-2], r9 = src[x-1], r10 = src[x],
; r0 = &src[x+2].  lr accumulates output x, r8 (later r11) output x+1.
149|width_loop_1st_16_6|
150    ldrb    r11, [r0, #-1]                  ; src[x+1]
151
152    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
153    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
154
155    ldrb    r9, [r0]                        ; src[x+2]
156
157    smuad   lr, lr, r4                      ; first tap pair, output x
158    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
159    smuad   r8, r8, r4                      ; first tap pair, output x+1
160    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
161
162    smlad   lr, r10, r5, lr                 ; second tap pair, output x
163    ldrb    r10, [r0, #1]                   ; src[x+3]
164    smlad   r8, r11, r5, r8                 ; second tap pair, output x+1
165    ldrb    r11, [r0, #2]                   ; src[x+4]
166
167    sub     r7, r7, #1                      ; one pixel pair done
168
169    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
170    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
171
172    smlad   lr, r9, r6, lr                  ; third tap pair, output x
173    smlad   r11, r10, r6, r8                ; third tap pair, output x+1
174
175    ands    r10, r7, #0xff                  ; Z set when the row is finished; these
                                               ; flags drive the ldrneb/bne below
176
177    add     lr, lr, #0x40                   ; round_shift_and_clamp
178    ldrneb  r8, [r0, #-2]                   ; load data for next loop
179    usat    lr, #8, lr, asr #7
180    add     r11, r11, #0x40
181    ldrneb  r9, [r0, #-1]
182    usat    r11, #8, r11, asr #7
183
184    strh    lr, [r1], r12                   ; result is transposed and stored, which
185                                            ; will make second pass filtering easier.
186    ldrneb  r10, [r0], #2                   ; also advances r0 to the next pair
187    strh    r11, [r1], r12
188
189    bne     width_loop_1st_16_6
190
191    ldr     r1, [sp]                        ; load and update dst address
192    subs    r7, r7, #0x10000                ; one row done; Z set after the last row
193    add     r0, r0, r2                      ; move to next input line
194
195    add     r11, r2, #34                    ; adding back block width(=16)
196    pld     [r0, r11]                       ; preload next row: r2 is stride - width,
                                               ; so this is next line + stride + 18
197
198    add     r1, r1, #2                      ; move over to next column
199    str     r1, [sp]
200
201    bne     height_loop_1st_16_6            ; flags still come from the subs above
202
203    add     sp, sp, #4                      ; drop the scratch word
204    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
205
206    ENDP
207
208; --------------------------
209; 8x8 version
210; -----------------------------
; Reads its arguments from the same places as
; vp8_filter_block2d_first_pass_armv6: r0 = source bytes, r1 = halfword output,
; r2 = source line stride, r3 = output width, [sp] = output height,
; [sp, #4] = filter taps.  The filter loop is instruction-for-instruction the
; same; the only additions are the pld cache preloads.
;
; The preload constants (10 here, 18 = 8 + 10 in the row epilogue) are written
; for an 8 pixel wide block; with any other width the preload address would
; simply point at a different place.
211|vp8_filter_block2d_first_pass_8x8_armv6| PROC
212    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed
213
214    ldr     r11, [sp, #40]                  ; vp8_filter address
215    ldr     r7, [sp, #36]                   ; output height
216
217    add     r4, r2, #10                     ; preload next row
218    pld     [r0, r4]                        ; address = src + line stride + 10
219
220    sub     r2, r2, r3                      ; inside loop increments input array,
221                                            ; so the height loop only needs to add
222                                            ; r2 - width to the input pointer
223
224    mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts
225    add     r12, r3, #16                    ; output step = 2 * width + 16 bytes
226    sub     sp, sp, #4                      ; one word of scratch for the row's dst
227
228    ldr     r4, [r11]                       ; load up packed filter coefficients
229    ldr     r5, [r11, #4]
230    ldr     r6, [r11, #8]
231
232    str     r1, [sp]                        ; push destination to stack
233    mov     r7, r7, lsl #16                 ; height is top part of counter
234
235; six tap filter
236|height_loop_1st_8_6|
237    ldrb    r8, [r0, #-2]                   ; r8  = src[-2]
238    ldrb    r9, [r0, #-1]                   ; r9  = src[-1]
239    ldrb    r10, [r0], #2                   ; r10 = src[0], then r0 = &src[2]
240    orr     r7, r7, r3, lsr #2              ; low part of counter = width / 2 pairs
241
; On entry to each iteration: r8 = src[x-2], r9 = src[x-1], r10 = src[x],
; r0 = &src[x+2].  lr accumulates output x, r8 (later r11) output x+1.
242|width_loop_1st_8_6|
243    ldrb    r11, [r0, #-1]                  ; src[x+1]
244
245    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
246    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
247
248    ldrb    r9, [r0]                        ; src[x+2]
249
250    smuad   lr, lr, r4                      ; first tap pair, output x
251    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
252    smuad   r8, r8, r4                      ; first tap pair, output x+1
253    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
254
255    smlad   lr, r10, r5, lr                 ; second tap pair, output x
256    ldrb    r10, [r0, #1]                   ; src[x+3]
257    smlad   r8, r11, r5, r8                 ; second tap pair, output x+1
258    ldrb    r11, [r0, #2]                   ; src[x+4]
259
260    sub     r7, r7, #1                      ; one pixel pair done
261
262    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
263    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
264
265    smlad   lr, r9, r6, lr                  ; third tap pair, output x
266    smlad   r11, r10, r6, r8                ; third tap pair, output x+1
267
268    ands    r10, r7, #0xff                  ; Z set when the row is finished; these
                                               ; flags drive the ldrneb/bne below
269
270    add     lr, lr, #0x40                   ; round_shift_and_clamp
271    ldrneb  r8, [r0, #-2]                   ; load data for next loop
272    usat    lr, #8, lr, asr #7
273    add     r11, r11, #0x40
274    ldrneb  r9, [r0, #-1]
275    usat    r11, #8, r11, asr #7
276
277    strh    lr, [r1], r12                   ; result is transposed and stored, which
278                                            ; will make second pass filtering easier.
279    ldrneb  r10, [r0], #2                   ; also advances r0 to the next pair
280    strh    r11, [r1], r12
281
282    bne     width_loop_1st_8_6
283
284    ldr     r1, [sp]                        ; load and update dst address
285    subs    r7, r7, #0x10000                ; one row done; Z set after the last row
286    add     r0, r0, r2                      ; move to next input line
287
288    add     r11, r2, #18                    ; adding back block width(=8)
289    pld     [r0, r11]                       ; preload next row: r2 is stride - width,
                                               ; so this is next line + stride + 10
290
291    add     r1, r1, #2                      ; move over to next column
292    str     r1, [sp]
293
294    bne     height_loop_1st_8_6             ; flags still come from the subs above
295
296    add     sp, sp, #4                      ; drop the scratch word
297    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
298
299    ENDP
300
301;---------------------------------
302; r0    short         *src_ptr,
303; r1    unsigned char *output_ptr,
304; r2    unsigned int output_pitch,
305; r3    unsigned int cnt,
306; stack const short *vp8_filter
307;---------------------------------
; Six tap filter over halfword input, producing byte output.
;
; The input is consumed in runs of cnt halfwords; successive runs start
; (2 * cnt + 16) bytes apart.  The outputs of one run are stored output_pitch
; bytes apart (down a column of the destination) and each new run starts one
; byte further along, so the data is transposed on the way out.
;
; Each inner iteration produces two outputs from seven consecutive halfwords
; d0 .. d6, where d0 is two halfwords before the first output position:
;   first  output uses d0 .. d5 (accumulated in lr)
;   second output uses d1 .. d6 (accumulated in r8, finished in r10)
; Both are rounded (+0x40), shifted right by 7 and saturated to 0..255.
;
; Register use inside the loops:
;   r4, r5, r6  filter words as loaded, two 16-bit taps each
;   r12         bottom half of r5 with top half of r4
;   r11         bottom half of r6 with top half of r5
;               (r12/r11 line the taps up with the input shifted by one halfword)
;   r7          combined counter: runs left in bits 16 and up, output pairs left
;               in the low byte (tested with #0xff)
;   [sp]        destination address at which the current run started
308|vp8_filter_block2d_second_pass_armv6| PROC
309    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed
310
311    ldr     r11, [sp, #36]                  ; vp8_filter address
312    sub     sp, sp, #4                      ; one word of scratch for the run's dst
313    mov     r7, r3, lsl #16                 ; height is top part of counter
314    str     r1, [sp]                        ; push destination to stack
315
316    ldr     r4, [r11]                       ; load up packed filter coefficients
317    ldr     r5, [r11, #4]
318    ldr     r6, [r11, #8]
319
320    pkhbt   r12, r5, r4                     ; pack the filter differently
321    pkhbt   r11, r6, r5
322
323    sub     r0, r0, #4                      ; offset input buffer (back 2 halfwords)
324
325|height_loop_2nd|
326    ldr     r8, [r0]                        ; load the data: r8 = d1:d0
327    ldr     r9, [r0, #4]                    ; r9 = d3:d2
328    orr     r7, r7, r3, lsr #1              ; loop counter = cnt / 2 pairs
329
330|width_loop_2nd|
331    smuad   lr, r4, r8                      ; apply filter: first output, d0 and d1
332    sub     r7, r7, #1                      ; one output pair done
333    smulbt  r8, r4, r8                      ; second output starts with d1
334
335    ldr     r10, [r0, #8]                   ; r10 = d5:d4
336
337    smlad   lr, r5, r9, lr                  ; first output, d2 and d3
338    smladx  r8, r12, r9, r8                 ; second output, d2 and d3
339
340    ldrh    r9, [r0, #12]                   ; r9 = d6
341
342    smlad   lr, r6, r10, lr                 ; first output, d4 and d5
343    smladx  r8, r11, r10, r8                ; second output, d4 and d5
344
345    add     r0, r0, #4                      ; advance two halfwords
346    smlatb  r10, r6, r9, r8                 ; second output, d6 with the last tap
347
348    add     lr, lr, #0x40                   ; round_shift_and_clamp
349    ands    r8, r7, #0xff                   ; Z set when the run is finished; these
                                               ; flags drive the ldrne/bne below
350    usat    lr, #8, lr, asr #7
351    add     r10, r10, #0x40
352    strb    lr, [r1], r2                    ; the result is transposed back and stored
353    usat    r10, #8, r10, asr #7
354
355    ldrne   r8, [r0]                        ; load data for next loop
356    ldrne   r9, [r0, #4]
357    strb    r10, [r1], r2
358
359    bne     width_loop_2nd
360
361    ldr     r1, [sp]                        ; update dst for next loop
362    subs    r7, r7, #0x10000                ; one run done; Z set after the last run
363    add     r0, r0, #16                     ; update src for next loop
364    add     r1, r1, #1                      ; next run starts one byte along
365    str     r1, [sp]
366
367    bne     height_loop_2nd                 ; flags still come from the subs above
368
369    add     sp, sp, #4                      ; drop the scratch word
370    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
371
372    ENDP
373
374;---------------------------------
375; r0    short         *src_ptr,
376; r1    unsigned char *output_ptr,
377; r2    unsigned int output_pitch,
378; r3    unsigned int cnt,
379; stack const short *vp8_filter
380;---------------------------------
; Four tap variant of the halfword-in / byte-out pass.
;
; Three filter words are loaded, but only the top half of the first, both
; halves of the second and the bottom half of the third reach the multiplies;
; the outer two of the six 16-bit values are never used.
;
; Each inner iteration produces two outputs from five consecutive halfwords,
; starting one halfword before the first output position.  The rounding
; constant 0x40 is fed in as the initial accumulator, then the sums are shifted
; right by 7 and saturated to 0..255.
;
; As in the six tap version, successive input runs start (2 * cnt + 16) bytes
; apart, the outputs of one run are stored output_pitch bytes apart and each
; run starts one byte further along in the destination.
;
; Register use inside the loops:
;   r12  bottom half of filter word 1 with top half of filter word 0
;   r11  bottom half of filter word 2 with top half of filter word 1
;   r4   rounding constant 0x40
;   r5   accumulator for the first output of the pair
;   r6   accumulator for the second output of the pair
;   r7   combined counter: runs left in bits 16 and up, output pairs left in
;        the low byte (tested with #0xff)
;   lr   output_ptr + cnt, used to recompute the destination for each run
381|vp8_filter4_block2d_second_pass_armv6| PROC
382    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed
383
384    ldr     r11, [sp, #36]                  ; vp8_filter address
385    mov     r7, r3, lsl #16                 ; height is top part of counter
386
387    ldr     r4, [r11]                       ; load up packed filter coefficients
388    add     lr, r1, r3                      ; save final destination pointer
389    ldr     r5, [r11, #4]
390    ldr     r6, [r11, #8]
391
392    pkhbt   r12, r5, r4                     ; pack the filter differently
393    pkhbt   r11, r6, r5
394    mov     r4, #0x40                       ; rounding factor (for smlad{x})
395
396|height_loop_2nd_4|
397    ldrd    r8, r9, [r0, #-4]               ; load the data: r8 = word before the
                                               ; current position, r9 = word at it
398    orr     r7, r7, r3, lsr #1              ; loop counter = cnt / 2 pairs
399
400|width_loop_2nd_4|
401    ldr     r10, [r0, #4]!                  ; next input word; r0 advances by 4
402    smladx  r6, r9, r12, r4                 ; apply filter
403    pkhbt   r8, r9, r8                      ; bottom of r9 with top of r8
404    smlad   r5, r8, r12, r4
405    pkhbt   r8, r10, r9                     ; bottom of r10 with top of r9
406    smladx  r6, r10, r11, r6
407    sub     r7, r7, #1                      ; one output pair done
408    smlad   r5, r8, r11, r5
409
410    mov     r8, r9                          ; shift the data for the next loop
411    mov     r9, r10
412
413    usat    r6, #8, r6, asr #7              ; shift and clamp
414    usat    r5, #8, r5, asr #7
415
416    strb    r5, [r1], r2                    ; the result is transposed back and stored
417    tst     r7, #0xff                       ; Z set when the run is finished
418    strb    r6, [r1], r2
419
420    bne     width_loop_2nd_4
421
422    subs    r7, r7, #0x10000                ; one run done; Z set after the last run
423    add     r0, r0, #16                     ; update src for next loop
424    sub     r1, lr, r7, lsr #16             ; update dst for next loop:
                                               ; (output_ptr + cnt) - runs remaining
425
426    bne     height_loop_2nd_4               ; flags still come from the subs above
427
428    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
429
430    ENDP
431
432;------------------------------------
433; r0    unsigned char *src_ptr
434; r1    unsigned char *output_ptr,
435; r2    unsigned int src_pixels_per_line
436; r3    unsigned int cnt,
437; stack unsigned int output_pitch,
438; stack const short *vp8_filter
439;------------------------------------
; Six tap filter along each source row, byte input and byte output, with no
; transposition: results are stored at consecutive byte addresses and the
; destination moves on by output_pitch per row.
;
; cnt is used both as the number of rows and as the number of outputs per row.
; Each inner iteration produces two adjacent outputs; output x reads
; src[x-2] .. src[x+3].  The rounding constant 0x40 is fed in as the initial
; accumulator, then the sums are shifted right by 7 and saturated to 0..255.
;
; Register use inside the loops:
;   r2          rounding constant 0x40 (the source stride moves to the stack)
;   r4, r5, r6  filter words, two 16-bit taps each
;   r7          rows remaining
;   r12         output pairs remaining in the current row
;   [sp]        output_pitch - cnt
;   [sp, #4]    src_pixels_per_line - cnt
440|vp8_filter_block2d_first_pass_only_armv6| PROC
441    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed
442
443    add     r7, r2, r3                      ; preload next row
444    add     r7, r7, #2
445    pld     [r0, r7]                        ; address = src + stride + cnt + 2
446
447    ldr     r4, [sp, #36]                   ; output pitch
448    ldr     r11, [sp, #40]                  ; HFilter address
449    sub     sp, sp, #8                      ; two words of scratch
450
451    mov     r7, r3                          ; row counter = cnt
452    sub     r2, r2, r3                      ; inside loop increments input array,
453                                            ; so the height loop only needs to add
454                                            ; r2 - width to the input pointer
455
456    sub     r4, r4, r3                      ; same idea for the output pointer
457    str     r4, [sp]                        ; save modified output pitch
458    str     r2, [sp, #4]                    ; save modified source stride
459
460    mov     r2, #0x40                       ; rounding factor (for smlad)
461
462    ldr     r4, [r11]                       ; load up packed filter coefficients
463    ldr     r5, [r11, #4]
464    ldr     r6, [r11, #8]
465
466; six tap filter
467|height_loop_1st_only_6|
468    ldrb    r8, [r0, #-2]                   ; load data: r8 = src[-2]
469    ldrb    r9, [r0, #-1]                   ; r9  = src[-1]
470    ldrb    r10, [r0], #2                   ; r10 = src[0], then r0 = &src[2]
471
472    mov     r12, r3, lsr #1                 ; loop counter = cnt / 2 pairs
473
; On entry to each iteration: r8 = src[x-2], r9 = src[x-1], r10 = src[x],
; r0 = &src[x+2].  lr accumulates output x, r8 (later r10) output x+1.
474|width_loop_1st_only_6|
475    ldrb    r11, [r0, #-1]                  ; src[x+1]
476
477    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
478    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
479
480    ldrb    r9, [r0]                        ; src[x+2]
481
482;;  smuad   lr, lr, r4
483    smlad   lr, lr, r4, r2                  ; first tap pair plus rounding, output x
484    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
485;;  smuad   r8, r8, r4
486    smlad   r8, r8, r4, r2                  ; first tap pair plus rounding, output x+1
487    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
488
489    smlad   lr, r10, r5, lr                 ; second tap pair, output x
490    ldrb    r10, [r0, #1]                   ; src[x+3]
491    smlad   r8, r11, r5, r8                 ; second tap pair, output x+1
492    ldrb    r11, [r0, #2]                   ; src[x+4]
493
494    subs    r12, r12, #1                    ; one pair done; these flags drive the
                                               ; ldrneb/bne below
495
496    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
497    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
498
499    smlad   lr, r9, r6, lr                  ; third tap pair, output x
500    smlad   r10, r10, r6, r8                ; third tap pair, output x+1
501
502;;  add     lr, lr, #0x40                   ; round_shift_and_clamp
503    ldrneb  r8, [r0, #-2]                   ; load data for next loop
504    usat    lr, #8, lr, asr #7              ; shift and clamp (rounding already added)
505;;  add     r10, r10, #0x40
506    strb    lr, [r1], #1                    ; store the result
507    usat    r10, #8, r10, asr #7
508
509    ldrneb  r9, [r0, #-1]
510    strb    r10, [r1], #1
511    ldrneb  r10, [r0], #2                   ; also advances r0 to the next pair
512
513    bne     width_loop_1st_only_6
514
515    ldr     lr, [sp]                        ; load back modified output pitch
516    ldr     r12, [sp, #4]                   ; load back modified source stride
517    subs    r7, r7, #1                      ; one row done; Z set after the last row
518    add     r0, r0, r12                     ; update src for next loop
519
520    add     r11, r12, r3                    ; preload next row
521    add     r11, r11, #2
522    pld     [r0, r11]                       ; address = next line + stride + 2
523
524    add     r1, r1, lr                      ; update dst for next loop
525
526    bne     height_loop_1st_only_6          ; flags still come from the subs above
527
528    add     sp, sp, #8                      ; drop the scratch words
529    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
530    ENDP  ; |vp8_filter_block2d_first_pass_only_armv6|
531
532
533;------------------------------------
534; r0    unsigned char *src_ptr,
535; r1    unsigned char *output_ptr,
536; r2    unsigned int src_pixels_per_line
537; r3    unsigned int cnt,
538; stack unsigned int output_pitch,
539; stack const short *vp8_filter
540;------------------------------------
; Six tap filter down each source column, byte input and byte output.
;
; cnt is used both as the number of columns and as the number of outputs per
; column.  Each inner iteration produces two vertically adjacent outputs from
; seven source bytes spaced src_pixels_per_line apart, starting two lines above
; the first output.  Sums are rounded (+0x40), shifted right by 7 and saturated
; to 0..255, then stored output_pitch bytes apart.
;
; Note on label names: the outer loop (width_loop_2nd_only_6) steps across
; columns and the inner loop (height_loop_2nd_only_6) steps down one column.
;
; Register use inside the loops:
;   r4, r5, r6  filter words, two 16-bit taps each
;   r7          combined counter: columns left in bits 16 and up, outputs left
;               in the current column in the low byte (tested with #0xff,
;               decremented by 2 per iteration)
;   r12         output pitch
;   [sp]        source address at the top of the current column
;   [sp, #4]    destination address at the top of the current column
541|vp8_filter_block2d_second_pass_only_armv6| PROC
542    stmdb   sp!, {r4 - r11, lr}             ; 9 registers = 36 bytes pushed
543
544    ldr     r11, [sp, #40]                  ; VFilter address
545    ldr     r12, [sp, #36]                  ; output pitch
546
547    mov     r7, r3, lsl #16                 ; height is top part of counter
548    sub     r0, r0, r2, lsl #1              ; need 6 elements for filtering, 2 before, 3 after
549
550    sub     sp, sp, #8                      ; two words of scratch
551
552    ldr     r4, [r11]                       ; load up packed filter coefficients
553    ldr     r5, [r11, #4]
554    ldr     r6, [r11, #8]
555
556    str     r0, [sp]                        ; save r0 to stack
557    str     r1, [sp, #4]                    ; save dst to stack
558
559; six tap filter
560|width_loop_2nd_only_6|
561    ldrb    r8, [r0], r2                    ; load data: line -2 of this column
562    orr     r7, r7, r3                      ; loop counter = cnt outputs
563    ldrb    r9, [r0], r2                    ; line -1
564    ldrb    r10, [r0], r2                   ; line 0
565
566|height_loop_2nd_only_6|
567    ; filter first column in this inner loop, then move to the next column.
568    ldrb    r11, [r0], r2                   ; line +1
569
570    pkhbt   lr, r8, r9, lsl #16             ; r9 | r8
571    pkhbt   r8, r9, r10, lsl #16            ; r10 | r9
572
573    ldrb    r9, [r0], r2                    ; line +2
574
575    smuad   lr, lr, r4                      ; first tap pair, upper output
576    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
577    smuad   r8, r8, r4                      ; first tap pair, lower output
578    pkhbt   r11, r11, r9, lsl #16           ; r9 | r11
579
580    smlad   lr, r10, r5, lr                 ; second tap pair, upper output
581    ldrb    r10, [r0], r2                   ; line +3
582    smlad   r8, r11, r5, r8                 ; second tap pair, lower output
583    ldrb    r11, [r0]                       ; line +4 (no pointer update)
584
585    sub     r7, r7, #2                      ; two outputs done
586    sub     r0, r0, r2, lsl #2              ; back up 4 lines: r0 is now 2 lines
                                               ; below where this iteration started
587
588    pkhbt   r9, r9, r10, lsl #16            ; r10 | r9
589    pkhbt   r10, r10, r11, lsl #16          ; r11 | r10
590
591    smlad   lr, r9, r6, lr                  ; third tap pair, upper output
592    smlad   r10, r10, r6, r8                ; third tap pair, lower output
593
594    ands    r9, r7, #0xff                   ; Z set when the column is finished;
                                               ; these flags drive the ldrneb/bne below
595
596    add     lr, lr, #0x40                   ; round_shift_and_clamp
597    ldrneb  r8, [r0], r2                    ; load data for next loop
598    usat    lr, #8, lr, asr #7
599    add     r10, r10, #0x40
600    strb    lr, [r1], r12                   ; store the result for the column
601    usat    r10, #8, r10, asr #7
602
603    ldrneb  r9, [r0], r2
604    strb    r10, [r1], r12
605    ldrneb  r10, [r0], r2
606
607    bne     height_loop_2nd_only_6
608
609    ldr     r0, [sp]                        ; top of the column just finished (src)
610    ldr     r1, [sp, #4]                    ; top of the column just finished (dst)
611    subs    r7, r7, #0x10000                ; one column done; Z set after the last
612    add     r0, r0, #1                      ; move to filter next column
613    str     r0, [sp]
614    add     r1, r1, #1
615    str     r1, [sp, #4]
616
617    bne     width_loop_2nd_only_6           ; flags still come from the subs above
618
619    add     sp, sp, #8                      ; drop the scratch words
620
621    ldmia   sp!, {r4 - r11, pc}             ; restore registers and return
622    ENDP  ; |vp8_filter_block2d_second_pass_only_armv6|
623
624    END
625