;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_sixtap_predict8x4_armv6|

;-----------------------------------------------------------------------
; Stack frame geometry (all values in bytes).
;
;   [sp + 0]          yoffset, parked here until the second pass
;   [sp + 4 ...]      intermediate buffer: 8 lines of TEMP_LINE_BYTES,
;                     one line per output column, each line holding the
;                     9 first-pass results of that column as halfwords
;   last 20 bytes     reserved but not accessed by this routine
;-----------------------------------------------------------------------
TEMP_FRAME_BYTES    EQU     184     ; total reserved below the saved registers
TEMP_LINE_BYTES     EQU     20      ; 9 halfwords (18 bytes) padded to a multiple of 4
TEMP_COLUMN_REWIND  EQU     158     ; 8 * 20 - 2: back to line 0, one halfword further on
DST_PTR_OFFSET      EQU     216     ; 180 (rest of frame) + 36 (saved regs), after yoffset is popped
DST_PITCH_OFFSET    EQU     220     ; 180 + 40

    AREA    |.text|, CODE, READONLY         ; name this block of code

;-----------------------------------------------------------------------
; vp8_sixtap_predict8x4_armv6
;
; Produces an 8-wide by 4-high block at dst_ptr using a six-tap filter
; applied horizontally (selected by xoffset) and then vertically
; (selected by yoffset).  A zero offset replaces the corresponding
; pass with a plain copy.
;
; In:    r0     unsigned char *src_ptr
;        r1     int            src_pixels_per_line
;        r2     int            xoffset   (index into filter8_coeff)
;        r3     int            yoffset   (index into filter8_coeff)
;        stack  unsigned char *dst_ptr
;        stack  int            dst_pitch
; Out:   nothing in registers; the block is written through dst_ptr
; Saves: r4 - r11 and lr are pushed on entry and restored on exit
; Stack: 36 bytes of saved registers + TEMP_FRAME_BYTES of scratch
;
; Pass 1 walks 9 source rows beginning two rows above src_ptr and
; stores its 8 results per row transposed, so that pass 2 can read the
; 9 values belonging to one output column with word loads.  Pass 2
; uses sp itself as the read pointer into that buffer and writes the
; destination one column (4 bytes, dst_pitch apart) at a time.
;-----------------------------------------------------------------------
|vp8_sixtap_predict8x4_armv6| PROC
    stmfd   sp!, {r4 - r11, lr}
    str     r3, [sp, #-TEMP_FRAME_BYTES]!   ; open the frame and park yoffset at [sp]

    cmp     r2, #0                          ; xoffset == 0: no horizontal filtering needed
    add     lr, sp, #4                      ; lr -> intermediate buffer, just above yoffset
    beq     copy_rows_unfiltered

;---- pass 1: horizontal six-tap filter over 9 rows ---------------------
    adr     r12, filter8_coeff
    sub     r0, r0, r1, lsl #1              ; begin two rows above the block

    add     r3, r1, #10                     ; prefetch hint for upcoming source data
    pld     [r0, r3]

    add     r2, r12, r2, lsl #4             ; table entries are 16 bytes apart
    add     r0, r0, #3                      ; bias src so the loads below use fixed small offsets

    ldr     r3, [r2]                        ; r3 = tap1:tap0
    ldr     r4, [r2, #4]                    ; r4 = tap3:tap2
    ldr     r5, [r2, #8]                    ; r5 = tap5:tap4

    mov     r2, #0x90000                    ; rows remaining (9) live in bits 16 and up

    sub     r1, r1, #8                      ; r0 advances 8 per row on its own; r1 covers the rest

firstpass_row_loop
    ldrb    r6, [r0, #-5]                   ; p[-2]  (p[0] is the first output position of the row)
    ldrb    r7, [r0, #-4]                   ; p[-1]
    ldrb    r8, [r0, #-3]                   ; p[0]
    ldrb    r9, [r0, #-2]                   ; p[1]
    ldrb    r10, [r0, #-1]                  ; p[2]

    orr     r2, r2, #4                      ; low byte counts the 4 pixel pairs of this row

    pkhbt   r6, r6, r7, lsl #16             ; r6 = p[-1]:p[-2]
    pkhbt   r7, r7, r8, lsl #16             ; r7 = p[0]:p[-1]

    pkhbt   r8, r8, r9, lsl #16             ; r8 = p[1]:p[0]
    pkhbt   r9, r9, r10, lsl #16            ; r9 = p[2]:p[1]

firstpass_pair_loop
    ; r11 accumulates the even output, r12 the odd output of the pair.
    smuad   r11, r6, r3                     ; taps 0 and 1
    smuad   r12, r7, r3

    ldrb    r6, [r0], #1                    ; p[3]

    smlad   r11, r8, r4, r11                ; taps 2 and 3
    ldrb    r7, [r0], #1                    ; p[4]
    smlad   r12, r9, r4, r12

    pkhbt   r10, r10, r6, lsl #16           ; r10 = p[3]:p[2]
    pkhbt   r6, r6, r7, lsl #16             ; r6  = p[4]:p[3]
    smlad   r11, r10, r5, r11               ; taps 4 and 5
    smlad   r12, r6, r5, r12

    sub     r2, r2, #1

    add     r11, r11, #0x40                 ; add the rounding term, then shift by 7 and clamp to 8 bits
    tst     r2, #0xff                       ; any pixel pairs left in this row?
    usat    r11, #8, r11, asr #7
    add     r12, r12, #0x40
    strh    r11, [lr], #TEMP_LINE_BYTES     ; transposed store: each result goes to the next line
    usat    r12, #8, r12, asr #7

    strh    r12, [lr], #TEMP_LINE_BYTES

    ; Slide the pixel window two positions to the right for the next pair.
    movne   r11, r6
    movne   r12, r7

    movne   r6, r8
    movne   r7, r9
    movne   r8, r10
    movne   r9, r11
    movne   r10, r12

    bne     firstpass_pair_loop

    ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines
    ;;IF ARCHITECTURE=6
    ;pld        [src, ppl]
    ;;pld       [src, r9]
    ;;ENDIF

    subs    r2, r2, #0x10000                ; one row finished; the flags feed the bne below

    sub     lr, lr, #TEMP_COLUMN_REWIND     ; back to line 0, next halfword slot

    add     r0, r0, r1                      ; move to the next input row

    add     r11, r1, #18                    ; prefetch hint for the next row (r1 had 8 subtracted earlier)
    pld     [r0, r11]

    bne     firstpass_row_loop

;---- pass 2: vertical six-tap filter over 8 columns --------------------
vertical_pass
    ldr     r3, [sp], #4                    ; pop yoffset; sp now addresses the intermediate buffer
    ldr     r0, [sp, #DST_PTR_OFFSET]       ; dst_ptr from the caller's stack arguments
    ldr     r1, [sp, #DST_PITCH_OFFSET]     ; dst_pitch

    cmp     r3, #0                          ; yoffset == 0: no vertical filtering needed
    beq     copy_columns_unfiltered

    adr     r12, filter8_coeff
    add     lr, r12, r3, lsl #4             ; table entries are 16 bytes apart

    mov     r2, #0x80000                    ; columns remaining (8) live in bits 16 and up

    ldr     r3, [lr]                        ; r3 = tap1:tap0
    ldr     r4, [lr, #4]                    ; r4 = tap3:tap2
    ldr     r5, [lr, #8]                    ; r5 = tap5:tap4

    ; The odd output of each pair starts one sample later, so it needs the
    ; taps paired across word boundaries.
    pkhbt   r12, r4, r3                     ; r12 = tap1:tap2
    pkhbt   r11, r5, r4                     ; r11 = tap3:tap4

secondpass_column_loop
    ldr     r6, [sp]                        ; samples 1:0 of this column
    ldr     r7, [sp, #4]                    ; samples 3:2

    orr     r2, r2, #2                      ; low byte counts the 2 output pairs per column

secondpass_pair_loop
    ; lr accumulates the even output, r10 the odd output of the pair.
    smuad   lr, r3, r6                      ; even: taps 0,1
    smulbt  r10, r3, r6                     ; odd:  tap 0 times the upper sample

    ldr     r8, [sp, #8]                    ; next two samples

    smlad   lr, r4, r7, lr                  ; even: taps 2,3
    smladx  r10, r12, r7, r10               ; odd:  taps 1,2

    ldrh    r9, [sp, #12]                   ; one more sample, needed only by the odd output

    smlad   lr, r5, r8, lr                  ; even: taps 4,5
    smladx  r10, r11, r8, r10               ; odd:  taps 3,4

    add     sp, sp, #4                      ; advance two samples within the line
    smlatb  r10, r5, r9, r10                ; odd:  tap 5

    sub     r2, r2, #1

    add     lr, lr, #0x40                   ; add the rounding term, then shift by 7 and clamp to 8 bits
    tst     r2, #0xff                       ; another pair left in this column?
    usat    lr, #8, lr, asr #7
    add     r10, r10, #0x40
    strb    lr, [r0], r1                    ; un-transpose: step down one destination row per store
    usat    r10, #8, r10, asr #7

    strb    r10, [r0], r1

    movne   r6, r7                          ; slide the sample window by one word
    movne   r7, r8

    bne     secondpass_pair_loop

    subs    r2, r2, #0x10000                ; one column finished; the flags feed the bne below
    add     sp, sp, #12                     ; update the read pointer to the next line (20 - 8)
    sub     r0, r0, r1, lsl #2              ; back up the 4 rows just written ...
    add     r0, r0, #1                      ; ... and step one column to the right

    bne     secondpass_column_loop

    add     sp, sp, #20                     ; release what is left of the frame
    ldmfd   sp!, {r4 - r11, pc}

;---- xoffset == 0: copy 9 rows into the buffer without filtering -------
copy_rows_unfiltered
    sub     r0, r0, r1, lsl #1              ; begin two rows above the block
    sub     r1, r1, #8                      ; r0 advances 8 per row on its own
    mov     r2, #9                          ; rows to copy

copy_row_loop
    ldrb    r4, [r0], #1                    ; loads and transposed stores are interleaved
    subs    r2, r2, #1                      ; flags stay live until the bne at the bottom
    ldrb    r5, [r0], #1
    strh    r4, [lr], #TEMP_LINE_BYTES      ; widen each byte to a halfword in the intermediate buffer
    ldrb    r6, [r0], #1
    strh    r5, [lr], #TEMP_LINE_BYTES
    ldrb    r7, [r0], #1
    strh    r6, [lr], #TEMP_LINE_BYTES
    ldrb    r8, [r0], #1
    strh    r7, [lr], #TEMP_LINE_BYTES
    ldrb    r9, [r0], #1
    strh    r8, [lr], #TEMP_LINE_BYTES
    ldrb    r10, [r0], #1
    strh    r9, [lr], #TEMP_LINE_BYTES
    ldrb    r11, [r0], #1
    strh    r10, [lr], #TEMP_LINE_BYTES
    add     r0, r0, r1                      ; move to the next input row
    strh    r11, [lr], #TEMP_LINE_BYTES

    sub     lr, lr, #TEMP_COLUMN_REWIND     ; back to line 0, next halfword slot
    bne     copy_row_loop

    b       vertical_pass

;---- yoffset == 0: copy the buffer to dst without filtering ------------
copy_columns_unfiltered
    mov     r2, #8                          ; columns to copy
    add     sp, sp, #4                      ; skip the two leading samples of each line

copy_column_loop
    ldr     r6, [sp], #4                    ; two samples per word
    subs    r2, r2, #1                      ; flags stay live until the bne at the bottom
    ldr     r8, [sp], #4

    mov     r7, r6, lsr #16                 ; upper halfword = second sample of the word
    strb    r6, [r0], r1
    mov     r9, r8, lsr #16
    strb    r7, [r0], r1
    add     sp, sp, #12                     ; on to the next line (20 - 8)
    strb    r8, [r0], r1
    strb    r9, [r0], r1

    sub     r0, r0, r1, lsl #2              ; back up the 4 rows just written ...
    add     r0, r0, #1                      ; ... and step one column to the right

    bne     copy_column_loop

    add     sp, sp, #16                     ; release what is left of the frame: 180 - (160 + 4)

    ldmfd   sp!, {r4 - r11, pc}

    ENDP

;-----------------------------------------------------------------------
; Six-tap filter table, indexed by xoffset / yoffset.
; Each entry is four words (16 bytes): three words carrying two signed
; 16-bit taps each (lower-numbered tap in the low halfword) followed by
; one padding word.
; Data address: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
;-----------------------------------------------------------------------
filter8_coeff
    DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000
    DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000
    DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000
    DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000
    DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000
    DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000
    DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000
    DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000

    ; The same taps in decimal, one row per table entry:
    ;DCD        0,  0,  128,    0,   0,  0
    ;DCD        0, -6,  123,   12,  -1,  0
    ;DCD        2, -11, 108,   36,  -8,  1
    ;DCD        0, -9,   93,   50,  -6,  0
    ;DCD        3, -16,  77,   77, -16,  3
    ;DCD        0, -6,   50,   93,  -9,  0
    ;DCD        1, -8,   36,  108, -11,  2
    ;DCD        0, -1,   12,  123,  -6,  0

    END