;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT |vp8_loop_filter_horizontal_edge_armv6|
    EXPORT |vp8_mbloop_filter_horizontal_edge_armv6|
    EXPORT |vp8_loop_filter_vertical_edge_armv6|
    EXPORT |vp8_mbloop_filter_vertical_edge_armv6|

    AREA    |.text|, CODE, READONLY             ; name this block of code

    MACRO
    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
    ; a0: 03 02 01 00
    ; a1: 13 12 11 10
    ; a2: 23 22 21 20
    ; a3: 33 32 31 30
    ;     b3 b2 b1 b0

    uxtb16      $b1, $a1                    ; xx 12 xx 10
    uxtb16      $b0, $a0                    ; xx 02 xx 00
    uxtb16      $b3, $a3                    ; xx 32 xx 30
    uxtb16      $b2, $a2                    ; xx 22 xx 20
    orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00
    orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20

    uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11
    uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31
    uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01
    uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21
    orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01
    orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21

    pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1
    pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3

    pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0
    pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2
    MEND
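
    ; The macro above transposes a 4x4 block of bytes held in four words:
    ; uxtb16 extracts the even-numbered bytes of a row into 16-bit lanes,
    ; uxtb16 ... ror #8 extracts the odd-numbered bytes, the orrs interleave
    ; two rows per word, and pkhbt/pkhtb then gather matching halfwords so
    ; that each output register holds one column of the original block.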
src         RN  r0
pstep       RN  r1
count       RN  r5

;r0     unsigned char *src_ptr,
;r1     int src_pixel_step,
;r2     const char *blimit,
;r3     const char *limit,
;stack  const char *thresh,
;stack  int count

;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r6, [sp, #36]               ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer
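    ; The loads below splat the single-byte blimit/limit/thresh values into
    ; all four byte lanes of r4/r2/r3, so the SIMD byte instructions can test
    ; four pixels per operation; count arrives in units of eight pixels and
    ; is doubled to give the number of 4-pixel iterations.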
    ldr         r9, [src], pstep            ; p3
    ldrb        r4, [r2]                    ; blimit
    ldr         r10, [src], pstep           ; p2
    ldrb        r2, [r3]                    ; limit
    ldr         r11, [src], pstep           ; p1
    orr         r4, r4, r4, lsl #8
    ldrb        r3, [r6]                    ; thresh
    orr         r2, r2, r2, lsl #8
    mov         count, count, lsl #1        ; 4-in-parallel
    orr         r4, r4, r4, lsl #16
    orr         r3, r3, r3, lsl #8
    orr         r2, r2, r2, lsl #16
    orr         r3, r3, r3, lsl #16

|Hnext8|
    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ldr         r12, [src], pstep           ; p0

    uqsub8      r6, r9, r10                 ; p3 - p2
    uqsub8      r7, r10, r9                 ; p2 - p3
    uqsub8      r8, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2

    orr         r6, r6, r7                  ; abs (p3-p2)
    orr         r8, r8, r10                 ; abs (p2-p1)
    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r8, r8, r2                  ; compare to limit
    uqsub8      r6, r11, r12                ; p1 - p0
    orr         lr, lr, r8
    uqsub8      r7, r12, r11                ; p0 - p1
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1
    orr         r6, r6, r7                  ; abs (p1-p0)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r8, r6, r3                  ; compare to thresh -- save r8 for later
    orr         lr, lr, r7

    uqsub8      r6, r11, r10                ; p1 - q1
    uqsub8      r7, r10, r11                ; q1 - p1
    uqsub8      r11, r12, r9                ; p0 - q0
    uqsub8      r12, r9, r12                ; q0 - p0
    orr         r6, r6, r7                  ; abs (p1-q1)
    ldr         r7, c0x7F7F7F7F
    orr         r12, r11, r12               ; abs (p0-q0)
    ldr         r11, [src], pstep           ; q2
    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r7, r9, r10                 ; q0 - q1
    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r6, r10, r9                 ; q1 - q0
    uqsub8      r12, r12, r4                ; compare to blimit
    uqsub8      r9, r11, r10                ; q2 - q1

    orr         lr, lr, r12

    ldr         r12, [src], pstep           ; q3
    uqsub8      r10, r10, r11               ; q1 - q2
    orr         r6, r7, r6                  ; abs (q1-q0)
    orr         r10, r9, r10                ; abs (q2-q1)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r10, r10, r2                ; compare to limit
    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
    orr         lr, lr, r7
    orr         lr, lr, r10

    uqsub8      r10, r12, r11               ; q3 - q2
    uqsub8      r9, r11, r12                ; q2 - q3

    mvn         r11, #0                     ; r11 == -1

    orr         r10, r10, r9                ; abs (q3-q2)
    uqsub8      r10, r10, r2                ; compare to limit

    mov         r12, #0
    orr         lr, lr, r10
    sub         src, src, pstep, lsl #2

    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         hskip_filter                ; skip filtering
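    ; Mask mechanics: uqsub8 saturates at zero, so uqsub8 in both directions
    ; OR'd together gives |a-b| per byte, and uqsub8 against limit/blimit is
    ; zero only where the difference is within bounds. usub8 of 0 - lr sets
    ; the per-byte GE flags exactly where lr is zero, and sel then builds a
    ; 0xFF/0x00 lane mask: 0xFF where all breakout checks passed.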
    sub         src, src, pstep, lsl #1     ; move src pointer down by 6 lines

    ;vp8_hevmask() function
    ;calculate high edge variance
    orr         r10, r6, r8                 ; calculate vp8_hevmask

    ldr         r7, [src], pstep            ; p1

    usub8       r10, r12, r10               ; use usub8 instead of ssub8
    sel         r6, r12, r11                ; obtain vp8_hevmask: r6

    ;vp8_filter() function
    ldr         r8, [src], pstep            ; p0
    ldr         r12, c0x80808080
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1

    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
    eor         r10, r10, r12               ; q1 offset to convert to a signed value

    str         r9, [sp]                    ; store qs0 temporarily
    str         r8, [sp, #4]                ; store ps0 temporarily
    str         r10, [sp, #8]               ; store qs1 temporarily
    str         r7, [sp, #12]               ; store ps1 temporarily

    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    qsub8       r8, r9, r8                  ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))

    and         r7, r7, r6                  ; vp8_filter (r7) &= hev

    qadd8       r7, r7, r8
    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8

    qadd8       r7, r7, r8
    ldr         r10, c0x04040404

    qadd8       r7, r7, r8
    and         r7, r7, lr                  ; vp8_filter &= mask;
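    ; r8 holds clamp(qs0 - ps0); the three saturating qadd8s above accumulate
    ; filter = clamp(clamp(ps1 - qs1) & hev + 3 * (qs0 - ps0)), and the
    ; result is then limited to the lanes selected by the filter mask.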
    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
    qadd8       r8, r7, r9                  ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
    qadd8       r7, r7, r10                 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)

    mov         r9, #0
    shadd8      r8, r8, r9                  ; Filter2 >>= 3
    shadd8      r7, r7, r9                  ; vp8_filter >>= 3
    shadd8      r8, r8, r9
    shadd8      r7, r7, r9
    shadd8      lr, r8, r9                  ; lr: Filter2
    shadd8      r7, r7, r9                  ; r7: filter
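    ; shadd8 with a zero operand is a per-byte signed halving add, i.e. an
    ; arithmetic shift right by one in each lane; applying it three times
    ; implements the signed ">> 3" of the C code without cross-lane carries.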
    ;usub8  lr, r8, r10                     ; s = (s==4)*-1
    ;sel    lr, r11, r9
    ;usub8  r8, r10, r8
    ;sel    r8, r11, r9
    ;and    r8, r8, lr                      ; -1 for each element that equals 4

    ;calculate output
    ;qadd8  lr, r8, r7                      ; u = vp8_signed_char_clamp(s + vp8_filter)

    ldr         r8, [sp]                    ; load qs0
    ldr         r9, [sp, #4]                ; load ps0

    ldr         r10, c0x01010101

    qsub8       r8, r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)

    ;end of modification for vp8

    mov         lr, #0
    sadd8       r7, r7, r10                 ; vp8_filter += 1
    shadd8      r7, r7, lr                  ; vp8_filter >>= 1

    ldr         r11, [sp, #12]              ; load ps1
    ldr         r10, [sp, #8]               ; load qs1

    bic         r7, r7, r6                  ; vp8_filter &= ~hev
    sub         src, src, pstep, lsl #2

    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
    qsub8       r10, r10, r7                ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
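    ; Outer-tap update: (filter + 1) >> 1, with the high-edge-variance lanes
    ; cleared by the bic, nudges p1/q1 only where hev is not set; this
    ; corresponds to the reference filter's (F + 1) >> 1 adjustment of the
    ; outer pixel pair.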
    eor         r11, r11, r12               ; *op1 = u^0x80
    str         r11, [src], pstep           ; store op1
    eor         r9, r9, r12                 ; *op0 = u^0x80
    str         r9, [src], pstep            ; store op0 result
    eor         r8, r8, r12                 ; *oq0 = u^0x80
    str         r8, [src], pstep            ; store oq0 result
    eor         r10, r10, r12               ; *oq1 = u^0x80
    str         r10, [src], pstep           ; store oq1

    sub         src, src, pstep, lsl #1

|hskip_filter|
    add         src, src, #4
    sub         src, src, pstep, lsl #2

    subs        count, count, #1

    ldrne       r9, [src], pstep            ; p3
    ldrne       r10, [src], pstep           ; p2
    ldrne       r11, [src], pstep           ; p1

    bne         Hnext8

    add         sp, sp, #16
    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_horizontal_edge_armv6|


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_mbloop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r6, [sp, #36]               ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r9, [src], pstep            ; p3
    ldrb        r4, [r2]                    ; blimit
    ldr         r10, [src], pstep           ; p2
    ldrb        r2, [r3]                    ; limit
    ldr         r11, [src], pstep           ; p1
    orr         r4, r4, r4, lsl #8
    ldrb        r3, [r6]                    ; thresh
    orr         r2, r2, r2, lsl #8
    mov         count, count, lsl #1        ; 4-in-parallel
    orr         r4, r4, r4, lsl #16
    orr         r3, r3, r3, lsl #8
    orr         r2, r2, r2, lsl #16
    orr         r3, r3, r3, lsl #16

|MBHnext8|

    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ldr         r12, [src], pstep           ; p0

    uqsub8      r6, r9, r10                 ; p3 - p2
    uqsub8      r7, r10, r9                 ; p2 - p3
    uqsub8      r8, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2

    orr         r6, r6, r7                  ; abs (p3-p2)
    orr         r8, r8, r10                 ; abs (p2-p1)
    uqsub8      lr, r6, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r8, r8, r2                  ; compare to limit

    uqsub8      r6, r11, r12                ; p1 - p0
    orr         lr, lr, r8
    uqsub8      r7, r12, r11                ; p0 - p1
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src], pstep           ; q1
    orr         r6, r6, r7                  ; abs (p1-p0)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r8, r6, r3                  ; compare to thresh -- save r8 for later
    orr         lr, lr, r7

    uqsub8      r6, r11, r10                ; p1 - q1
    uqsub8      r7, r10, r11                ; q1 - p1
    uqsub8      r11, r12, r9                ; p0 - q0
    uqsub8      r12, r9, r12                ; q0 - p0
    orr         r6, r6, r7                  ; abs (p1-q1)
    ldr         r7, c0x7F7F7F7F
    orr         r12, r11, r12               ; abs (p0-q0)
    ldr         r11, [src], pstep           ; q2
    uqadd8      r12, r12, r12               ; abs (p0-q0) * 2
    and         r6, r7, r6, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r7, r9, r10                 ; q0 - q1
    uqadd8      r12, r12, r6                ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r6, r10, r9                 ; q1 - q0
    uqsub8      r12, r12, r4                ; compare to blimit
    uqsub8      r9, r11, r10                ; q2 - q1

    orr         lr, lr, r12

    ldr         r12, [src], pstep           ; q3

    uqsub8      r10, r10, r11               ; q1 - q2
    orr         r6, r7, r6                  ; abs (q1-q0)
    orr         r10, r9, r10                ; abs (q2-q1)
    uqsub8      r7, r6, r2                  ; compare to limit
    uqsub8      r10, r10, r2                ; compare to limit
    uqsub8      r6, r6, r3                  ; compare to thresh -- save r6 for later
    orr         lr, lr, r7
    orr         lr, lr, r10

    uqsub8      r10, r12, r11               ; q3 - q2
    uqsub8      r9, r11, r12                ; q2 - q3

    mvn         r11, #0                     ; r11 == -1

    orr         r10, r10, r9                ; abs (q3-q2)
    uqsub8      r10, r10, r2                ; compare to limit

    mov         r12, #0

    orr         lr, lr, r10

    usub8       lr, r12, lr                 ; use usub8 instead of ssub8
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         mbhskip_filter              ; skip filtering

    ;vp8_hevmask() function
    ;calculate high edge variance
    sub         src, src, pstep, lsl #2     ; move src pointer down by 6 lines
    sub         src, src, pstep, lsl #1

    orr         r10, r6, r8
    ldr         r7, [src], pstep            ; p1

    usub8       r10, r12, r10
    sel         r6, r12, r11                ; hev mask: r6

    ;vp8_mbfilter() function
    ;p2, q2 are only needed at the end. Don't need to load them in now.
    ldr         r8, [src], pstep            ; p0
    ldr         r12, c0x80808080
    ldr         r9, [src], pstep            ; q0
    ldr         r10, [src]                  ; q1

    eor         r7, r7, r12                 ; ps1
    eor         r8, r8, r12                 ; ps0
    eor         r9, r9, r12                 ; qs0
    eor         r10, r10, r12               ; qs1

    qsub8       r12, r9, r8                 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
    str         r7, [sp, #12]               ; store ps1 temporarily
    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    str         r10, [sp, #8]               ; store qs1 temporarily
    qadd8       r7, r7, r12
    str         r9, [sp]                    ; store qs0 temporarily
    qadd8       r7, r7, r12
    str         r8, [sp, #4]                ; store ps0 temporarily
    qadd8       r7, r7, r12                 ; vp8_filter: r7

    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
    ldr         r9, c0x04040404

    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)

    mov         r12, r7                     ; Filter2: r12
    and         r12, r12, r6                ; Filter2 &= hev

    ;modify code for vp8
    ;save bottom 3 bits so that we round one side +4 and the other +3
    qadd8       r8, r12, r9                 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
    qadd8       r12, r12, r10               ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)

    mov         r10, #0
    shadd8      r8, r8, r10                 ; Filter1 >>= 3
    shadd8      r12, r12, r10               ; Filter2 >>= 3
    shadd8      r8, r8, r10
    shadd8      r12, r12, r10
    shadd8      r8, r8, r10                 ; r8: Filter1
    shadd8      r12, r12, r10               ; r12: Filter2

    ldr         r9, [sp]                    ; load qs0
    ldr         r11, [sp, #4]               ; load ps0

    qsub8       r9, r9, r8                  ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)

    ;save bottom 3 bits so that we round one side +4 and the other +3
    ;and    r8, r12, r10                    ; s = Filter2 & 7 (s: r8)
    ;qadd8  r12, r12, r9                    ; Filter2 = vp8_signed_char_clamp(Filter2+4)
    ;mov    r10, #0
    ;shadd8 r12, r12, r10                   ; Filter2 >>= 3
    ;usub8  lr, r8, r9                      ; s = (s==4)*-1
    ;sel    lr, r11, r10
    ;shadd8 r12, r12, r10
    ;usub8  r8, r9, r8
    ;sel    r8, r11, r10
    ;ldr    r9, [sp]                        ; load qs0
    ;ldr    r11, [sp, #4]                   ; load ps0
    ;shadd8 r12, r12, r10
    ;and    r8, r8, lr                      ; -1 for each element that equals 4
    ;qadd8  r10, r8, r12                    ; u = vp8_signed_char_clamp(s + Filter2)
    ;qsub8  r9, r9, r12                     ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
    ;qadd8  r11, r11, r10                   ; ps0 = vp8_signed_char_clamp(ps0 + u)

    ;end of modification for vp8

    bic         r12, r7, r6                 ; vp8_filter &= ~hev (r6 is free)
    ;mov        r12, r7
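    ; The wide filter spreads the clamped filter value across three pixel
    ; pairs with fixed-point weights: u = clamp((27*F + 63) >> 7), roughly
    ; 3/7 F, for p0/q0; (18*F + 63) >> 7, roughly 2/7 F, for p1/q1; and
    ; (9*F + 63) >> 7, roughly 1/7 F, for p2/q2.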
    ;roughly 3/7th difference across boundary
    mov         lr, #0x1b                   ; 27
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r7, r10, lr, r7
    smultb      r10, r10, lr
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    add         r10, r10, #63
    ssat        r7, #8, r7, asr #7
    ssat        r10, #8, r10, asr #7

    ldr         lr, c0x80808080

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r7, r10, lsl #16
    uxtb16      r6, r6
    uxtb16      r10, r10
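    ; Above: sxtb16 widens bytes 0/2 (and, with ror #8, bytes 1/3) to 16-bit
    ; lanes; smlabb/smlatb form 27*F + 63 per lane (smultb plus add #63 for
    ; the last lane, since r7 holding the 63 constant was just overwritten);
    ; ssat ... asr #7 does the >>7 with a signed 8-bit clamp; and pkhbt and
    ; uxtb16 plus the orr below repack the four lanes into one byte vector.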
    sub         src, src, pstep

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
    eor         r8, r8, lr                  ; *oq0 = s^0x80
    str         r8, [src]                   ; store *oq0
    sub         src, src, pstep
    eor         r10, r10, lr                ; *op0 = s^0x80
    str         r10, [src]                  ; store *op0

    ;roughly 2/7th difference across boundary
    mov         lr, #0x12                   ; 18
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r9, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r9, #8, r9, asr #7
    ssat        r10, #8, r10, asr #7

    ldr         lr, c0x80808080

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r9, r10, lsl #16

    ldr         r9, [sp, #8]                ; load qs1
    ldr         r11, [sp, #12]              ; load ps1

    uxtb16      r6, r6
    uxtb16      r10, r10

    sub         src, src, pstep

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

    qadd8       r11, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
    eor         r11, r11, lr                ; *op1 = s^0x80
    str         r11, [src], pstep           ; store *op1
    eor         r8, r8, lr                  ; *oq1 = s^0x80
    add         src, src, pstep, lsl #1

    mov         r7, #0x3f                   ; 63

    str         r8, [src], pstep            ; store *oq1

    ;roughly 1/7th difference across boundary
    mov         lr, #0x9                    ; 9
    ldr         r9, [src]                   ; load q2

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r12, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r12, #8, r12, asr #7
    ssat        r10, #8, r10, asr #7

    sub         src, src, pstep, lsl #2

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r12, r10, lsl #16

    sub         src, src, pstep
    ldr         lr, c0x80808080

    ldr         r11, [src]                  ; load p2

    uxtb16      r6, r6
    uxtb16      r10, r10

    eor         r9, r9, lr
    eor         r11, r11, lr

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)

    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
    eor         r8, r8, lr                  ; *op2 = s^0x80
    str         r8, [src], pstep, lsl #2    ; store *op2
    add         src, src, pstep
    eor         r10, r10, lr                ; *oq2 = s^0x80
    str         r10, [src], pstep, lsl #1   ; store *oq2

|mbhskip_filter|
    add         src, src, #4
    sub         src, src, pstep, lsl #3
    subs        count, count, #1

    ldrne       r9, [src], pstep            ; p3
    ldrne       r10, [src], pstep           ; p2
    ldrne       r11, [src], pstep           ; p1

    bne         MBHnext8

    add         sp, sp, #16
    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_mbloop_filter_horizontal_edge_armv6|


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
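    ; Vertical edges run along columns, so the pixels to filter sit side by
    ; side within each row. The code loads four words (a 4x4 block), uses
    ; TRANSPOSE_MATRIX so each register again holds one pixel position for
    ; four rows, applies the same 4-in-parallel filter, then transposes the
    ; results back before storing.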
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, #4                ; move src pointer down by 4
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r12, [sp, #36]              ; load thresh address
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r6, [src], pstep            ; load source data
    ldrb        r4, [r2]                    ; blimit
    ldr         r7, [src], pstep
    ldrb        r2, [r3]                    ; limit
    ldr         r8, [src], pstep
    orr         r4, r4, r4, lsl #8
    ldrb        r3, [r12]                   ; thresh
    orr         r2, r2, r2, lsl #8
    ldr         lr, [src], pstep
    mov         count, count, lsl #1        ; 4-in-parallel
    orr         r4, r4, r4, lsl #16
    orr         r3, r3, r3, lsl #8
    orr         r2, r2, r2, lsl #16
    orr         r3, r3, r3, lsl #16

|Vnext8|

    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ; transpose the source data for 4-in-parallel operation
    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    uqsub8      r7, r9, r10                 ; p3 - p2
    uqsub8      r8, r10, r9                 ; p2 - p3
    uqsub8      r9, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2
    orr         r7, r7, r8                  ; abs (p3-p2)
    orr         r10, r9, r10                ; abs (p2-p1)
    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r10, r10, r2                ; compare to limit

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         lr, lr, r10

    uqsub8      r6, r11, r12                ; p1 - p0
    uqsub8      r7, r12, r11                ; p0 - p1
    add         src, src, #4                ; move src pointer up by 4
    orr         r6, r6, r7                  ; abs (p1-p0)
    str         r11, [sp, #12]              ; save p1
    uqsub8      r10, r6, r2                 ; compare to limit
    uqsub8      r11, r6, r3                 ; compare to thresh
    orr         lr, lr, r10

    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
    ; transpose the source data for 4-in-parallel operation
    ldr         r6, [src], pstep            ; load source data
    str         r11, [sp]                   ; push r11 to stack
    ldr         r7, [src], pstep
    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
    ldr         r8, [src], pstep
    str         lr, [sp, #8]
    ldr         lr, [src], pstep

    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    ldr         lr, [sp, #8]                ; load back the breakout-flag accumulator

    uqsub8      r6, r12, r11                ; q3 - q2
    uqsub8      r7, r11, r12                ; q2 - q3
    uqsub8      r12, r11, r10               ; q2 - q1
    uqsub8      r11, r10, r11               ; q1 - q2
    orr         r6, r6, r7                  ; abs (q3-q2)
    orr         r7, r12, r11                ; abs (q2-q1)
    uqsub8      r6, r6, r2                  ; compare to limit
    uqsub8      r7, r7, r2                  ; compare to limit
    ldr         r11, [sp, #4]               ; load back p0
    ldr         r12, [sp, #12]              ; load back p1
    orr         lr, lr, r6
    orr         lr, lr, r7

    uqsub8      r6, r11, r9                 ; p0 - q0
    uqsub8      r7, r9, r11                 ; q0 - p0
    uqsub8      r8, r12, r10                ; p1 - q1
    uqsub8      r11, r10, r12               ; q1 - p1
    orr         r6, r6, r7                  ; abs (p0-q0)
    ldr         r7, c0x7F7F7F7F
    orr         r8, r8, r11                 ; abs (p1-q1)
    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r11, r10, r9                ; q1 - q0
    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r12, r9, r10                ; q0 - q1
    uqsub8      r6, r6, r4                  ; compare to blimit

    orr         r9, r11, r12                ; abs (q1-q0)
    uqsub8      r8, r9, r2                  ; compare to limit
    uqsub8      r10, r9, r3                 ; compare to thresh
    orr         lr, lr, r6
    orr         lr, lr, r8

    mvn         r11, #0                     ; r11 == -1
    mov         r12, #0

    usub8       lr, r12, lr
    ldr         r9, [sp]                    ; load the compared result
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         vskip_filter                ; skip filtering

    ;vp8_hevmask() function
    ;calculate high edge variance

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         r9, r9, r10

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    usub8       r9, r12, r9
    sel         r6, r12, r11                ; hev mask: r6

    ;vp8_filter() function
    ; load source data to r6, r11, r12, lr
    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    pkhbt       r12, r7, r8, lsl #16

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    pkhbt       r11, r9, r10, lsl #16

    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
    str         r6, [sp]
    str         lr, [sp, #4]

    pkhbt       r6, r7, r8, lsl #16
    pkhbt       lr, r9, r10, lsl #16

    ;transpose r12, r11, r6, lr to r7, r8, r9, r10
    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10
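    ; Above: each ldrh pair fetches one row's p1,p0 (at src-2) and q0,q1
    ; (at src); each pkhbt packs a full row of four pixels into one word,
    ; and the transpose then leaves r7/r8/r9/r10 holding the p1, p0, q0
    ; and q1 values for four rows each.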
    ;load back hev_mask r6 and filter_mask lr
    ldr         r12, c0x80808080
    ldr         r6, [sp]
    ldr         lr, [sp, #4]

    eor         r7, r7, r12                 ; p1 offset to convert to a signed value
    eor         r8, r8, r12                 ; p0 offset to convert to a signed value
    eor         r9, r9, r12                 ; q0 offset to convert to a signed value
    eor         r10, r10, r12               ; q1 offset to convert to a signed value

    str         r9, [sp]                    ; store qs0 temporarily
    str         r8, [sp, #4]                ; store ps0 temporarily
    str         r10, [sp, #8]               ; store qs1 temporarily
    str         r7, [sp, #12]               ; store ps1 temporarily

    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    qsub8       r8, r9, r8                  ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))

    and         r7, r7, r6                  ; vp8_filter (r7) &= hev (r7 : filter)

    qadd8       r7, r7, r8
    ldr         r9, c0x03030303             ; r9 = 3 --modified for vp8

    qadd8       r7, r7, r8
    ldr         r10, c0x04040404

    qadd8       r7, r7, r8
    ;mvn        r11, #0                     ; r11 == -1

    and         r7, r7, lr                  ; vp8_filter &= mask

    ;modify code for vp8 -- Filter1 = vp8_filter (r7)
    qadd8       r8, r7, r9                  ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
    qadd8       r7, r7, r10                 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4)

    mov         r9, #0
    shadd8      r8, r8, r9                  ; Filter2 >>= 3
    shadd8      r7, r7, r9                  ; vp8_filter >>= 3
    shadd8      r8, r8, r9
    shadd8      r7, r7, r9
    shadd8      lr, r8, r9                  ; lr: Filter2
    shadd8      r7, r7, r9                  ; r7: filter

    ;usub8  lr, r8, r10                     ; s = (s==4)*-1
    ;sel    lr, r11, r9
    ;usub8  r8, r10, r8
    ;sel    r8, r11, r9
    ;and    r8, r8, lr                      ; -1 for each element that equals 4 -- r8: s

    ;calculate output
    ;qadd8  lr, r8, r7                      ; u = vp8_signed_char_clamp(s + vp8_filter)

    ldr         r8, [sp]                    ; load qs0
    ldr         r9, [sp, #4]                ; load ps0

    ldr         r10, c0x01010101

    qsub8       r8, r8, r7                  ; u = vp8_signed_char_clamp(qs0 - vp8_filter)
    qadd8       r9, r9, lr                  ; u = vp8_signed_char_clamp(ps0 + Filter2)
    ;end of modification for vp8

    eor         r8, r8, r12
    eor         r9, r9, r12

    mov         lr, #0

    sadd8       r7, r7, r10
    shadd8      r7, r7, lr

    ldr         r10, [sp, #8]               ; load qs1
    ldr         r11, [sp, #12]              ; load ps1

    bic         r7, r7, r6                  ; r7: vp8_filter

    qsub8       r10, r10, r7                ; u = vp8_signed_char_clamp(qs1 - vp8_filter)
    qadd8       r11, r11, r7                ; u = vp8_signed_char_clamp(ps1 + vp8_filter)
    eor         r10, r10, r12
    eor         r11, r11, r12

    sub         src, src, pstep, lsl #2

    ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1
    ;output is b0, b1, b2, b3
    ;b0: 03 02 01 00
    ;b1: 13 12 11 10
    ;b2: 23 22 21 20
    ;b3: 33 32 31 30
    ;    p1 p0 q0 q1
    ;   (a3 a2 a1 a0)
    TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr

    strh        r6, [src, #-2]              ; store the result
    mov         r6, r6, lsr #16
    strh        r6, [src], pstep

    strh        r7, [src, #-2]
    mov         r7, r7, lsr #16
    strh        r7, [src], pstep

    strh        r12, [src, #-2]
    mov         r12, r12, lsr #16
    strh        r12, [src], pstep

    strh        lr, [src, #-2]
    mov         lr, lr, lsr #16
    strh        lr, [src], pstep
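    ; After the transpose each register holds one output row packed as
    ; p1 p0 q0 q1; the strh at [src, #-2] writes p1/p0, the lsr #16 brings
    ; q0/q1 into the low halfword, and the second strh writes them at the
    ; edge while post-incrementing to the next row.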
|vskip_filter|
    sub         src, src, #4
    subs        count, count, #1

    ldrne       r6, [src], pstep            ; load source data
    ldrne       r7, [src], pstep
    ldrne       r8, [src], pstep
    ldrne       lr, [src], pstep

    bne         Vnext8

    add         sp, sp, #16

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_loop_filter_vertical_edge_armv6|


;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_mbloop_filter_vertical_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    stmdb       sp!, {r4 - r11, lr}

    sub         src, src, #4                ; move src pointer down by 4
    ldr         count, [sp, #40]            ; count for 8-in-parallel
    ldr         r12, [sp, #36]              ; load thresh address
    pld         [src, #23]                  ; preload for next block
    sub         sp, sp, #16                 ; create temp buffer

    ldr         r6, [src], pstep            ; load source data
    ldrb        r4, [r2]                    ; blimit
    pld         [src, #23]
    ldr         r7, [src], pstep
    ldrb        r2, [r3]                    ; limit
    pld         [src, #23]
    ldr         r8, [src], pstep
    orr         r4, r4, r4, lsl #8
    ldrb        r3, [r12]                   ; thresh
    orr         r2, r2, r2, lsl #8
    pld         [src, #23]
    ldr         lr, [src], pstep
    mov         count, count, lsl #1        ; 4-in-parallel
    orr         r4, r4, r4, lsl #16
    orr         r3, r3, r3, lsl #8
    orr         r2, r2, r2, lsl #16
    orr         r3, r3, r3, lsl #16

|MBVnext8|
    ; vp8_filter_mask() function
    ; calculate breakout conditions
    ; transpose the source data for 4-in-parallel operation
    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    uqsub8      r7, r9, r10                 ; p3 - p2
    uqsub8      r8, r10, r9                 ; p2 - p3
    uqsub8      r9, r10, r11                ; p2 - p1
    uqsub8      r10, r11, r10               ; p1 - p2
    orr         r7, r7, r8                  ; abs (p3-p2)
    orr         r10, r9, r10                ; abs (p2-p1)
    uqsub8      lr, r7, r2                  ; compare to limit. lr: vp8_filter_mask
    uqsub8      r10, r10, r2                ; compare to limit

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         lr, lr, r10

    uqsub8      r6, r11, r12                ; p1 - p0
    uqsub8      r7, r12, r11                ; p0 - p1
    add         src, src, #4                ; move src pointer up by 4
    orr         r6, r6, r7                  ; abs (p1-p0)
    str         r11, [sp, #12]              ; save p1
    uqsub8      r10, r6, r2                 ; compare to limit
    uqsub8      r11, r6, r3                 ; compare to thresh
    orr         lr, lr, r10

    ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now
    ; transpose the source data for 4-in-parallel operation
    ldr         r6, [src], pstep            ; load source data
    str         r11, [sp]                   ; push r11 to stack
    ldr         r7, [src], pstep
    str         r12, [sp, #4]               ; save current reg before load q0 - q3 data
    ldr         r8, [src], pstep
    str         lr, [sp, #8]
    ldr         lr, [src], pstep


    TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12

    ldr         lr, [sp, #8]                ; load back the breakout-flag accumulator

    uqsub8      r6, r12, r11                ; q3 - q2
    uqsub8      r7, r11, r12                ; q2 - q3
    uqsub8      r12, r11, r10               ; q2 - q1
    uqsub8      r11, r10, r11               ; q1 - q2
    orr         r6, r6, r7                  ; abs (q3-q2)
    orr         r7, r12, r11                ; abs (q2-q1)
    uqsub8      r6, r6, r2                  ; compare to limit
    uqsub8      r7, r7, r2                  ; compare to limit
    ldr         r11, [sp, #4]               ; load back p0
    ldr         r12, [sp, #12]              ; load back p1
    orr         lr, lr, r6
    orr         lr, lr, r7

    uqsub8      r6, r11, r9                 ; p0 - q0
    uqsub8      r7, r9, r11                 ; q0 - p0
    uqsub8      r8, r12, r10                ; p1 - q1
    uqsub8      r11, r10, r12               ; q1 - p1
    orr         r6, r6, r7                  ; abs (p0-q0)
    ldr         r7, c0x7F7F7F7F
    orr         r8, r8, r11                 ; abs (p1-q1)
    uqadd8      r6, r6, r6                  ; abs (p0-q0) * 2
    and         r8, r7, r8, lsr #1          ; abs (p1-q1) / 2
    uqsub8      r11, r10, r9                ; q1 - q0
    uqadd8      r6, r8, r6                  ; abs (p0-q0)*2 + abs (p1-q1)/2
    uqsub8      r12, r9, r10                ; q0 - q1
    uqsub8      r6, r6, r4                  ; compare to blimit

    orr         r9, r11, r12                ; abs (q1-q0)
    uqsub8      r8, r9, r2                  ; compare to limit
    uqsub8      r10, r9, r3                 ; compare to thresh
    orr         lr, lr, r6
    orr         lr, lr, r8

    mvn         r11, #0                     ; r11 == -1
    mov         r12, #0

    usub8       lr, r12, lr
    ldr         r9, [sp]                    ; load the compared result
    sel         lr, r11, r12                ; filter mask: lr

    cmp         lr, #0
    beq         mbvskip_filter              ; skip filtering

    ;vp8_hevmask() function
    ;calculate high edge variance

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         r9, r9, r10

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    usub8       r9, r12, r9
    sel         r6, r12, r11                ; hev mask: r6

    ; vp8_mbfilter() function
    ; p2, q2 are only needed at the end. Don't need to load them in now.
    ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first
    ; load source data to r6, r11, r12, lr
    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    pkhbt       r12, r7, r8, lsl #16

    ldrh        r7, [src, #-2]
    ldrh        r8, [src], pstep

    pkhbt       r11, r9, r10, lsl #16

    ldrh        r9, [src, #-2]
    ldrh        r10, [src], pstep

    str         r6, [sp]                    ; save r6
    str         lr, [sp, #4]                ; save lr

    pkhbt       r6, r7, r8, lsl #16
    pkhbt       lr, r9, r10, lsl #16

    ;transpose r12, r11, r6, lr to p1, p0, q0, q1
    TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10

    ;load back hev_mask r6 and filter_mask lr
    ldr         r12, c0x80808080
    ldr         r6, [sp]
    ldr         lr, [sp, #4]

    eor         r7, r7, r12                 ; ps1
    eor         r8, r8, r12                 ; ps0
    eor         r9, r9, r12                 ; qs0
    eor         r10, r10, r12               ; qs1

    qsub8       r12, r9, r8                 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
    str         r7, [sp, #12]               ; store ps1 temporarily
    qsub8       r7, r7, r10                 ; vp8_signed_char_clamp(ps1-qs1)
    str         r10, [sp, #8]               ; store qs1 temporarily
    qadd8       r7, r7, r12
    str         r9, [sp]                    ; store qs0 temporarily
    qadd8       r7, r7, r12
    str         r8, [sp, #4]                ; store ps0 temporarily
    qadd8       r7, r7, r12                 ; vp8_filter: r7

    ldr         r10, c0x03030303            ; r10 = 3 --modified for vp8
    ldr         r9, c0x04040404
    ;mvn        r11, #0                     ; r11 == -1

    and         r7, r7, lr                  ; vp8_filter &= mask (lr is free)

    mov         r12, r7                     ; Filter2: r12
    and         r12, r12, r6                ; Filter2 &= hev

    ;modify code for vp8
    ;save bottom 3 bits so that we round one side +4 and the other +3
    qadd8       r8, r12, r9                 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4)
    qadd8       r12, r12, r10               ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3)

    mov         r10, #0
    shadd8      r8, r8, r10                 ; Filter1 >>= 3
    shadd8      r12, r12, r10               ; Filter2 >>= 3
    shadd8      r8, r8, r10
    shadd8      r12, r12, r10
    shadd8      r8, r8, r10                 ; r8: Filter1
    shadd8      r12, r12, r10               ; r12: Filter2

    ldr         r9, [sp]                    ; load qs0
    ldr         r11, [sp, #4]               ; load ps0

    qsub8       r9, r9, r8                  ; qs0 = vp8_signed_char_clamp(qs0 - Filter1)
    qadd8       r11, r11, r12               ; ps0 = vp8_signed_char_clamp(ps0 + Filter2)

    ;save bottom 3 bits so that we round one side +4 and the other +3
    ;and    r8, r12, r10                    ; s = Filter2 & 7 (s: r8)
    ;qadd8  r12, r12, r9                    ; Filter2 = vp8_signed_char_clamp(Filter2+4)
    ;mov    r10, #0
    ;shadd8 r12, r12, r10                   ; Filter2 >>= 3
    ;usub8  lr, r8, r9                      ; s = (s==4)*-1
    ;sel    lr, r11, r10
    ;shadd8 r12, r12, r10
    ;usub8  r8, r9, r8
    ;sel    r8, r11, r10
    ;ldr    r9, [sp]                        ; load qs0
    ;ldr    r11, [sp, #4]                   ; load ps0
    ;shadd8 r12, r12, r10
    ;and    r8, r8, lr                      ; -1 for each element that equals 4
    ;qadd8  r10, r8, r12                    ; u = vp8_signed_char_clamp(s + Filter2)
    ;qsub8  r9, r9, r12                     ; qs0 = vp8_signed_char_clamp(qs0 - Filter2)
    ;qadd8  r11, r11, r10                   ; ps0 = vp8_signed_char_clamp(ps0 + u)

    ;end of modification for vp8

    bic         r12, r7, r6                 ; vp8_filter &= ~hev (r6 is free)
    ;mov        r12, r7

    ;roughly 3/7th difference across boundary
    mov         lr, #0x1b                   ; 27
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r7, r10, lr, r7
    smultb      r10, r10, lr
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    add         r10, r10, #63
    ssat        r7, #8, r7, asr #7
    ssat        r10, #8, r10, asr #7

    ldr         lr, c0x80808080

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r7, r10, lsl #16
    uxtb16      r6, r6
    uxtb16      r10, r10

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs0 - u)
    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps0 + u)
    eor         r8, r8, lr                  ; *oq0 = s^0x80
    eor         r10, r10, lr                ; *op0 = s^0x80

    strb        r10, [src, #-1]             ; store op0 result
    strb        r8, [src], pstep            ; store oq0 result
    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    strb        r10, [src, #-1]
    strb        r8, [src], pstep
    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    strb        r10, [src, #-1]
    strb        r8, [src], pstep
    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    strb        r10, [src, #-1]
    strb        r8, [src], pstep

    ;roughly 2/7th difference across boundary
    mov         lr, #0x12                   ; 18
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r9, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r9, #8, r9, asr #7
    ssat        r10, #8, r10, asr #7

    sub         src, src, pstep, lsl #2     ; move src pointer down by 4 lines

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r9, r10, lsl #16

    ldr         r9, [sp, #8]                ; load qs1
    ldr         r11, [sp, #12]              ; load ps1
    ldr         lr, c0x80808080

    uxtb16      r6, r6
    uxtb16      r10, r10

    add         src, src, #2

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

    qsub8       r8, r9, r10                 ; s = vp8_signed_char_clamp(qs1 - u)
    qadd8       r10, r11, r10               ; s = vp8_signed_char_clamp(ps1 + u)
    eor         r8, r8, lr                  ; *oq1 = s^0x80
    eor         r10, r10, lr                ; *op1 = s^0x80

    ldrb        r11, [src, #-5]             ; load p2 for 1/7th difference across boundary
    strb        r10, [src, #-4]             ; store op1
    strb        r8, [src, #-1]              ; store oq1
    ldrb        r9, [src], pstep            ; load q2 for 1/7th difference across boundary

    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8

    ldrb        r6, [src, #-5]
    strb        r10, [src, #-4]
    strb        r8, [src, #-1]
    ldrb        r7, [src], pstep

    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    orr         r11, r11, r6, lsl #8
    orr         r9, r9, r7, lsl #8

    ldrb        r6, [src, #-5]
    strb        r10, [src, #-4]
    strb        r8, [src, #-1]
    ldrb        r7, [src], pstep

    mov         r10, r10, lsr #8
    mov         r8, r8, lsr #8
    orr         r11, r11, r6, lsl #16
    orr         r9, r9, r7, lsl #16

    ldrb        r6, [src, #-5]
    strb        r10, [src, #-4]
    strb        r8, [src, #-1]
    ldrb        r7, [src], pstep
    orr         r11, r11, r6, lsl #24
    orr         r9, r9, r7, lsl #24
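    ; While the op1/oq1 bytes are written out row by row, the ldrb/orr pairs
    ; above gather p2 (at src-5) and q2 (at src) for the same four rows into
    ; packed words r11 and r9, ready for the 1/7th stage below.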
    ;roughly 1/7th difference across boundary
    eor         r9, r9, lr
    eor         r11, r11, lr

    mov         lr, #0x9                    ; 9
    mov         r7, #0x3f                   ; 63

    sxtb16      r6, r12
    sxtb16      r10, r12, ror #8
    smlabb      r8, r6, lr, r7
    smlatb      r6, r6, lr, r7
    smlabb      r12, r10, lr, r7
    smlatb      r10, r10, lr, r7
    ssat        r8, #8, r8, asr #7
    ssat        r6, #8, r6, asr #7
    ssat        r12, #8, r12, asr #7
    ssat        r10, #8, r10, asr #7

    sub         src, src, pstep, lsl #2

    pkhbt       r6, r8, r6, lsl #16
    pkhbt       r10, r12, r10, lsl #16

    uxtb16      r6, r6
    uxtb16      r10, r10

    ldr         lr, c0x80808080

    orr         r10, r6, r10, lsl #8        ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)

    qadd8       r8, r11, r10                ; s = vp8_signed_char_clamp(ps2 + u)
    qsub8       r10, r9, r10                ; s = vp8_signed_char_clamp(qs2 - u)
    eor         r8, r8, lr                  ; *op2 = s^0x80
    eor         r10, r10, lr                ; *oq2 = s^0x80

    strb        r8, [src, #-5]              ; store *op2
    strb        r10, [src], pstep           ; store *oq2
    mov         r8, r8, lsr #8
    mov         r10, r10, lsr #8
    strb        r8, [src, #-5]
    strb        r10, [src], pstep
    mov         r8, r8, lsr #8
    mov         r10, r10, lsr #8
    strb        r8, [src, #-5]
    strb        r10, [src], pstep
    mov         r8, r8, lsr #8
    mov         r10, r10, lsr #8
    strb        r8, [src, #-5]
    strb        r10, [src], pstep

    ;adjust src pointer for next loop
    sub         src, src, #2

|mbvskip_filter|
    sub         src, src, #4
    subs        count, count, #1

    pld         [src, #23]                  ; preload for next block
    ldrne       r6, [src], pstep            ; load source data
    pld         [src, #23]
    ldrne       r7, [src], pstep
    pld         [src, #23]
    ldrne       r8, [src], pstep
    pld         [src, #23]
    ldrne       lr, [src], pstep

    bne         MBVnext8

    add         sp, sp, #16

    ldmia       sp!, {r4 - r11, pc}
    ENDP        ; |vp8_mbloop_filter_vertical_edge_armv6|

; Constant Pool
c0x80808080 DCD     0x80808080
c0x03030303 DCD     0x03030303
c0x04040404 DCD     0x04040404
c0x01010101 DCD     0x01010101
c0x7F7F7F7F DCD     0x7F7F7F7F

    END