;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_short_idct4x4llm_v6_dual|

    AREA    |.text|, CODE, READONLY


;-----------------------------------------------------------------------------
; vp8_short_idct4x4llm_v6_dual
;
; 4x4 inverse transform of 16 signed 16-bit coefficients, followed by adding
; the result to a 4x4 block of predictor bytes and storing the saturated sum.
; Two coefficients are processed per instruction ("dual") by keeping them in
; the top and bottom halfwords of one register.
;
; Arguments follow the C routine's prototype:
;
; void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
;                             unsigned char *dst, int stride)
; r0    short* input
; r1    unsigned char* pred
; r2    int pitch               (byte step between pred rows)
; r3    unsigned char* dst
; sp    int stride              (first stacked argument; byte step between
;                                dst rows)
;
; Structure:
;   loop1_dual  first pass, two columns per iteration, two iterations.
;               Results are written back over the input buffer.
;   loop2_dual  second pass, two rows per iteration, two iterations.
;               Each result is rounded ((x + 4) >> 3), added to the matching
;               pred byte, clamped to 0..255 with usat and stored to dst.
;
; Side effects:
;   - the 16 shorts at input are overwritten with first-pass results
;   - 4 bytes are stored at each of dst, dst+stride, dst+2*stride,
;     dst+3*stride
;
; Registers: r4-r11 and lr are saved and restored; r0-r3, r12 and the
;            condition flags are clobbered.
; Stack:     36 bytes of saved registers plus one 4-byte slot that holds the
;            input pointer during loop2_dual (r0 is reused as scratch there).
;
; Loop counter: bit 31 of r5. "subs r5, r5, #1<<31" leaves carry set (no
; borrow) the first time and carry clear the second time, so each loop body
; runs exactly twice. The second subtraction also sets bit 31 again, which
; re-arms the counter for loop2_dual. The multiplies only read the bottom
; halfword of r5, so the counter bit never disturbs the constant.
;
; NOTE(review): input is read/written and dst is written with word-sized
;               ldr/str; the alignment callers must provide is not visible
;               in this file - confirm.
; NOTE(review): the "hi | lo" lane comments (e.g. "i5 | i4") and the byte
;               packing of the output word presume little-endian data -
;               confirm for the target build.
;-----------------------------------------------------------------------------

|vp8_short_idct4x4llm_v6_dual| PROC
    stmdb   sp!, {r4-r11, lr}       ; save callee-saved registers (36 bytes)

    sub     sp, sp, #4              ; 4-byte slot for the saved input pointer

    mov     r4, #0x00008A00         ; sin
    orr     r4, r4, #0x0000008C     ; r4 = 0x8A8C = sinpi8sqrt2

    mov     r5, #0x00004E00         ; cos
    orr     r5, r5, #0x0000007B     ; r5 = 0x4E7B = cospi8sqrt2minus1
    orr     r5, r5, #1<<31          ; loop counter on top bit

    ; ---- first pass ---------------------------------------------------------
    ; Lane comments are written "top halfword | bottom halfword". The element
    ; indices shown are those of the first iteration; the second iteration
    ; runs with r0 advanced by 4 bytes (two shorts).
loop1_dual
    ldr     r6, [r0, #(4*2)]        ; i5 | i4
    ldr     r12, [r0, #(12*2)]      ; i13|i12
    ldr     r14, [r0, #(8*2)]       ; i9 | i8

    ; smulbt/smulbb return the full 32-bit product; the >> 16 is taken when
    ; the products are packed by pkhtb below. smulwt/smulwb already return
    ; the 32x16 product shifted right by 16.
    smulbt  r9, r5, r6              ; ip[5] * cospi8sqrt2minus1
    smulbb  r7, r5, r6              ; ip[4] * cospi8sqrt2minus1
    smulwt  r10, r4, r6             ; (ip[5] * sinpi8sqrt2) >> 16
    smulwb  r8, r4, r6              ; (ip[4] * sinpi8sqrt2) >> 16

    smulbt  r11, r5, r12            ; ip[13] * cospi8sqrt2minus1
    pkhtb   r7, r9, r7, asr #16     ; 5c | 4c   (each product >> 16)
    pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s
    uadd16  r6, r6, r7              ; 5c+5 | 4c+4

    smulwt  r7, r4, r12             ; (ip[13] * sinpi8sqrt2) >> 16
    smulbb  r9, r5, r12             ; ip[12] * cospi8sqrt2minus1
    smulwb  r10, r4, r12            ; (ip[12] * sinpi8sqrt2) >> 16

    ; Carry from this subtraction is consumed by the bcs at the end of the
    ; loop; no instruction in between writes the carry flag.
    subs    r5, r5, #1<<31          ; i--

    pkhtb   r9, r11, r9, asr #16    ; 13c | 12c (each product >> 16)
    ldr     r11, [r0]               ; i1 | i0
    pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s
    uadd16  r7, r12, r9             ; 13c+13 | 12c+12

    usub16  r7, r8, r7              ; c = (5s | 4s) - (13c+13 | 12c+12)
    uadd16  r6, r6, r10             ; d = (5c+5 | 4c+4) + (13s | 12s)
    uadd16  r10, r11, r14           ; a = (i1 | i0) + (i9 | i8)
    usub16  r8, r11, r14            ; b = (i1 | i0) - (i9 | i8)

    uadd16  r9, r10, r6             ; a+d
    usub16  r10, r10, r6            ; a-d
    uadd16  r6, r8, r7              ; b+c
    usub16  r7, r8, r7              ; b-c

    ; use input buffer to store intermediate results
    str      r6, [r0, #(4*2)]       ; o5 | o4   = b+c
    str      r7, [r0, #(8*2)]       ; o9 | o8   = b-c
    str      r10,[r0, #(12*2)]      ; o13|o12   = a-d
    str      r9, [r0], #4           ; o1 | o0   = a+d, then r0 += 4 (next two columns)

    bcs loop1_dual                  ; carry set after the first subs only

    sub     r0, r0, #8              ; reset input/output (undo the two +4 steps)
    str     r0, [sp]                ; keep the pointer: r0 is scratch in loop2

    ; ---- second pass --------------------------------------------------------
    ; Each iteration handles two rows at once: the earlier row of the pair is
    ; carried in the top halfwords, the later row in the bottom halfwords.
    ; Element indices in the comments are those of the first iteration
    ; (rows 0 and 1); the second iteration starts 16 bytes further on.
loop2_dual

    ldr     r6, [r0, #(4*2)]        ; i5 | i4
    ldr     r12,[r0, #(2*2)]        ; i3 | i2
    ldr     r14,[r0, #(6*2)]        ; i7 | i6
    ldr     r0, [r0, #(0*2)]        ; i1 | i0   (pointer in r0 is overwritten here)

    smulbt  r9, r5, r6              ; ip[5] * cospi8sqrt2minus1
    smulbt  r7, r5, r0              ; ip[1] * cospi8sqrt2minus1
    smulwt  r10, r4, r6             ; (ip[5] * sinpi8sqrt2) >> 16
    smulwt  r8, r4, r0              ; (ip[1] * sinpi8sqrt2) >> 16

    pkhbt   r11, r6, r0, lsl #16    ; i0 | i4
    pkhtb   r7, r7, r9, asr #16     ; 1c | 5c   (each product >> 16)
    pkhtb   r0, r0, r6, asr #16     ; i1 | i5
    pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1

    uadd16  r0, r7, r0              ; 1c+1 | 5c+5 = temp2
    pkhbt   r9, r14, r12, lsl #16   ; i2 | i6
    uadd16  r10, r11, r9            ; a = (i0 | i4) + (i2 | i6)
    usub16  r9, r11, r9             ; b = (i0 | i4) - (i2 | i6)
    pkhtb   r6, r12, r14, asr #16   ; i3 | i7

    ; Carry from this subtraction is consumed by the bcs at the end of the
    ; loop; none of the add/usat/sxth/ldr/str instructions in between are
    ; flag-setting forms that write the carry flag.
    subs    r5, r5, #1<<31          ; i--

    smulbt  r7, r5, r6              ; ip[3] * cospi8sqrt2minus1
    smulwt  r11, r4, r6             ; (ip[3] * sinpi8sqrt2) >> 16
    smulbb  r12, r5, r6             ; ip[7] * cospi8sqrt2minus1
    smulwb  r14, r4, r6             ; (ip[7] * sinpi8sqrt2) >> 16

    pkhtb   r7, r7, r12, asr #16    ; 3c | 7c   (each product >> 16)
    pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1

    uadd16  r6, r7, r6              ; 3c+3 | 7c+7 = temp2
    usub16  r12, r8, r6             ; c = (1s | 5s) - (3c+3 | 7c+7)
    uadd16  r6, r11, r0             ; d = (3s | 7s) + (1c+1 | 5c+5)
    uadd16  r7, r10, r6             ; a+d

    mov     r8, #4                  ; set up 4's
    orr     r8, r8, #0x40000        ; 4|4 : rounding term for both lanes

    usub16  r6, r10, r6             ; a-d
    uadd16  r6, r6, r8              ; a-d+4, 3|7
    uadd16  r7, r7, r8              ; a+d+4, 0|4
    uadd16  r10, r9, r12            ; b+c
    usub16  r0, r9, r12             ; b-c
    uadd16  r10, r10, r8            ; b+c+4, 1|5
    uadd16  r8, r0, r8              ; b-c+4, 2|6

    ; 36 bytes of saved registers + the 4-byte slot = 40: first stacked
    ; argument. Reloaded every iteration because lr (r14) is scratch above.
    ldr     lr, [sp, #40]           ; dst stride

    ldrb    r0, [r1]                ; pred p0
    ldrb    r11, [r1, #1]           ; pred p1
    ldrb    r12, [r1, #2]           ; pred p2

    ; asr #19 = take the top halfword lane (>> 16) and apply the final >> 3.
    add     r0, r0, r7, asr #19     ; p0 + o0
    add     r11, r11, r10, asr #19  ; p1 + o1
    add     r12, r12, r8, asr #19   ; p2 + o2

    usat    r0, #8, r0              ; d0 = clip8(p0 + o0)
    usat    r11, #8, r11            ; d1 = clip8(p1 + o1)
    usat    r12, #8, r12            ; d2 = clip8(p2 + o2)

    add     r0, r0, r11, lsl #8     ; |--|--|d1|d0|

    ldrb    r11, [r1, #3]           ; pred p3

    add     r0, r0, r12, lsl #16    ; |--|d2|d1|d0|

    add     r11, r11, r6, asr #19   ; p3 + o3

    ; The top lanes of r7/r10/r8/r6 have all been consumed above, so the
    ; registers can now be reduced to their sign-extended bottom lanes
    ; (the second row of the pair).
    sxth    r7, r7                  ; o4 + 4 (a+d+4, bottom lane)
    sxth    r10, r10                ; o5 + 4 (b+c+4, bottom lane)

    usat    r11, #8, r11            ; d3 = clip8(p3 + o3)

    sxth    r8, r8                  ; o6 + 4 (b-c+4, bottom lane)
    sxth    r6, r6                  ; o7 + 4 (a-d+4, bottom lane)

    add     r0, r0, r11, lsl #24    ; |d3|d2|d1|d0|

    ldrb    r12, [r1, r2]!          ; pred p4 (writeback: r1 += pitch)
    str     r0, [r3], lr            ; store first row of the pair, dst += stride
    ldrb    r11, [r1, #1]           ; pred p5

    add     r12, r12, r7, asr #3    ; p4 + o4
    add     r11, r11, r10, asr #3   ; p5 + o5

    usat    r12, #8, r12            ; d4 = clip8(p4 + o4)
    usat    r11, #8, r11            ; d5 = clip8(p5 + o5)

    ldrb    r7, [r1, #2]            ; pred p6
    ldrb    r10, [r1, #3]           ; pred p7

    add     r12, r12, r11, lsl #8   ; |--|--|d5|d4|

    add     r7, r7, r8, asr #3      ; p6 + o6
    add     r10, r10, r6, asr #3    ; p7 + o7

    ldr     r0, [sp]                ; load input pointer (saved before loop2)

    usat    r7, #8, r7              ; d6 = clip8(p6 + o6)
    usat    r10, #8, r10            ; d7 = clip8(p7 + o7)

    add     r12, r12, r7, lsl #16   ; |--|d6|d5|d4|
    add     r12, r12, r10, lsl #24  ; |d7|d6|d5|d4|

    str     r12, [r3], lr           ; store second row of the pair, dst += stride
    add     r0, r0, #16             ; input + 8 shorts: next pair of rows
    add     r1, r1, r2              ; pred + pitch

    bcs loop2_dual                  ; carry set after the first subs only

    add     sp, sp, #4              ; release the saved-input-pointer slot
    ldmia   sp!, {r4 - r11, pc}     ; restore registers and return

    ENDP

    END