;
;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT |vp8_short_fdct4x4_armv6|

    ARM
    REQUIRE8
    PRESERVE8

    AREA    |.text|, CODE, READONLY

;-----------------------------------------------------------------------
; void vp8_short_fdct4x4_armv6(short *input, short *output, int pitch)
;
; 4x4 forward DCT using the ARMv6 SIMD media extensions (dual-16-bit
; qadd16/qsub16, smuad/smusd, smlad/smlsdx, pkhbt/pkhtb).
; In:   r0 = input  (4 rows of 4 int16 samples, rows 'pitch' bytes apart)
;       r1 = output (16 int16 coefficients, row-major)
;       r2 = pitch  (byte stride between input rows)
; r4-r12 and lr are saved/restored on the stack; r0-r3 are clobbered.
; PART 1 transforms the four rows: odd-position results of rows 0-3 are
; written to the output buffer, while one packed word per row is kept in
; a register (r3, r9, r2, r0) for PART 2, which then transforms the
; columns in place in the output buffer.
;-----------------------------------------------------------------------
|vp8_short_fdct4x4_armv6| PROC

    stmfd       sp!, {r4 - r12, lr}

    ; PART 1 — row transform

    ; coeffs 0-3 (row 0)
    ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]

    ldr         r10, c7500          ; rounding constant for o3/o7/...
    ldr         r11, c14500         ; rounding constant for o1/o5/...
    ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4] packed multipliers
    ldr         lr, c0x00080008     ; [8 | 8] — row pass scales sums by 8
    ror         r5, r5, #16         ; [i2 | i3] — swap so adds pair i1+i2, i0+i3

    qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift
    qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift

    add         r0, r0, r2          ; advance input pointer by one row (pitch)

    qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd
                                    ; with 2217*4 and 5352*4 without losing the
                                    ; sign bit (overflow)

    smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8
    smusd       r5, r6, lr          ; o2 = (i1+i2)*8 - (i0+i3)*8

    smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 + 14500)
    smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 + 7500)

    ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6] — prefetch row 1

    pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
    pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]  (lsl #4 completes the >>12 scale)

    str         r6, [r1, #4]        ; store [o3 | o2]

    ; coeffs 4-7 (row 1)
    ror         r9, r9, #16         ; [i6 | i7]

    qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift
    qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift

    add         r0, r0, r2          ; advance input pointer by one row (pitch)

    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
                                    ; with 2217*4 and 5352*4 without losing the
                                    ; sign bit (overflow)

    smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8
    smusd       r8, r6, lr          ; o6 = (i5+i6)*8 - (i4+i7)*8

    smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 + 14500)
    smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 + 7500)

    ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10] — prefetch row 2

    pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
    pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]

    str         r6, [r1, #12]       ; store [o7 | o6]

    ; coeffs 8-11 (row 2)
    ror         r5, r5, #16         ; [i10 | i11]

    qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift
    qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift

    add         r0, r0, r2          ; advance input pointer by one row (pitch)

    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
                                    ; with 2217*4 and 5352*4 without losing the
                                    ; sign bit (overflow)

    smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8
                                    ; (r2/pitch no longer needed past this point)
    smusd       r8, r6, lr          ; o10 = (i9+i10)*8 - (i8+i11)*8

    smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 + 14500)
    smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 + 7500)

    ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14] — prefetch row 3

    pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
    pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]

    str         r6, [r1, #20]       ; store [o11 | o10]

    ; coeffs 12-15 (row 3)
    ror         r5, r5, #16         ; [i14 | i15]

    qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift
    qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift

    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
                                    ; with 2217*4 and 5352*4 without losing the
                                    ; sign bit (overflow)

    smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8
    smusd       r5, r6, lr          ; o14 = (i13+i14)*8 - (i12+i15)*8

    smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 + 14500)
    smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 + 7500)

    pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
                                    ; (input pointer in r0 no longer needed)
    pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]

    str         r6, [r1, #28]       ; store [o15 | o14]


    ; PART 2 — column transform --------------------------------------
    ; Columns 0/1 first, from the row results still held in registers:
    ;   r3=[o1|o0]  r9=[o5|o4]  r2=[o9|o8]  r0=[o13|o12]
    ldr         r11, c12000         ; rounding constant (d1 == 0 case)
    ldr         r10, c51000         ; rounding constant for o12..o15 path
    ldr         lr, c0x00070007     ; [7 | 7] — "+7" rounding before >>4

    qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
    qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
    qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
    qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]

    qadd16      r4, r4, lr          ; a1 + 7

    add         r0, r11, #0x10000   ; r0 = 12000 + (1<<16): rounding constant
                                    ; plus the "+1 if d1 != 0" bias, both
                                    ; applied before the final >>16

    qadd16      r2, r4, r5          ; a1 + b1 + 7
    qsub16      r3, r4, r5          ; a1 - b1 + 7

    ldr         r12, c0x08a914e8    ; [2217 | 5352] packed multipliers

    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
    asr         r2, r2, #4          ; scale top halfword: (a1+b1+7)>>4
    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
    asr         r3, r3, #4          ; scale top halfword: (a1-b1+7)>>4
    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword

    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
    str         r4, [r1, #0]        ; [ o1 | o0]
    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
    str         r5, [r1, #16]       ; [ o9 | o8]

    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352] accumulated
    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ] accumulated

    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
    smultb      r3, r6, r12         ; [c1*5352 | ------ ]

    lsls        r6, r7, #16         ; Z set iff bottom halfword of d1 == 0
    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
    asrs        r6, r7, #16         ; Z set iff top halfword of d1 == 0
    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)

    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
    smlatt     r5, r7, r12, r10     ; [d1*2217 | ------ ] + 51000

    pkhtb       r9, r9, r8, asr #16 ; [o5 | o4] after >>16

    sub         r4, r4, r2          ; d1*2217 - c1*5352 + 51000 (bottom)
    sub         r5, r5, r3          ; d1*2217 - c1*5352 + 51000 (top)

    ldr         r3, [r1, #4]        ; [i3 | i2] — reload for columns 2/3

    pkhtb       r5, r5, r4, asr #16 ; [o13|o12]

    str         r9, [r1, #8]        ; [o5 | o4]

    ; Columns 2/3, from the row results stored in the output buffer.
    ldr         r9, [r1, #12]       ; [i7 | i6]
    ldr         r8, [r1, #28]       ; [i15|i14]
    ldr         r2, [r1, #20]       ; [i11|i10]
    str         r5, [r1, #24]       ; [o13|o12]

    qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
    qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]

    qadd16      r4, r4, lr          ; a1 + 7

    qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
    qadd16      r2, r4, r5          ; a1 + b1 + 7
    qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
    qsub16      r3, r4, r5          ; a1 - b1 + 7

    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
    asr         r2, r2, #4          ; scale top halfword: (a1+b1+7)>>4
    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
    asr         r3, r3, #4          ; scale top halfword: (a1-b1+7)>>4
    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword

    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
    str         r4, [r1, #4]        ; [ o3 | o2]
    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
    str         r5, [r1, #20]       ; [ o11 | o10]

    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352] accumulated
    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ] accumulated

    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
    smultb      r3, r6, r12         ; [c1*5352 | ------ ]

    lsls        r6, r7, #16         ; Z set iff bottom halfword of d1 == 0
    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)

    asrs        r6, r7, #16         ; Z set iff top halfword of d1 == 0
    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)

    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
    smlatt     r5, r7, r12, r10     ; [d1*2217 | ------ ] + 51000

    pkhtb       r9, r9, r8, asr #16 ; [o7 | o6] after >>16

    sub         r4, r4, r2          ; d1*2217 - c1*5352 + 51000 (bottom)
    sub         r5, r5, r3          ; d1*2217 - c1*5352 + 51000 (top)

    str         r9, [r1, #12]       ; [o7 | o6]
    pkhtb       r5, r5, r4, asr #16 ; [o15|o14]

    str         r5, [r1, #28]       ; [o15|o14]

    ldmfd       sp!, {r4 - r12, pc} ; restore callee-saved regs and return

    ENDP

; Literal-pool constants (loaded PC-relative with ldr above)
c7500
    DCD     7500
c14500
    DCD     14500
c0x22a453a0
    DCD     0x22a453a0
c0x00080008
    DCD     0x00080008
c12000
    DCD     12000
c51000
    DCD     51000
c0x00070007
    DCD     0x00070007
c0x08a914e8
    DCD     0x08a914e8

    END