1;
2;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
; Make the entry point visible to the linker.
12    EXPORT  |vpx_variance_halfpixvar16x16_hv_media|
13
; Assemble the following instructions as ARM (A32), not Thumb.
14    ARM
; REQUIRE8:  this code requires 8-byte stack alignment.
; PRESERVE8: this code preserves 8-byte stack alignment.
15    REQUIRE8
16    PRESERVE8
17
; Read-only code section, aligned to 2^2 = 4 bytes.
18    AREA ||.text||, CODE, READONLY, ALIGN=2
19
;-----------------------------------------------------------------------
; vpx_variance_halfpixvar16x16_hv_media
;
; Compares a 16x16 block of source pixels, interpolated at the half-pixel
; position both horizontally and vertically, against a 16x16 reference
; block.  For every output pixel:
;
;     x    = (a + b + 1) >> 1     a, b = row N,     columns k and k + 1
;     y    = (c + d + 1) >> 1     c, d = row N + 1, columns k and k + 1
;     z    = (x + y + 1) >> 1
;     diff = z - ref
;
; and accumulates  sum += diff,  sse += diff * diff.
;
; Rounded-average trick (four pixels per word):
;     mvn    b                -> 255 - b in every byte
;     uhsub8 a, ~b            -> ((a + b + 1) >> 1) - 128   (mod 256)
;     eor    ..., 0x80808080  -> flips bit 7 back, giving (a + b + 1) >> 1
;
; In:
20; r0    unsigned char *src_ptr
21; r1    int source_stride
22; r2    unsigned char *ref_ptr
23; r3    int  recon_stride
24; stack unsigned int *sse
;
; Out:
;   r0    sse - ((sum * sum) >> 8)   (16 * 16 = 256 pixels, hence >> 8)
;   *sse  accumulated sum of squared differences
;
; Register usage inside the loop:
;   r0    source row N pointer (advanced by r1 once per row)
;   r2    reference row pointer (advanced by r3 once per row)
;   r4-r7 scratch
;   r8    sum (signed)
;   r9    source row N + 1 pointer (r0 + r1)
;   r10   0x80808080
;   r11   sse
;   r12   rows remaining
;   lr    constant zero
;
; Memory read: bytes 0..16 of 17 consecutive source rows and bytes 0..15
; of 16 reference rows.  The a/b and c/d word loads are one byte apart,
; so at least one load of each pair is not word-aligned; the code depends
; on unaligned LDR being permitted.
;-----------------------------------------------------------------------
25|vpx_variance_halfpixvar16x16_hv_media| PROC
26
; Save r4-r12 and lr: 10 words = 40 bytes, so the first stack argument
; (sse) is found at [sp, #40] after this push.
27    stmfd   sp!, {r4-r12, lr}
28
; Preload hints for the next source / reference row (r0 + r1, r2 + r3).
29    pld     [r0, r1, lsl #0]
30    pld     [r2, r3, lsl #0]
31
32    mov     r8, #0              ; initialize sum = 0
33    ldr     r10, c80808080      ; per-byte 0x80 mask used by the averaging trick
34    mov     r11, #0             ; initialize sse = 0
35    mov     r12, #16            ; set loop counter to 16 (=block height)
36    mov     lr, #0              ; constant zero
; One iteration processes one 16-pixel row as four groups of 4 pixels.
37loop
38    add     r9, r0, r1          ; pointer to pixels on the next row
39    ; 1st 4 pixels
40    ldr     r4, [r0, #0]        ; load source pixels a, row N
41    ldr     r6, [r0, #1]        ; load source pixels b, row N
42    ldr     r5, [r9, #0]        ; load source pixels c, row N+1
43    ldr     r7, [r9, #1]        ; load source pixels d, row N+1
44
45    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
46    mvn     r6, r6
47    uhsub8  r4, r4, r6
48    eor     r4, r4, r10
49    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
50    mvn     r7, r7
51    uhsub8  r5, r5, r7
52    eor     r5, r5, r10
53    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
54    mvn     r5, r5
55    uhsub8  r4, r4, r5
56    ldr     r5, [r2, #0]        ; load 4 ref pixels
57    eor     r4, r4, r10
58
; usub8 sets the per-byte GE flags where no borrow occurred; sel then
; keeps those bytes of the difference and takes zero (lr) for the rest.
59    usub8   r6, r4, r5          ; calculate difference (z - ref)
60    pld     [r0, r1, lsl #1]    ; preload hint: source, two rows ahead
61    sel     r7, r6, lr          ; keep bytes where z >= ref
62    usub8   r6, r5, r4          ; calculate difference with reversed operands
63    pld     [r2, r3, lsl #1]    ; preload hint: reference, two rows ahead
64    sel     r6, r6, lr          ; keep bytes where ref >= z
65
66    ; calculate partial sums
67    usad8   r4, r7, lr          ; calculate sum of positive differences
68    usad8   r5, r6, lr          ; calculate sum of negative differences
69    orr     r6, r6, r7          ; absolute differences of all 4 pixels
70    ; calculate total sum
; NOTE(review): the S suffix on the next two instructions updates the
; condition flags, but no instruction in this loop reads them before
; "subs r12" overwrites them; the other three groups use plain add/sub.
71    adds    r8, r8, r4          ; add positive differences to sum
72    subs    r8, r8, r5          ; subtract negative differences from sum
73
74    ; calculate sse
75    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
76    uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
77    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
78
79    ; 2nd 4 pixels
80    ldr     r4, [r0, #4]        ; load source pixels a, row N
81    ldr     r6, [r0, #5]        ; load source pixels b, row N
82    ldr     r5, [r9, #4]        ; load source pixels c, row N+1
83
; Second half of the previous group's sse update; it must come before
; r7 is reloaded below.
84    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
85
86    ldr     r7, [r9, #5]        ; load source pixels d, row N+1
87
88    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
89    mvn     r6, r6
90    uhsub8  r4, r4, r6
91    eor     r4, r4, r10
92    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
93    mvn     r7, r7
94    uhsub8  r5, r5, r7
95    eor     r5, r5, r10
96    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
97    mvn     r5, r5
98    uhsub8  r4, r4, r5
99    ldr     r5, [r2, #4]        ; load 4 ref pixels
100    eor     r4, r4, r10
101
102    usub8   r6, r4, r5          ; calculate difference (z - ref)
103    sel     r7, r6, lr          ; keep bytes where z >= ref
104    usub8   r6, r5, r4          ; calculate difference with reversed operands
105    sel     r6, r6, lr          ; keep bytes where ref >= z
106
107    ; calculate partial sums
108    usad8   r4, r7, lr          ; calculate sum of positive differences
109    usad8   r5, r6, lr          ; calculate sum of negative differences
110    orr     r6, r6, r7          ; absolute differences of all 4 pixels
111
112    ; calculate total sum
113    add     r8, r8, r4          ; add positive differences to sum
114    sub     r8, r8, r5          ; subtract negative differences from sum
115
116    ; calculate sse
117    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
118    uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
119    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
120
121    ; 3rd 4 pixels
122    ldr     r4, [r0, #8]        ; load source pixels a, row N
123    ldr     r6, [r0, #9]        ; load source pixels b, row N
124    ldr     r5, [r9, #8]        ; load source pixels c, row N+1
125
126    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
127
128    ldr     r7, [r9, #9]        ; load source pixels d, row N+1
129
130    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
131    mvn     r6, r6
132    uhsub8  r4, r4, r6
133    eor     r4, r4, r10
134    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
135    mvn     r7, r7
136    uhsub8  r5, r5, r7
137    eor     r5, r5, r10
138    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
139    mvn     r5, r5
140    uhsub8  r4, r4, r5
141    ldr     r5, [r2, #8]        ; load 4 ref pixels
142    eor     r4, r4, r10
143
144    usub8   r6, r4, r5          ; calculate difference (z - ref)
145    sel     r7, r6, lr          ; keep bytes where z >= ref
146    usub8   r6, r5, r4          ; calculate difference with reversed operands
147    sel     r6, r6, lr          ; keep bytes where ref >= z
148
149    ; calculate partial sums
150    usad8   r4, r7, lr          ; calculate sum of positive differences
151    usad8   r5, r6, lr          ; calculate sum of negative differences
152    orr     r6, r6, r7          ; absolute differences of all 4 pixels
153
154    ; calculate total sum
155    add     r8, r8, r4          ; add positive differences to sum
156    sub     r8, r8, r5          ; subtract negative differences from sum
157
158    ; calculate sse
159    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
160    uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
161    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
162
163    ; 4th 4 pixels
164    ldr     r4, [r0, #12]       ; load source pixels a, row N
165    ldr     r6, [r0, #13]       ; load source pixels b, row N (reads bytes 13..16)
166    ldr     r5, [r9, #12]       ; load source pixels c, row N+1
167    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
168    ldr     r7, [r9, #13]       ; load source pixels d, row N+1
169
170    ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
171    mvn     r6, r6
172    uhsub8  r4, r4, r6
173    eor     r4, r4, r10
174    ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
175    mvn     r7, r7
176    uhsub8  r5, r5, r7
177    eor     r5, r5, r10
178    ; z = (x + y + 1) >> 1, interpolate half pixel values vertically
179    mvn     r5, r5
180    uhsub8  r4, r4, r5
181    ldr     r5, [r2, #12]       ; load 4 ref pixels
182    eor     r4, r4, r10
183
; All loads for this row are done, so both row pointers are advanced
; here, interleaved with the difference computation.
184    usub8   r6, r4, r5          ; calculate difference (z - ref)
185    add     r0, r0, r1          ; set src_ptr to next row
186    sel     r7, r6, lr          ; keep bytes where z >= ref
187    usub8   r6, r5, r4          ; calculate difference with reversed operands
188    add     r2, r2, r3          ; set ref_ptr to next row
189    sel     r6, r6, lr          ; keep bytes where ref >= z
190
191    ; calculate partial sums
192    usad8   r4, r7, lr          ; calculate sum of positive differences
193    usad8   r5, r6, lr          ; calculate sum of negative differences
194    orr     r6, r6, r7          ; absolute differences of all 4 pixels
195
196    ; calculate total sum
197    add     r8, r8, r4          ; add positive differences to sum
198    sub     r8, r8, r5          ; subtract negative differences from sum
199
200    ; calculate sse
201    uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords
202    uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (other two pixels) to halfwords
203    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
204    subs    r12, r12, #1        ; one row done; sets Z for the bne below
205    smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2)
206
207    bne     loop                ; repeat until all 16 rows are processed
208
209    ; return stuff
210    ldr     r6, [sp, #40]       ; get address of sse (first stack argument)
211    mul     r0, r8, r8          ; sum * sum
212    str     r11, [r6]           ; store sse
213    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
214
; Restore r4-r12 and return by loading the saved lr into pc.
215    ldmfd   sp!, {r4-r12, pc}
216
217    ENDP
218
; 32-bit literal with 0x80 in every byte.  The function above loads it
; into r10 and XORs it into each uhsub8 result to flip bit 7 of every
; byte, completing the per-byte rounded average.
219c80808080
220    DCD     0x80808080
221
; End of the assembly source.
222    END
223