;
;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11
; Entry points made visible to the linker.
    EXPORT  |vpx_variance16x16_media|
    EXPORT  |vpx_variance8x8_media|
    EXPORT  |vpx_mse16x16_media|

    ARM                         ; assemble the following code as ARM, not Thumb
    REQUIRE8                    ; declares that this code needs an 8-byte aligned stack
    PRESERVE8                   ; declares that this code keeps the stack 8-byte aligned

    ; Read-only code section; ALIGN=2 requests 2^2 = 4-byte alignment.
    AREA ||.text||, CODE, READONLY, ALIGN=2
21
;-----------------------------------------------------------------------
; vpx_variance16x16_media
;
; Walks a 16x16 block of bytes, four pixels at a time, accumulating
;   sum = signed sum of (src - ref)
;   sse = sum of (src - ref)^2
; then stores sse through the stack argument and returns
;   sse - ((sum * sum) >> 8)
;
; In:
;   r0    unsigned char *src_ptr
;   r1    int source_stride
;   r2    unsigned char *ref_ptr
;   r3    int  recon_stride
;   stack unsigned int *sse
; Out:
;   r0    sse - ((sum * sum) >> 8)
;   *sse  sum of squared differences
;
; Register roles inside the loop:
;   r8  = sum           r11 = sse
;   r12 = rows left     lr  = constant zero
;   r4-r7, r9, r10 = scratch
;-----------------------------------------------------------------------
|vpx_variance16x16_media| PROC

    push    {r4-r12, lr}        ; 10 words pushed, so sse is at [sp, #40]

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r8, #0              ; sum = 0
    mov     r11, #0             ; sse = 0
    mov     r12, #16            ; rows left (= block height)
    mov     lr, #0              ; constant zero; only read inside the loop

loop16x16
    ; 1st 4 pixels
    ldr     r4, [r0, #0]        ; 4 src pixels
    ldr     r5, [r2, #0]        ; 4 ref pixels

    pld     [r0, r1, lsl #1]    ; prefetch src two rows ahead
    pld     [r2, r3, lsl #1]    ; prefetch ref two rows ahead

    usub8   r6, r4, r5          ; src - ref per byte; GE set where src >= ref
    sel     r7, r6, lr          ; keep (src - ref) where src >= ref, else 0
    usub8   r9, r5, r4          ; ref - src per byte; GE set where ref >= src
    sel     r6, r9, lr          ; keep (ref - src) where ref >= src, else 0

    ; partial sums
    usad8   r4, r7, lr          ; sum of the positive differences
    usad8   r5, r6, lr          ; sum of the magnitudes of the negative ones
    orr     r6, r6, r7          ; |src - ref| for all 4 pixels

    ; total sum (flags are not needed here, so no S suffix)
    add     r8, r8, r4          ; sum += positive differences
    sub     r8, r8, r5          ; sum -= negative magnitudes

    ; sse
    uxtb16  r5, r6              ; pixels 0 and 2 widened to halfwords
    uxtb16  r10, r6, ror #8     ; pixels 1 and 3 widened to halfwords
    smlad   r11, r5, r5, r11    ; sse += d0*d0 + d2*d2

    ; 2nd 4 pixels (loads issued before the second smlad of the 1st group)
    ldr     r4, [r0, #4]        ; 4 src pixels
    ldr     r5, [r2, #4]        ; 4 ref pixels
    smlad   r11, r10, r10, r11  ; sse += d1*d1 + d3*d3

    usub8   r6, r4, r5          ; src - ref per byte
    sel     r7, r6, lr          ; positive differences
    usub8   r9, r5, r4          ; ref - src per byte
    sel     r6, r9, lr          ; negative magnitudes

    ; partial sums
    usad8   r4, r7, lr          ; sum of the positive differences
    usad8   r5, r6, lr          ; sum of the negative magnitudes
    orr     r6, r6, r7          ; |src - ref| for all 4 pixels

    ; total sum
    add     r8, r8, r4
    sub     r8, r8, r5

    ; sse
    uxtb16  r5, r6              ; pixels 0 and 2 widened to halfwords
    uxtb16  r10, r6, ror #8     ; pixels 1 and 3 widened to halfwords
    smlad   r11, r5, r5, r11    ; sse += d0*d0 + d2*d2

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]        ; 4 src pixels
    ldr     r5, [r2, #8]        ; 4 ref pixels
    smlad   r11, r10, r10, r11  ; sse += d1*d1 + d3*d3

    usub8   r6, r4, r5          ; src - ref per byte
    sel     r7, r6, lr          ; positive differences
    usub8   r9, r5, r4          ; ref - src per byte
    sel     r6, r9, lr          ; negative magnitudes

    ; partial sums
    usad8   r4, r7, lr          ; sum of the positive differences
    usad8   r5, r6, lr          ; sum of the negative magnitudes
    orr     r6, r6, r7          ; |src - ref| for all 4 pixels

    ; total sum
    add     r8, r8, r4
    sub     r8, r8, r5

    ; sse
    uxtb16  r5, r6              ; pixels 0 and 2 widened to halfwords
    uxtb16  r10, r6, ror #8     ; pixels 1 and 3 widened to halfwords
    smlad   r11, r5, r5, r11    ; sse += d0*d0 + d2*d2

    ; 4th 4 pixels
    ldr     r4, [r0, #12]       ; 4 src pixels
    ldr     r5, [r2, #12]       ; 4 ref pixels
    smlad   r11, r10, r10, r11  ; sse += d1*d1 + d3*d3

    ; The row's last loads are done, so both pointers can advance here.
    ; A plain add does not touch the GE flags that sel reads.
    usub8   r6, r4, r5          ; src - ref per byte
    add     r0, r0, r1          ; src_ptr -> next row
    sel     r7, r6, lr          ; positive differences
    usub8   r9, r5, r4          ; ref - src per byte
    add     r2, r2, r3          ; ref_ptr -> next row
    sel     r6, r9, lr          ; negative magnitudes

    ; partial sums
    usad8   r4, r7, lr          ; sum of the positive differences
    usad8   r5, r6, lr          ; sum of the negative magnitudes
    orr     r6, r6, r7          ; |src - ref| for all 4 pixels

    ; total sum
    add     r8, r8, r4
    sub     r8, r8, r5

    ; sse
    uxtb16  r5, r6              ; pixels 0 and 2 widened to halfwords
    uxtb16  r10, r6, ror #8     ; pixels 1 and 3 widened to halfwords
    smlad   r11, r5, r5, r11    ; sse += d0*d0 + d2*d2
    smlad   r11, r10, r10, r11  ; sse += d1*d1 + d3*d3

    subs    r12, r12, #1        ; one row done
    bne     loop16x16

    ; return stuff
    ldr     r6, [sp, #40]       ; address of sse (first stack argument)
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; *sse = sse
    sub     r0, r11, r0, lsr #8 ; return sse - ((sum * sum) >> 8)

    pop     {r4-r12, pc}

    ENDP
154
;-----------------------------------------------------------------------
; vpx_variance8x8_media
;
; Walks an 8x8 block of bytes, four pixels at a time, accumulating
;   sum = signed sum of (src - ref)
;   sse = sum of (src - ref)^2
; then stores sse through the stack argument and returns
;   sse - ((sum * sum) >> 6)
;
; In:
;   r0    unsigned char *src_ptr
;   r1    int source_stride
;   r2    unsigned char *ref_ptr
;   r3    int  recon_stride
;   stack unsigned int *sse
; Out:
;   r0    sse - ((sum * sum) >> 6)
;   *sse  sum of squared differences
;
; Register roles inside the loop:
;   r4  = sum           r5  = sse
;   r12 = rows left     lr  = constant zero
;   r6-r10 = scratch
;-----------------------------------------------------------------------
|vpx_variance8x8_media| PROC

    push    {r4-r10, lr}        ; 8 words pushed, so sse is at [sp, #32]

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r12, #8             ; rows left (= block height)
    mov     r4, #0              ; sum = 0
    mov     r5, #0              ; sse = 0
    mov     lr, #0              ; constant zero; only read inside the loop

loop8x8
    ; 1st 4 pixels
    ldr     r6, [r0, #0x0]      ; 4 src pixels
    ldr     r7, [r2, #0x0]      ; 4 ref pixels

    pld     [r0, r1, lsl #1]    ; prefetch src two rows ahead
    pld     [r2, r3, lsl #1]    ; prefetch ref two rows ahead

    usub8   r8, r6, r7          ; src - ref per byte; GE set where src >= ref
    sel     r10, r8, lr         ; keep (src - ref) where src >= ref, else 0
    usub8   r9, r7, r6          ; ref - src per byte; GE set where ref >= src
    sel     r8, r9, lr          ; keep (ref - src) where ref >= src, else 0

    ; partial sums
    usad8   r6, r10, lr         ; sum of the positive differences
    usad8   r7, r8, lr          ; sum of the magnitudes of the negative ones
    orr     r8, r8, r10         ; |src - ref| for all 4 pixels

    ; total sum
    add     r4, r4, r6          ; sum += positive differences
    sub     r4, r4, r7          ; sum -= negative magnitudes

    ; sse
    uxtb16  r7, r8              ; pixels 0 and 2 widened to halfwords
    uxtb16  r10, r8, ror #8     ; pixels 1 and 3 widened to halfwords
    smlad   r5, r7, r7, r5      ; sse += d0*d0 + d2*d2

    ; 2nd 4 pixels (loads issued before the second smlad of the 1st group)
    ldr     r6, [r0, #0x4]      ; 4 src pixels
    ldr     r7, [r2, #0x4]      ; 4 ref pixels
    smlad   r5, r10, r10, r5    ; sse += d1*d1 + d3*d3

    ; The row's last loads are done, so both pointers can advance here.
    ; A plain add does not touch the GE flags that sel reads.
    usub8   r8, r6, r7          ; src - ref per byte
    add     r0, r0, r1          ; src_ptr -> next row
    sel     r10, r8, lr         ; positive differences
    usub8   r9, r7, r6          ; ref - src per byte
    add     r2, r2, r3          ; ref_ptr -> next row
    sel     r8, r9, lr          ; negative magnitudes

    ; partial sums
    usad8   r6, r10, lr         ; sum of the positive differences
    usad8   r7, r8, lr          ; sum of the negative magnitudes
    orr     r8, r8, r10         ; |src - ref| for all 4 pixels

    ; total sum
    add     r4, r4, r6
    sub     r4, r4, r7

    ; sse
    uxtb16  r7, r8              ; pixels 0 and 2 widened to halfwords
    uxtb16  r10, r8, ror #8     ; pixels 1 and 3 widened to halfwords
    smlad   r5, r7, r7, r5      ; sse += d0*d0 + d2*d2
    subs    r12, r12, #1        ; one row done; smlad below leaves N/Z/C/V alone
    smlad   r5, r10, r10, r5    ; sse += d1*d1 + d3*d3

    bne     loop8x8

    ; return stuff
    ldr     r8, [sp, #32]       ; address of sse (first stack argument)
    mul     r1, r4, r4          ; sum * sum
    str     r5, [r8]            ; *sse = sse
    sub     r0, r5, r1, ASR #6  ; return sse - ((sum * sum) >> 6)

    pop     {r4-r10, pc}

    ENDP
237
;-----------------------------------------------------------------------
; vpx_mse16x16_media
;
; Walks a 16x16 block of bytes, four pixels at a time, accumulating
;   sse = sum of (src - ref)^2
; then stores sse through the stack argument and also returns it.
;
; Same structure as vpx_variance16x16_media, but the signed sum of
; differences is not part of the result, so no usad8 partial sums are
; computed here.
;
; In:
;   r0    unsigned char *src_ptr
;   r1    int source_stride
;   r2    unsigned char *ref_ptr
;   r3    int  recon_stride
;   stack unsigned int *sse
; Out:
;   r0    sse
;   *sse  sse
;
; Register roles inside the loop:
;   r4  = sse           r12 = rows left     lr = constant zero
;   r5-r8 = scratch
;-----------------------------------------------------------------------
|vpx_mse16x16_media| PROC

    push    {r4-r8, lr}         ; 6 words pushed: sp stays 8-byte aligned,
                                ; and sse is at [sp, #24]

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r12, #16            ; rows left (= block height)
    mov     r4, #0              ; sse = 0
    mov     lr, #0              ; constant zero; only read inside the loop

loopmse
    ; 1st 4 pixels
    ldr     r5, [r0, #0x0]      ; 4 src pixels
    ldr     r6, [r2, #0x0]      ; 4 ref pixels

    pld     [r0, r1, lsl #1]    ; prefetch src two rows ahead
    pld     [r2, r3, lsl #1]    ; prefetch ref two rows ahead

    usub8   r8, r5, r6          ; src - ref per byte; GE set where src >= ref
    sel     r7, r8, lr          ; keep (src - ref) where src >= ref, else 0
    usub8   r8, r6, r5          ; ref - src per byte; GE set where ref >= src
    sel     r8, r8, lr          ; keep (ref - src) where ref >= src, else 0
    orr     r8, r8, r7          ; |src - ref| for all 4 pixels

    ldr     r5, [r0, #0x4]      ; next 4 src pixels, loaded early

    ; sse
    uxtb16  r6, r8              ; pixels 0 and 2 widened to halfwords
    uxtb16  r7, r8, ror #8      ; pixels 1 and 3 widened to halfwords
    smlad   r4, r6, r6, r4      ; sse += d0*d0 + d2*d2

    ; 2nd 4 pixels
    ldr     r6, [r2, #0x4]      ; 4 ref pixels
    smlad   r4, r7, r7, r4      ; sse += d1*d1 + d3*d3

    usub8   r8, r5, r6          ; src - ref per byte
    sel     r7, r8, lr          ; positive differences
    usub8   r8, r6, r5          ; ref - src per byte
    sel     r8, r8, lr          ; negative magnitudes
    orr     r8, r8, r7          ; |src - ref| for all 4 pixels

    ldr     r5, [r0, #0x8]      ; next 4 src pixels, loaded early

    ; sse
    uxtb16  r6, r8              ; pixels 0 and 2 widened to halfwords
    uxtb16  r7, r8, ror #8      ; pixels 1 and 3 widened to halfwords
    smlad   r4, r6, r6, r4      ; sse += d0*d0 + d2*d2

    ; 3rd 4 pixels
    ldr     r6, [r2, #0x8]      ; 4 ref pixels
    smlad   r4, r7, r7, r4      ; sse += d1*d1 + d3*d3

    usub8   r8, r5, r6          ; src - ref per byte
    sel     r7, r8, lr          ; positive differences
    usub8   r8, r6, r5          ; ref - src per byte
    sel     r8, r8, lr          ; negative magnitudes
    orr     r8, r8, r7          ; |src - ref| for all 4 pixels

    ldr     r5, [r0, #0xc]      ; next 4 src pixels, loaded early

    ; sse
    uxtb16  r6, r8              ; pixels 0 and 2 widened to halfwords
    uxtb16  r7, r8, ror #8      ; pixels 1 and 3 widened to halfwords
    smlad   r4, r6, r6, r4      ; sse += d0*d0 + d2*d2

    ; 4th 4 pixels
    ldr     r6, [r2, #0xc]      ; 4 ref pixels
    smlad   r4, r7, r7, r4      ; sse += d1*d1 + d3*d3

    ; The row's last loads are done, so both pointers can advance here.
    ; A plain add does not touch the GE flags that sel reads.
    usub8   r8, r5, r6          ; src - ref per byte
    add     r0, r0, r1          ; src_ptr -> next row
    sel     r7, r8, lr          ; positive differences
    usub8   r8, r6, r5          ; ref - src per byte
    add     r2, r2, r3          ; ref_ptr -> next row
    sel     r8, r8, lr          ; negative magnitudes
    orr     r8, r8, r7          ; |src - ref| for all 4 pixels

    subs    r12, r12, #1        ; one row done; uxtb16/smlad below leave
                                ; N/Z/C/V alone for the bne

    ; sse
    uxtb16  r6, r8              ; pixels 0 and 2 widened to halfwords
    uxtb16  r7, r8, ror #8      ; pixels 1 and 3 widened to halfwords
    smlad   r4, r6, r6, r4      ; sse += d0*d0 + d2*d2
    smlad   r4, r7, r7, r4      ; sse += d1*d1 + d3*d3

    bne     loopmse

    ; return stuff
    ldr     r1, [sp, #24]       ; address of sse (first stack argument)
    mov     r0, r4              ; return sse
    str     r4, [r1]            ; *sse = sse

    pop     {r4-r8, pc}

    ENDP
357
358    END
359