1;
2;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12    ; These functions are only valid when:
13    ; x_step_q4 == 16 (horiz) / y_step_q4 == 16 (vert); any other step branches to the C version
14    ; w%4 == 0
15    ; h%4 == 0
16    ; taps == 8
17    ; VP9_FILTER_WEIGHT == 128
18    ; VP9_FILTER_SHIFT == 7
19
20    EXPORT  |vp9_convolve8_avg_horiz_neon|
21    EXPORT  |vp9_convolve8_avg_vert_neon|
22    IMPORT  |vp9_convolve8_avg_horiz_c|
23    IMPORT  |vp9_convolve8_avg_vert_c|
24    ARM
25    REQUIRE8
26    PRESERVE8
27
28    AREA ||.text||, CODE, READONLY, ALIGN=2
29
30    ; 8-tap dot product against the eight s16 coefficients held in q0.
      ;   $acc = $px0*d0[0] + $px1*d0[1] + $px2*d0[2] + $px3*d0[3]
      ;        + $px4*d1[0] + $px5*d1[1] + $px6*d1[2] + $px7*d1[3]
      ; Each $pxN is a d register of four s16 lanes; $acc is a q register
      ; that receives four s32 sums. The leading vmull overwrites $acc, so
      ; the caller does not need to clear it first.
31    MACRO
32    MULTIPLY_BY_Q0 $acc, $px0, $px1, $px2, $px3, $px4, $px5, $px6, $px7
33    vmull.s16 $acc, $px0, d0[0]
34    vmlal.s16 $acc, $px1, d0[1]
35    vmlal.s16 $acc, $px2, d0[2]
36    vmlal.s16 $acc, $px3, d0[3]
37    vmlal.s16 $acc, $px4, d1[0]
38    vmlal.s16 $acc, $px5, d1[1]
39    vmlal.s16 $acc, $px6, d1[2]
40    vmlal.s16 $acc, $px7, d1[3]
41    MEND
42
; vp9_convolve8_avg_horiz_neon
;
; Applies the 8-tap filter at filter_x along each row of a w x h block,
; rounds and narrows the result to 8 bits, then stores the rounding average
; of that result and the byte already present in dst. The block is walked
; in 4x4 tiles: the outer loop steps 4 rows, the inner loop 4 columns.
;
; If x_step_q4 != 16 the routine branches (not a call) to
; vp9_convolve8_avg_horiz_c with all arguments untouched.
;
; NEON registers written: q0-q3 and q8-q15 (d8-d15 are not touched).
; Stack offsets below are relative to sp on entry.
;
43; r0    const uint8_t *src
44; r1    int src_stride
45; r2    uint8_t *dst
46; r3    int dst_stride
47; sp[0]  const int16_t *filter_x
48; sp[4]  int x_step_q4
49; sp[8]  const int16_t *filter_y ; unused
50; sp[12] int y_step_q4           ; unused
51; sp[16] int w
52; sp[20] int h
53
54|vp9_convolve8_avg_horiz_neon| PROC
      ; Only the unscaled case (step == 16) is handled here; anything else
      ; goes to the C version before any register or the stack is modified.
55    ldr             r12, [sp, #4]           ; x_step_q4
56    cmp             r12, #16
57    bne             vp9_convolve8_avg_horiz_c
58
      ; 8 registers = 32 bytes, so stack arguments are now at entry offset + 32.
59    push            {r4-r10, lr}
60
      ; The 8-tap window for an output pixel starts 3 pixels to its left.
61    sub             r0, r0, #3              ; adjust for taps
62
63    ldr             r5, [sp, #32]           ; filter_x
64    ldr             r6, [sp, #48]           ; w
65    ldr             r7, [sp, #52]           ; h
66
      ; q0 = the 8 coefficients: d0 = taps 0-3, d1 = taps 4-7.
67    vld1.s16        {q0}, [r5]              ; filter_x
68
      ; r8: post-increment for the 4th row load of a tile; steps back up
      ; three rows and 4 bytes to the right.
69    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
70    add             r8, r8, #4              ; -src_stride * 3 + 4
71
      ; r4: same idea for dst, used on the 4th row store of a tile.
72    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
73    add             r4, r4, #4              ; -dst_stride * 3 + 4
74
      ; r9 / r12: end-of-tile-row adjustments that move src / dst down four
      ; rows and back to the left edge. The extra -7 on src undoes the +4
      ; (via r8) and +3 added to r0 before the inner loop is entered.
75    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
76    sub             r9, r9, #7
77    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
78
      ; r6 is consumed by the inner loop; r10 keeps w to reload it.
79    mov             r10, r6                 ; w loop counter
80
      ; ---- outer loop: one band of 4 rows per iteration ----
81loop_horiz_v
      ; Load 8 bytes from each of the 4 rows (row 0..3 -> d24..d27).
82    vld1.8          {d24}, [r0], r1
83    vld1.8          {d25}, [r0], r1
84    vld1.8          {d26}, [r0], r1
85    vld1.8          {d27}, [r0], r8
86
      ; Transpose so each half of d24-d27 holds one column across the 4 rows.
87    vtrn.16         q12, q13
88    vtrn.8          d24, d25
89    vtrn.8          d26, d27
90
91    pld             [r0, r1, lsl #2]
92
      ; Widen u8 -> 16 bit, one column of 4 rows per d register.
93    vmovl.u8        q8, d24
94    vmovl.u8        q9, d25
95    vmovl.u8        q10, d26
96    vmovl.u8        q11, d27
97
      ; After these two moves the first 7 window columns live in
      ; d16, d17, d20, d22, d18, d19, d23 (in that order), which is the
      ; layout the inner loop expects. The 8th loaded column is dropped and
      ; is fetched again by the inner loop.
98    ; save a few instructions in the inner loop
99    vswp            d17, d18
100    vmov            d23, d21
101
      ; r0 now points at window column 7 of row 0.
102    add             r0, r0, #3
103
      ; ---- inner loop: one 4x4 tile per iteration ----
104loop_horiz
105    add             r5, r0, #64             ; prefetch base, 64 bytes ahead
106
      ; Load 4 new bytes from each row, replicated into both halves of the
      ; register. Note the destination order: rows 0,1,2,3 -> d28,d29,d31,d30.
107    vld1.32         {d28[]}, [r0], r1
108    vld1.32         {d29[]}, [r0], r1
109    vld1.32         {d31[]}, [r0], r1
110    vld1.32         {d30[]}, [r0], r8
111
112    pld             [r5]
113
      ; Transpose the 4x4 bytes into columns.
114    vtrn.16         d28, d31
115    vtrn.16         d29, d30
116    vtrn.8          d28, d29
117    vtrn.8          d31, d30
118
119    pld             [r5, r1]
120
      ; After widening, the four new columns are in d24, d26, d27, d25
      ; (oldest to newest).
121    ; extract to s16
122    vtrn.32         q14, q15
123    vmovl.u8        q12, d28
124    vmovl.u8        q13, d29
125
126    pld             [r5, r1, lsl #1]
127
      ; Existing dst pixels for the averaging step: rows 0 and 2 go to d6,
      ; rows 1 and 3 to d7, matching the layout the filtered result has
      ; after the transpose below.
128    ; slightly out of order load to match the existing data
129    vld1.u32        {d6[0]}, [r2], r3
130    vld1.u32        {d7[0]}, [r2], r3
131    vld1.u32        {d6[1]}, [r2], r3
132    vld1.u32        {d7[1]}, [r2], r3
133
134    sub             r2, r2, r3, lsl #2      ; reset for store
135
      ; One MULTIPLY_BY_Q0 per output column; each takes 8 consecutive
      ; window columns, sliding by one column per line.
136    ; src[] * filter_x
137    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
138    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
139    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
140    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
141
142    pld             [r5, -r8]
143
      ; Rounding shift right by 7 (adds 64 first), narrowing s32 -> u16
      ; with unsigned saturation.
144    ; += 64 >> 7
145    vqrshrun.s32    d2, q1, #7
146    vqrshrun.s32    d3, q2, #7
147    vqrshrun.s32    d4, q14, #7
148    vqrshrun.s32    d5, q15, #7
149
      ; Narrow u16 -> u8 with saturation: d2 = columns 0,1; d3 = columns 2,3.
150    ; saturate
151    vqmovn.u16      d2, q1
152    vqmovn.u16      d3, q2
153
      ; Back from column-major to row-major so each 32-bit lane is one
      ; output row of 4 pixels.
154    ; transpose
155    vtrn.16         d2, d3
156    vtrn.32         d2, d3
157    vtrn.8          d2, d3
158
      ; vrhadd computes (a + b + 1) >> 1 per byte.
159    ; average the new value and the dst value
160    vrhadd.u8       q1, q1, q3
161
      ; Store the 4 rows; the @32 qualifier asserts 4-byte alignment of
      ; each dst address. The last store uses r4 to return to row 0 of the
      ; next tile.
162    vst1.u32        {d2[0]}, [r2@32], r3
163    vst1.u32        {d3[0]}, [r2@32], r3
164    vst1.u32        {d2[1]}, [r2@32], r3
165    vst1.u32        {d3[1]}, [r2@32], r4
166
      ; Slide the column window left by 4: the last 7 of the 11 columns
      ; become the first 7 for the next tile. The order of these moves
      ; matters because q11 (d22:d23) is read before it is overwritten.
167    vmov            q8,  q9
168    vmov            d20, d23
169    vmov            q11, q12
170    vmov            q9,  q13
171
172    subs            r6, r6, #4              ; w -= 4
173    bgt             loop_horiz
174
175    ; outer loop
176    mov             r6, r10                 ; restore w counter
177    add             r0, r0, r9              ; src += src_stride * 4 - w
178    add             r2, r2, r12             ; dst += dst_stride * 4 - w
179    subs            r7, r7, #4              ; h -= 4
180    bgt loop_horiz_v
181
182    pop             {r4-r10, pc}
183
184    ENDP
185
; vp9_convolve8_avg_vert_neon
;
; Applies the 8-tap filter at filter_y down each column of a w x h block,
; rounds and narrows the result to 8 bits, then stores the rounding average
; of that result and the byte already present in dst. Takes the same
; argument list as the horizontal routine (r0 = src, r1 = src_stride,
; r2 = dst, r3 = dst_stride, remaining arguments on the stack).
; The block is walked in 4x4 tiles: the outer loop steps 4 columns, the
; inner loop 4 rows.
;
; If the stack word at entry offset 12 (y_step_q4) is not 16 the routine
; branches (not a call) to vp9_convolve8_avg_vert_c with all arguments
; untouched.
;
; NEON registers written: q0-q3 and q8-q15 (d8-d15 are not touched).
186|vp9_convolve8_avg_vert_neon| PROC
187    ldr             r12, [sp, #12]          ; y_step_q4
188    cmp             r12, #16
189    bne             vp9_convolve8_avg_vert_c
190
       ; 6 registers = 24 bytes, so stack arguments are now at entry
       ; offset + 24. No call is made after this point.
191    push            {r4-r8, lr}
192
       ; src -= 3 * src_stride: the 8-tap window starts 3 rows above the
       ; output row.
193    ; adjust for taps
194    sub             r0, r0, r1
195    sub             r0, r0, r1, lsl #1
196
197    ldr             r4, [sp, #32]           ; filter_y
198    ldr             r6, [sp, #40]           ; w
199    ldr             lr, [sp, #44]           ; h
200
       ; q0 = the 8 coefficients: d0 = taps 0-3, d1 = taps 4-7.
201    vld1.s16        {q0}, [r4]              ; filter_y
202
       ; Both strides are doubled because rows are accessed through two
       ; interleaved pointers (even rows / odd rows), each stepping 2 rows.
203    lsl             r1, r1, #1
204    lsl             r3, r3, #1
205
       ; ---- outer loop: one strip of 4 columns per iteration ----
206loop_vert_h
       ; r4 / r7 = src even / odd row pointers, r5 / r8 = dst even / odd row
       ; pointers. "asr #1" recovers the original (single) stride.
207    mov             r4, r0
208    add             r7, r0, r1, asr #1
209    mov             r5, r2
210    add             r8, r2, r3, asr #1
211    mov             r12, lr                 ; h loop counter
212
       ; Preload the first 7 window rows, 4 bytes each:
       ; rows 0,1 -> d16; rows 2,3 -> d18; rows 4,5 -> d20; row 6 -> d22[0].
213    vld1.u32        {d16[0]}, [r4], r1
214    vld1.u32        {d16[1]}, [r7], r1
215    vld1.u32        {d18[0]}, [r4], r1
216    vld1.u32        {d18[1]}, [r7], r1
217    vld1.u32        {d20[0]}, [r4], r1
218    vld1.u32        {d20[1]}, [r7], r1
219    vld1.u32        {d22[0]}, [r4], r1
220
       ; Widen u8 -> 16 bit: window rows 0..6 end up in d16..d22, one row of
       ; 4 pixels per d register. d23 is not used before being overwritten.
221    vmovl.u8        q8, d16
222    vmovl.u8        q9, d18
223    vmovl.u8        q10, d20
224    vmovl.u8        q11, d22
225
       ; ---- inner loop: one 4x4 tile per iteration ----
226loop_vert
       ; Load window rows 7..10. The lane choice is deliberate: after
       ; widening, rows 7, 8, 9, 10 sit in d24, d26, d27, d25.
227    ; always process a 4x4 block at a time
228    vld1.u32        {d24[0]}, [r7], r1
229    vld1.u32        {d26[0]}, [r4], r1
230    vld1.u32        {d26[1]}, [r7], r1
231    vld1.u32        {d24[1]}, [r4], r1
232
233    ; extract to s16
234    vmovl.u8        q12, d24
235    vmovl.u8        q13, d26
236
       ; Existing dst pixels for the averaging step: rows 0,1 -> d6 and
       ; rows 2,3 -> d7. The @32 qualifier asserts 4-byte alignment of each
       ; dst address.
237    vld1.u32        {d6[0]}, [r5@32], r3
238    vld1.u32        {d6[1]}, [r8@32], r3
239    vld1.u32        {d7[0]}, [r5@32], r3
240    vld1.u32        {d7[1]}, [r8@32], r3
241
242    pld             [r7]
243    pld             [r4]
244
       ; One MULTIPLY_BY_Q0 per output row; each takes 8 consecutive window
       ; rows, sliding by one row per invocation. Prefetches are interleaved
       ; between them.
245    ; src[] * filter_y
246    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
247
248    pld             [r7, r1]
249    pld             [r4, r1]
250
251    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
252
253    pld             [r5]
254    pld             [r8]
255
256    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
257
258    pld             [r5, r3]
259    pld             [r8, r3]
260
261    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
262
       ; Rounding shift right by 7 (adds 64 first), narrowing s32 -> u16
       ; with unsigned saturation.
263    ; += 64 >> 7
264    vqrshrun.s32    d2, q1, #7
265    vqrshrun.s32    d3, q2, #7
266    vqrshrun.s32    d4, q14, #7
267    vqrshrun.s32    d5, q15, #7
268
       ; Narrow u16 -> u8 with saturation: d2 = rows 0,1; d3 = rows 2,3,
       ; the same layout as the dst pixels in q3.
269    ; saturate
270    vqmovn.u16      d2, q1
271    vqmovn.u16      d3, q2
272
       ; vrhadd computes (a + b + 1) >> 1 per byte.
273    ; average the new value and the dst value
274    vrhadd.u8       q1, q1, q3
275
       ; Each dst pointer advanced twice by r3 during the loads above.
276    sub             r5, r5, r3, lsl #1      ; reset for store
277    sub             r8, r8, r3, lsl #1
278
279    vst1.u32        {d2[0]}, [r5@32], r3
280    vst1.u32        {d2[1]}, [r8@32], r3
281    vst1.u32        {d3[0]}, [r5@32], r3
282    vst1.u32        {d3[1]}, [r8@32], r3
283
       ; Slide the row window down by 4: window rows 4..10 become rows 0..6
       ; (d16..d22) for the next tile.
284    vmov            q8, q10
285    vmov            d18, d22
286    vmov            d19, d24
287    vmov            q10, q13
288    vmov            d22, d25
289
290    subs            r12, r12, #4            ; h -= 4
291    bgt             loop_vert
292
       ; Move to the next strip of 4 columns, back at the top row.
293    ; outer loop
294    add             r0, r0, #4
295    add             r2, r2, #4
296    subs            r6, r6, #4              ; w -= 4
297    bgt             loop_vert_h
298
299    pop             {r4-r8, pc}
300
301    ENDP
302    END
303