1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12    EXPORT  |vp8_yv12_copy_src_frame_func_neon|
13    ARM
14    REQUIRE8
15    PRESERVE8
16
17    INCLUDE vpx_scale_asm_offsets.asm
18
19    AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; void vp8_yv12_copy_src_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc,
;                                        YV12_BUFFER_CONFIG *dst_ybc);
;
; Note: This function is used to copy source data in src_buffer[i] at the
; beginning of the encoding. The buffer has a width and height of
; cpi->oxcf.Width and cpi->oxcf.Height, which can be ANY numbers (NOT always
; multiples of 16 or 4).
;
; Copies the Y plane (y_width x y_height bytes) and then the U and V planes
; (uv_width x uv_height bytes each) from src_ybc to dst_ybc. Plane
; dimensions and the source stride are read from src_ybc; only the buffer
; pointers and strides are read from dst_ybc.
;
; In:     r0 = src_ybc, r1 = dst_ybc
; Out:    none
; Saved:  r4-r11 and d8-d15 are pushed on entry and restored on exit
; Clobb:  r2, r3, r12, q0-q3, q8-q15, flags
;
; Register roles inside the copy loops:
;   r2 / r3    src / dst pointer for the first row of a pair (or last row)
;   r10 / r11  src / dst pointer for the second row of a pair
;   r4, r5     plane height, plane width
;   r6 / r7    2 * stride - width for src / dst: step from the end of one
;              row to the start of the row two lines below
;   r8         byte scratch for the tail loop
;   r9         chroma planes still to copy
;   r12        bytes left in the current row
;   lr         row pairs left in the current plane
;
; Every vector loop is entered only when at least one full chunk (128/64 or
; 8 bytes) remains, and the row-pair loop only when height >= 2. The byte
; counter therefore never wraps below zero, which matters because the loop
; conditions (HS/LO) are unsigned comparisons. Exactly width bytes are
; copied per row.
;-----------------------------------------------------------------------
|vp8_yv12_copy_src_frame_func_neon| PROC
    push            {r4 - r11, lr}
    vpush           {d8 - d15}              ;q4-q7 are used as copy registers

    ;Copy Y plane
    ldr             r4, [r0, #yv12_buffer_config_y_height]
    ldr             r5, [r0, #yv12_buffer_config_y_width]
    ldr             r6, [r0, #yv12_buffer_config_y_stride]
    ldr             r7, [r1, #yv12_buffer_config_y_stride]
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1

    add             r10, r2, r6             ;second row src
    add             r11, r3, r7             ;second row dst
    mov             r6, r6, lsl #1
    mov             r7, r7, lsl #1
    sub             r6, r6, r5              ;adjust stride
    sub             r7, r7, r5

    ; copy two rows at one time
    movs            lr, r4, lsr #1          ;lr = number of row pairs
    beq             cp_last_row             ;height < 2: no complete pair

cp_src_to_dst_height_loop
    mov             r12, r5                 ;r12 = bytes left in this row pair
    cmp             r12, #128
    blo             cp_width_8_check        ;fewer than 128 bytes: skip wide loop

cp_width_128_loop
    vld1.8          {q0, q1}, [r2]!
    vld1.8          {q4, q5}, [r10]!
    vld1.8          {q2, q3}, [r2]!
    vld1.8          {q6, q7}, [r10]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q12, q13}, [r10]!
    vld1.8          {q10, q11}, [r2]!
    vld1.8          {q14, q15}, [r10]!
    sub             r12, r12, #128
    cmp             r12, #128
    vst1.8          {q0, q1}, [r3]!
    vst1.8          {q4, q5}, [r11]!
    vst1.8          {q2, q3}, [r3]!
    vst1.8          {q6, q7}, [r11]!
    vst1.8          {q8, q9}, [r3]!
    vst1.8          {q12, q13}, [r11]!
    vst1.8          {q10, q11}, [r3]!
    vst1.8          {q14, q15}, [r11]!
    bhs             cp_width_128_loop

cp_width_8_check
    cmp             r12, #8
    blo             cp_width_1_check        ;fewer than 8 bytes: byte loop only

cp_width_8_loop
    vld1.8          {d0}, [r2]!
    vld1.8          {d1}, [r10]!
    sub             r12, r12, #8
    cmp             r12, #8
    vst1.8          {d0}, [r3]!
    vst1.8          {d1}, [r11]!
    bhs             cp_width_8_loop

cp_width_1_check
    cmp             r12, #0
    beq             cp_width_done

cp_width_1_loop
    ldrb            r8, [r2], #1
    subs            r12, r12, #1            ;flags survive the byte moves below
    strb            r8, [r3], #1
    ldrb            r8, [r10], #1
    strb            r8, [r11], #1
    bne             cp_width_1_loop

cp_width_done
    subs            lr, lr, #1
    add             r2, r2, r6              ;advance all four pointers two rows
    add             r3, r3, r7
    add             r10, r10, r6
    add             r11, r11, r7
    bne             cp_src_to_dst_height_loop

cp_last_row
;copy last line for Y if y_height is odd
    tst             r4, #1
    beq             cp_width_done_1
    mov             r12, r5
    cmp             r12, #128
    blo             cp_width_8_check_1

cp_width_128_loop_1
    vld1.8          {q0, q1}, [r2]!
    vld1.8          {q2, q3}, [r2]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q10, q11}, [r2]!
    sub             r12, r12, #128
    cmp             r12, #128
    vst1.8          {q0, q1}, [r3]!
    vst1.8          {q2, q3}, [r3]!
    vst1.8          {q8, q9}, [r3]!
    vst1.8          {q10, q11}, [r3]!
    bhs             cp_width_128_loop_1

cp_width_8_check_1
    cmp             r12, #8
    blo             cp_width_1_check_1

cp_width_8_loop_1
    vld1.8          {d0}, [r2]!
    sub             r12, r12, #8
    cmp             r12, #8
    vst1.8          {d0}, [r3]!
    bhs             cp_width_8_loop_1

cp_width_1_check_1
    cmp             r12, #0
    beq             cp_width_done_1

cp_width_1_loop_1
    ldrb            r8, [r2], #1
    subs            r12, r12, #1
    strb            r8, [r3], #1
    bne             cp_width_1_loop_1
cp_width_done_1

;Copy U & V planes
    ldr             r4, [r0, #yv12_buffer_config_uv_height]
    ldr             r5, [r0, #yv12_buffer_config_uv_width]
    ldr             r6, [r0, #yv12_buffer_config_uv_stride]
    ldr             r7, [r1, #yv12_buffer_config_uv_stride]
    ldr             r2, [r0, #yv12_buffer_config_u_buffer]       ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_u_buffer]       ;dstptr1

    add             r10, r2, r6             ;second row src
    add             r11, r3, r7             ;second row dst
    mov             r6, r6, lsl #1
    mov             r7, r7, lsl #1
    sub             r6, r6, r5              ;adjust stride
    sub             r7, r7, r5

    mov             r9, #2                  ;two chroma planes: U, then V

cp_uv_loop
    ;copy two rows at one time
    movs            lr, r4, lsr #1          ;lr = number of row pairs
    beq             cp_uv_last_row          ;height < 2: no complete pair

cp_src_to_dst_height_uv_loop
    mov             r12, r5                 ;r12 = bytes left in this row pair
    cmp             r12, #64
    blo             cp_width_uv_8_check     ;fewer than 64 bytes: skip wide loop

cp_width_uv_64_loop
    vld1.8          {q0, q1}, [r2]!
    vld1.8          {q4, q5}, [r10]!
    vld1.8          {q2, q3}, [r2]!
    vld1.8          {q6, q7}, [r10]!
    sub             r12, r12, #64
    cmp             r12, #64
    vst1.8          {q0, q1}, [r3]!
    vst1.8          {q4, q5}, [r11]!
    vst1.8          {q2, q3}, [r3]!
    vst1.8          {q6, q7}, [r11]!
    bhs             cp_width_uv_64_loop

cp_width_uv_8_check
    cmp             r12, #8
    blo             cp_width_uv_1_check     ;fewer than 8 bytes: byte loop only

cp_width_uv_8_loop
    vld1.8          {d0}, [r2]!
    vld1.8          {d1}, [r10]!
    sub             r12, r12, #8
    cmp             r12, #8
    vst1.8          {d0}, [r3]!
    vst1.8          {d1}, [r11]!
    bhs             cp_width_uv_8_loop

cp_width_uv_1_check
    cmp             r12, #0
    beq             cp_width_uv_done

cp_width_uv_1_loop
    ldrb            r8, [r2], #1
    subs            r12, r12, #1            ;flags survive the byte moves below
    strb            r8, [r3], #1
    ldrb            r8, [r10], #1
    strb            r8, [r11], #1
    bne             cp_width_uv_1_loop

cp_width_uv_done
    subs            lr, lr, #1
    add             r2, r2, r6              ;advance all four pointers two rows
    add             r3, r3, r7
    add             r10, r10, r6
    add             r11, r11, r7
    bne             cp_src_to_dst_height_uv_loop

cp_uv_last_row
;copy last line for U & V if uv_height is odd
    tst             r4, #1
    beq             cp_width_uv_done_1
    mov             r12, r5
    cmp             r12, #64
    blo             cp_width_uv_8_check_1

cp_width_uv_64_loop_1
    vld1.8          {q0, q1}, [r2]!
    vld1.8          {q2, q3}, [r2]!
    sub             r12, r12, #64
    cmp             r12, #64
    vst1.8          {q0, q1}, [r3]!
    vst1.8          {q2, q3}, [r3]!
    bhs             cp_width_uv_64_loop_1

cp_width_uv_8_check_1
    cmp             r12, #8
    blo             cp_width_uv_1_check_1

cp_width_uv_8_loop_1
    vld1.8          {d0}, [r2]!
    sub             r12, r12, #8
    cmp             r12, #8
    vst1.8          {d0}, [r3]!
    bhs             cp_width_uv_8_loop_1

cp_width_uv_1_check_1
    cmp             r12, #0
    beq             cp_width_uv_done_1

cp_width_uv_1_loop_1
    ldrb            r8, [r2], #1
    subs            r12, r12, #1
    strb            r8, [r3], #1
    bne             cp_width_uv_1_loop_1
cp_width_uv_done_1

    ;Switch to the V plane after U. r4-r7 (height, width, adjusted strides)
    ;are shared by both chroma planes and stay as they are; only the four
    ;row pointers are rebuilt. The conditional instructions do not write the
    ;flags, so the final bne still sees the result of the subs.
    subs            r9, r9, #1
    ldrne           r2, [r0, #yv12_buffer_config_v_buffer]      ;srcptr1
    ldrne           r3, [r1, #yv12_buffer_config_v_buffer]      ;dstptr1
    ldrne           r10, [r0, #yv12_buffer_config_uv_stride]
    ldrne           r11, [r1, #yv12_buffer_config_uv_stride]

    addne           r10, r2, r10                ;second row src
    addne           r11, r3, r11                ;second row dst

    bne             cp_uv_loop

    vpop            {d8 - d15}
    pop             {r4 - r11, pc}

    ENDP
259    END
260