@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@*  ih264_inter_pred_chroma_a9q.s
@*
@* @brief
@*  Contains function definitions for inter prediction interpolation.
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*
@*  - ih264_inter_pred_chroma_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_inter_pred_filters.c
@

@**
@**
@**
@
@**
@*******************************************************************************
@*
@* @brief
@*    Inter prediction chroma filter
@*
@* @par Description:
@*   Applies filtering to chroma samples as mentioned in
@*    sec 8.4.2.2.2 titled "chroma sample interpolation process"
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source containing alternate U and V samples
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] u1_dx
@*  dx value where the sample is to be produced (refer sec 8.4.2.2.2)
@*
@* @param[in] u1_dy
@*  dy value where the sample is to be produced (refer sec 8.4.2.2.2)
@*
@* @param[in] ht
@*  integer height of the array
@*
@* @param[in] wd
@*  integer width of the array
@*
@* @returns
@*  None
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@void ih264_inter_pred_chroma(UWORD8 *pu1_src,
@                             UWORD8 *pu1_dst,
@                             WORD32 src_strd,
@                             WORD32 dst_strd,
@                             UWORD8 u1_dx,
@                             UWORD8 u1_dy,
@                             WORD32 ht,
@                             WORD32 wd)
@
@ Bilinear chroma interpolation over interleaved U/V bytes. With s[y][x]
@ the source byte at row y, byte offset x, every destination byte is
@
@   (A*s[y][x] + B*s[y][x+2] + C*s[y+1][x] + D*s[y+1][x+2] + 32) >> 6
@
@ saturated to 8 bits, where
@   A = (8-dx)*(8-dy)   B = dx*(8-dy)   C = (8-dx)*dy   D = dx*dy
@ The horizontal neighbour is 2 bytes ahead because U and V alternate.
@
@ Width dispatch: wd == 2 -> loop_2 (4 bytes/row), wd == 4 -> loop_4
@ (8 bytes/row), anything else -> loop_8 (16 bytes/row).
@ Each path reads ht + 1 source rows, and each row load fetches 8 bytes
@ more than the number of bytes stored per row.
@
@ NOTE(review): there is no guard on ht. loop_4/loop_8 decrement ht once
@ before their first "subs/bne", and loop_2 consumes two rows per pass, so
@ the code as written expects ht >= 2 (and ht even when wd == 2).
@ Confirm that all callers honour this.
@
@**************Variables Vs Registers*****************************************
@   r0  => *pu1_src  (post-incremented by src_strd on each row load)
@   r1  => *pu1_dst  (post-incremented by dst_strd on each row store)
@   r2  =>  src_strd
@   r3  =>  dst_strd
@   r4  =>  u1_dx    (from stack)
@   r5  =>  u1_dy    (from stack)
@   r6  =>  height   (from stack, used as the row counter)
@   r7  =>  width    (from stack)
@   r8  =>  8 - u1_dx
@   r9  =>  8 - u1_dy
@   r10, r11 => scratch for the weight products
@   d28 =>  A = (8-dx)*(8-dy)   weight of s[y][x]
@   d29 =>  B = dx*(8-dy)       weight of s[y][x+2]
@   d30 =>  C = (8-dx)*dy       weight of s[y+1][x]
@   d31 =>  D = dx*dy           weight of s[y+1][x+2]
@
.text
.p2align 2

    .global ih264_inter_pred_chroma_a9q

ih264_inter_pred_chroma_a9q:

    stmfd         sp!, {r4-r12, r14}    @ save 10 core registers (40 bytes)
    vstmdb        sp!, {d8-d15}         @ save d8-d15 (64 bytes)
    @ 40 + 64 = 104 bytes pushed, so the stacked arguments start at sp + 104
    ldr           r4, [sp, #104]        @ u1_dx
    ldr           r5, [sp, #108]        @ u1_dy
    ldr           r6, [sp, #112]        @ ht
    ldr           r7, [sp, #116]        @ wd

    rsb           r8, r4, #8            @ 8 - u1_dx
    rsb           r9, r5, #8            @ 8 - u1_dy
    mul           r10, r8, r9           @ A = (8-dx)*(8-dy)
    mul           r11, r4, r9           @ B = dx*(8-dy)

    vdup.u8       d28, r10              @ broadcast A to all 8 lanes
    vdup.u8       d29, r11              @ broadcast B to all 8 lanes

    mul           r10, r8, r5           @ C = (8-dx)*dy
    mul           r11, r4, r5           @ D = dx*dy

    vdup.u8       d30, r10              @ broadcast C to all 8 lanes
    vdup.u8       d31, r11              @ broadcast D to all 8 lanes

    cmp           r7, #2                @ if wd == 2 branch to loop_2
    beq           loop_2
    cmp           r7, #4                @ if wd == 4 branch to loop_4
    beq           loop_4
                                        @ otherwise fall through to loop_8

@ ---------------------------------------------------------------------------
@ 16 output bytes per row. Software pipelined: on entry to inner_loop_8,
@ q5 already holds the weighted sum for the low 8 bytes of the pending
@ output row; the loop body computes the high 8 bytes (q6), stores the row,
@ loads the next source row and pre-computes q5 for the following row.
@   row y   : d0 = bytes 0-7, d1 = bytes 8-15, d3 = bytes 2-9, d4 = bytes 10-17
@   row y+1 : d5 = bytes 0-7, d6 = bytes 8-15, d8 = bytes 2-9, d9 = bytes 10-17
@ ---------------------------------------------------------------------------
loop_8:
    sub           r6, #1                @ last row is emitted after the loop
    vld1.8        {d0, d1, d2}, [r0], r2 @ Load row0 (24 bytes)
    vld1.8        {d5, d6, d7}, [r0], r2 @ Load row1 (24 bytes)
    vext.8        d3, d0, d1, #2        @ row0 bytes 2-9
    vext.8        d8, d5, d6, #2        @ row1 bytes 2-9

    vmull.u8      q5, d0, d28           @ low half: A * s[y][x]
    vmlal.u8      q5, d5, d30           @         + C * s[y+1][x]
    vmlal.u8      q5, d3, d29           @         + B * s[y][x+2]
    vmlal.u8      q5, d8, d31           @         + D * s[y+1][x+2]
    vext.8        d9, d6, d7, #2        @ row1 bytes 10-17
    vext.8        d4, d1, d2, #2        @ row0 bytes 10-17

inner_loop_8:
    vmull.u8      q6, d6, d30           @ high half: C * s[y+1][x]
    vmlal.u8      q6, d1, d28           @          + A * s[y][x]
    vmlal.u8      q6, d9, d31           @          + D * s[y+1][x+2]
    vmlal.u8      q6, d4, d29           @          + B * s[y][x+2]
    vmov          d0, d5                @ current row1 becomes next row0 (low)
    vmov          d3, d8

    vqrshrun.s16  d14, q5, #6           @ (sum + 32) >> 6, saturate to u8
    vmov          d1, d6                @ current row1 becomes next row0 (high)
    vmov          d4, d9

    vld1.8        {d5, d6, d7}, [r0], r2 @ Load the next row1
    vqrshrun.s16  d15, q6, #6           @ (sum + 32) >> 6, saturate to u8

    vext.8        d8, d5, d6, #2        @ new row1 bytes 2-9
    subs          r6, #1                @ flags consumed by the bne below
    vext.8        d9, d6, d7, #2        @ new row1 bytes 10-17
    vst1.8        {q7}, [r1], r3        @ Store dest row (d14:d15, 16 bytes)

    vmull.u8      q5, d0, d28           @ low half of the next output row
    vmlal.u8      q5, d5, d30
    vmlal.u8      q5, d3, d29
    vmlal.u8      q5, d8, d31
    bne           inner_loop_8

    @ Epilogue: q5 is ready, finish the high half of the final row
    vmull.u8      q6, d6, d30
    vmlal.u8      q6, d1, d28
    vmlal.u8      q6, d9, d31
    vmlal.u8      q6, d4, d29

    vqrshrun.s16  d14, q5, #6
    vqrshrun.s16  d15, q6, #6

    vst1.8        {q7}, [r1], r3        @ Store final dest row

    b             end_func

@ ---------------------------------------------------------------------------
@ 8 output bytes per row. Same pipelining as loop_8 with a single
@ accumulator: q2 holds the weighted sum of the pending output row.
@   row y   : d0 = bytes 0-7, d1 = bytes 2-9
@   row y+1 : d2 = bytes 0-7, d3 = bytes 2-9
@ ---------------------------------------------------------------------------
loop_4:
    sub           r6, #1                @ last row is emitted after the loop
    vld1.8        {d0, d1}, [r0], r2    @ Load row0 (16 bytes)
    vld1.8        {d2, d3}, [r0], r2    @ Load row1 (16 bytes)
    vext.8        d1, d0, d1, #2        @ row0 bytes 2-9
    vext.8        d3, d2, d3, #2        @ row1 bytes 2-9

    vmull.u8      q2, d2, d30           @ C * s[y+1][x]
    vmlal.u8      q2, d0, d28           @ + A * s[y][x]
    vmlal.u8      q2, d3, d31           @ + D * s[y+1][x+2]
    vmlal.u8      q2, d1, d29           @ + B * s[y][x+2]

inner_loop_4:
    subs          r6, #1                @ flags consumed by the bne below
    vmov          d0, d2                @ current row1 becomes next row0
    vmov          d1, d3

    vld1.8        {d2, d3}, [r0], r2    @ Load the next row1
    vqrshrun.s16  d6, q2, #6            @ (sum + 32) >> 6, saturate to u8

    vext.8        d3, d2, d3, #2        @ new row1 bytes 2-9
    vst1.8        {d6}, [r1], r3        @ Store dest row (8 bytes)

    vmull.u8      q2, d0, d28           @ weighted sum of the next output row
    vmlal.u8      q2, d2, d30
    vmlal.u8      q2, d1, d29
    vmlal.u8      q2, d3, d31
    bne           inner_loop_4

    vqrshrun.s16  d6, q2, #6
    vst1.8        {d6}, [r1], r3        @ Store final dest row

    b             end_func

@ ---------------------------------------------------------------------------
@ 4 output bytes per row, two output rows per pass. Three source rows are
@ read per pass; the third is loaded without advancing r0, so it is
@ re-read as row0 of the next pass.
@ ---------------------------------------------------------------------------
loop_2:
    vld1.8        {d0}, [r0], r2        @ Load row0
    vext.8        d1, d0, d0, #2        @ row0 shifted by 2 bytes
    vld1.8        {d2}, [r0], r2        @ Load row1
    vext.8        d3, d2, d2, #2        @ row1 shifted by 2 bytes
    vmull.u8      q2, d0, d28           @ output row0 = A*row0 + B*row0' +
    vmlal.u8      q2, d1, d29           @               C*row1 + D*row1'
    vmlal.u8      q2, d2, d30
    vmlal.u8      q2, d3, d31
    vld1.8        {d6}, [r0]            @ Load row2 (r0 not advanced)
    vqrshrun.s16  d4, q2, #6            @ (sum + 32) >> 6, saturate to u8
    vext.8        d7, d6, d6, #2        @ row2 shifted by 2 bytes
    vst1.32       d4[0], [r1], r3       @ Store dest row0 (low 4 bytes only)
    vmull.u8      q4, d2, d28           @ output row1 = A*row1 + B*row1' +
    vmlal.u8      q4, d3, d29           @               C*row2 + D*row2'
    vmlal.u8      q4, d6, d30
    vmlal.u8      q4, d7, d31
    subs          r6, #2                @ two rows produced this pass
    vqrshrun.s16  d8, q4, #6            @ does not alter the flags set above
    vst1.32       d8[0], [r1], r3       @ Store dest row1 (low 4 bytes only)
    bne           loop_2                @ repeat while rows remain

end_func:
    vldmia        sp!, {d8-d15}         @ Restore neon registers that were saved
    ldmfd         sp!, {r4-r12, pc}     @ Restore core registers and return
252
253