1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20@**
21@******************************************************************************
22@* @file
23@*  ih264_inter_pred_luma_vert_qpel_a9q.s
24@*
25@* @brief
26@*  Contains function definitions for inter prediction vertical quarter pel interpolation.
27@*
28@* @author
29@*  Mohit
30@*
31@* @par List of Functions:
32@*
33@*  - ih264_inter_pred_luma_vert_qpel_a9q()
34@*
35@* @remarks
36@*  None
37@*
38@*******************************************************************************
39@*
40
41@* All the functions here are replicated from ih264_inter_pred_filters.c
42@
43
44@*******************************************************************************
45@*
46@* @brief
47@*     Quarter pel interprediction luma filter for vertical input
48@*
49@* @par Description:
@* Applies a 6 tap vertical filter. The output is clipped to 8 bits and then
@* averaged (with rounding) with the source row selected by dydx.
51@* sec 8.4.2.2.1 titled "Luma sample interpolation process"
52@*
53@* @param[in] pu1_src
54@*  UWORD8 pointer to the source
55@*
56@* @param[out] pu1_dst
57@*  UWORD8 pointer to the destination
58@*
59@* @param[in] src_strd
60@*  integer source stride
61@*
62@* @param[in] dst_strd
63@*  integer destination stride
64@*
65@* @param[in] ht
66@*  integer height of the array
67@*
68@* @param[in] wd
69@*  integer width of the array
70@*
71@* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
72@*
73@* @param[in] dydx: x and y reference offset for qpel calculations.
74@* @returns
75@*
@* @remarks
77@*  None
78@*
79@*******************************************************************************
80@*
81
@void ih264_inter_pred_luma_vert_qpel_a9q (
83@                            UWORD8 *pu1_src,
84@                            UWORD8 *pu1_dst,
85@                            WORD32 src_strd,
86@                            WORD32 dst_strd,
87@                            WORD32 ht,
88@                            WORD32 wd,
89@                            UWORD8* pu1_tmp,
90@                            UWORD32 dydx)
91
92@**************Variables Vs Registers*****************************************
93@   r0 => *pu1_src
94@   r1 => *pu1_dst
95@   r2 =>  src_strd
96@   r3 =>  dst_strd
97@   r5 =>  ht
98@   r6 =>  wd
@   r7 =>  dydx on load; afterwards pointer to the source row that is
@          averaged with the filtered output
100
.text
.p2align 2

    .global ih264_inter_pred_luma_vert_qpel_a9q

@ Entry: saves registers, fetches the stacked arguments, prepares the filter
@ constants and the two source pointers, then dispatches on wd.
@   In : r0 = pu1_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd,
@        ht / wd / pu1_tmp / dydx on the caller's stack
@   Out: r0 = pu1_src - 2*src_strd          (first row read by the 6-tap filter)
@        r5 = ht, r6 = wd
@        r7 = pu1_src + ((dydx & 12) >> 3) * src_strd   (row used for averaging)
@        q11 = 20 and q12 = 5 in every 16-bit lane
ih264_inter_pred_luma_vert_qpel_a9q:

    push          {r4-r12, lr}          @ 10 core registers  = 40 bytes
    vpush         {d8-d15}              @ 8 NEON d-registers = 64 bytes

    @ sp has moved down by 104 bytes, so the stacked arguments start at sp+104
    ldr           r5, [sp, #104]        @ r5 = ht
    ldr           r6, [sp, #108]        @ r6 = wd
    ldr           r7, [sp, #116]        @ r7 = dydx (sp+112, pu1_tmp, is never read)

    vmov.u16      q11, #20              @ centre-tap weight in every lane
    vmov.u16      q12, #5               @ inner-tap weight in every lane

    ubfx          r7, r7, #3, #1        @ r7 = (dydx & 12) >> 3, i.e. bit 3 of dydx
    mla           r7, r2, r7, r0        @ r7 = pu1_src + r7 * src_strd
    sub           r0, r0, r2, lsl #1    @ r0 = pu1_src - 2*src_strd

    subs          r12, r6, #8           @ wd == 8 ?
    beq           loop_8
    subs          r12, r6, #4           @ wd == 4 ?
    beq           loop_4
                                        @ any other wd falls through to loop_16
126
loop_16:                                @ wd == 16: each pass writes 4 rows of 16 pixels
    @ Rn is the n-th row read through r0 in this pass (R0 is the row r0 points
    @ at on entry). For output row k:
    @     acc = (R[k] + R[k+5]) + 20*(R[k+2] + R[k+3]) - 5*(R[k+1] + R[k+4])
    @     out = rounding_average(sat_u8((acc + 16) >> 5), row read through r7)
    @ ".lo" is columns 0-7 (even d registers), ".hi" is columns 8-15 (odd ones).
    @ q11 = 20 and q12 = 5 in every 16-bit lane.

    vld1.u32      {q0}, [r0], r2        @ q0 = R0
    vld1.u32      {q1}, [r0], r2        @ q1 = R1
    vld1.u32      {q2}, [r0], r2        @ q2 = R2
    vld1.u32      {q3}, [r0], r2        @ q3 = R3
    vld1.u32      {q4}, [r0], r2        @ q4 = R4
    vaddl.u8      q6, d4, d6            @ out0.lo: R2 + R3
    vld1.u32      {q5}, [r0], r2        @ q5 = R5
    vaddl.u8      q7, d0, d10           @ out0.lo: acc = R0 + R5
    vaddl.u8      q8, d2, d8            @ out0.lo: R1 + R4
    vmla.u16      q7, q6, q11           @ out0.lo: acc += 20 * (R2 + R3)
    vaddl.u8      q10, d1, d11          @ out0.hi: acc = R0 + R5
    vaddl.u8      q9, d5, d7            @ out0.hi: R2 + R3
    vmla.u16      q10, q9, q11          @ out0.hi: acc += 20 * (R2 + R3)
    vld1.u32      {q0}, [r0], r2        @ q0 = R6 (R0 is no longer needed)
    vaddl.u8      q13, d3, d9           @ out0.hi: R1 + R4
    vaddl.u8      q6, d6, d8            @ out1.lo: R3 + R4
    vmls.u16      q7, q8, q12           @ out0.lo: acc -= 5 * (R1 + R4)
    vaddl.u8      q8, d2, d0            @ out1.lo: acc = R1 + R6
    vaddl.u8      q9, d4, d10           @ out1.lo: R2 + R5
    vmla.u16      q8, q6, q11           @ out1.lo: acc += 20 * (R3 + R4)
    vmls.u16      q10, q13, q12         @ out0.hi: acc -= 5 * (R1 + R4)
    vaddl.u8      q13, d5, d11          @ out1.hi: R2 + R5
    vaddl.u8      q6, d7, d9            @ out1.hi: R3 + R4
    vqrshrun.s16  d30, q7, #5           @ out0.lo = sat_u8((acc + 16) >> 5)
    vaddl.u8      q7, d3, d1            @ out1.hi: acc = R1 + R6
    vld1.u32      {q1}, [r0], r2        @ q1 = R7 (R1 is no longer needed)
    vmla.u16      q7, q6, q11           @ out1.hi: acc += 20 * (R3 + R4)
    vmls.u16      q8, q9, q12           @ out1.lo: acc -= 5 * (R2 + R5)
    vqrshrun.s16  d31, q10, #5          @ out0.hi = sat_u8((acc + 16) >> 5)
    vld1.u32      {q10}, [r7], r2       @ averaging row for out0
    vrhadd.u8     q15, q10, q15         @ out0 = (out0 + averaging row + 1) >> 1
    vaddl.u8      q9, d4, d2            @ out2.lo: acc = R2 + R7
    vaddl.u8      q6, d8, d10           @ out2.lo: R4 + R5

    vst1.u32      {q15}, [r1], r3       @ store out0
    vmla.u16      q9, q6, q11           @ out2.lo: acc += 20 * (R4 + R5)
    vaddl.u8      q10, d6, d0           @ out2.lo: R3 + R6
    vmls.u16      q7, q13, q12          @ out1.hi: acc -= 5 * (R2 + R5)
    vqrshrun.s16  d30, q8, #5           @ out1.lo = sat_u8((acc + 16) >> 5)
    vaddl.u8      q6, d9, d11           @ out2.hi: R4 + R5
    vaddl.u8      q8, d5, d3            @ out2.hi: acc = R2 + R7
    vaddl.u8      q13, d7, d1           @ out2.hi: R3 + R6
    vmla.u16      q8, q6, q11           @ out2.hi: acc += 20 * (R4 + R5)
    vmls.u16      q9, q10, q12          @ out2.lo: acc -= 5 * (R3 + R6)
    vld1.u32      {q2}, [r0], r2        @ q2 = R8 (R2 is no longer needed)

    vqrshrun.s16  d31, q7, #5           @ out1.hi = sat_u8((acc + 16) >> 5)
    vld1.u32      {q7}, [r7], r2        @ averaging row for out1
    vaddl.u8      q6, d10, d0           @ out3.lo: R5 + R6
    vrhadd.u8     q15, q7, q15          @ out1 = (out1 + averaging row + 1) >> 1
    vaddl.u8      q7, d6, d4            @ out3.lo: acc = R3 + R8
    vaddl.u8      q10, d8, d2           @ out3.lo: R4 + R7
    vmla.u16      q7, q6, q11           @ out3.lo: acc += 20 * (R5 + R6)
    vmls.u16      q8, q13, q12          @ out2.hi: acc -= 5 * (R3 + R6)
    vst1.u32      {q15}, [r1], r3       @ store out1
    vqrshrun.s16  d30, q9, #5           @ out2.lo = sat_u8((acc + 16) >> 5)
    vaddl.u8      q9, d7, d5            @ out3.hi: acc = R3 + R8
    vaddl.u8      q6, d11, d1           @ out3.hi: R5 + R6
    vmla.u16      q9, q6, q11           @ out3.hi: acc += 20 * (R5 + R6)
    vaddl.u8      q13, d9, d3           @ out3.hi: R4 + R7
    vmls.u16      q7, q10, q12          @ out3.lo: acc -= 5 * (R4 + R7)
    vqrshrun.s16  d31, q8, #5           @ out2.hi = sat_u8((acc + 16) >> 5)
    vld1.u32      {q8}, [r7], r2        @ averaging row for out2
    vmls.u16      q9, q13, q12          @ out3.hi: acc -= 5 * (R4 + R7)
    vrhadd.u8     q15, q8, q15          @ out2 = (out2 + averaging row + 1) >> 1
    vst1.u32      {q15}, [r1], r3       @ store out2
    vqrshrun.s16  d30, q7, #5           @ out3.lo = sat_u8((acc + 16) >> 5)
    vqrshrun.s16  d31, q9, #5           @ out3.hi = sat_u8((acc + 16) >> 5)
    vld1.u32      {q9}, [r7], r2        @ averaging row for out3
    vrhadd.u8     q15, q9, q15          @ out3 = (out3 + averaging row + 1) >> 1
    vst1.u32      {q15}, [r1], r3       @ store out3

    @ Every pass reloads R0..R8 from memory, so no partial sums are carried
    @ over to the next pass.
    subs          r5, r5, #4            @ 4 rows done
    subne         r0, r0, r2, lsl #2    @ r0 advanced 9 rows above; step back 5
    subne         r0, r0, r2            @   so the next pass starts 4 rows lower
    bne           loop_16               @ more rows remaining
    b             end_func
213
214
loop_8:                                 @ wd == 8: each pass writes 4 rows of 8 pixels
    @ Rn is the n-th row read through r0 in this pass. For output row k:
    @     acc = (R[k] + R[k+5]) + 20*(R[k+2] + R[k+3]) - 5*(R[k+1] + R[k+4])
    @     out = rounding_average(sat_u8((acc + 16) >> 5), row read through r7)
    @ q11 = 20 and q12 = 5 in every 16-bit lane.

    vld1.u32      d0, [r0], r2          @ d0 = R0
    vld1.u32      d1, [r0], r2          @ d1 = R1
    vld1.u32      d2, [r0], r2          @ d2 = R2
    vld1.u32      d3, [r0], r2          @ d3 = R3
    vld1.u32      d4, [r0], r2          @ d4 = R4
    vld1.u32      d5, [r0], r2          @ d5 = R5

    vaddl.u8      q3, d2, d3            @ out0: R2 + R3 (q3 occupies d6/d7)
    vaddl.u8      q4, d0, d5            @ out0: acc = R0 + R5
    vaddl.u8      q5, d1, d4            @ out0: R1 + R4
    vmla.u16      q4, q3, q11           @ out0: acc += 20 * (R2 + R3)
    vld1.u32      d6, [r0], r2          @ d6 = R6 (q3 was consumed just above)
    vaddl.u8      q7, d3, d4            @ out1: R3 + R4
    vaddl.u8      q8, d1, d6            @ out1: acc = R1 + R6
    vaddl.u8      q9, d2, d5            @ out1: R2 + R5
    vmls.u16      q4, q5, q12           @ out0: acc -= 5 * (R1 + R4)
    vmla.u16      q8, q7, q11           @ out1: acc += 20 * (R3 + R4)
    vld1.u32      d7, [r0], r2          @ d7 = R7
    vaddl.u8      q10, d4, d5           @ out2: R4 + R5
    vaddl.u8      q6, d2, d7            @ out2: acc = R2 + R7
    vaddl.u8      q5, d3, d6            @ out2: R3 + R6
    vmls.u16      q8, q9, q12           @ out1: acc -= 5 * (R2 + R5)
    vqrshrun.s16  d26, q4, #5           @ out0 = sat_u8((acc + 16) >> 5)
    vmla.u16      q6, q10, q11          @ out2: acc += 20 * (R4 + R5)
    vld1.32       d8, [r7], r2          @ averaging row for out0 (q4 is free again)
    vld1.32       d9, [r7], r2          @ averaging row for out1
    vld1.u32      d0, [r0], r2          @ d0 = R8 (R0 is no longer needed)
    vaddl.u8      q7, d5, d6            @ out3: R5 + R6
    vqrshrun.s16  d27, q8, #5           @ out1 = sat_u8((acc + 16) >> 5)
    vrhadd.u8     q13, q4, q13          @ out0/out1 = (filtered + averaging row + 1) >> 1
    vaddl.u8      q10, d3, d0           @ out3: acc = R3 + R8
    vmls.u16      q6, q5, q12           @ out2: acc -= 5 * (R3 + R6)
    vst1.u32      d26, [r1], r3         @ store out0
    vaddl.u8      q9, d4, d7            @ out3: R4 + R7
    vmla.u16      q10, q7, q11          @ out3: acc += 20 * (R5 + R6)
    vst1.u32      d27, [r1], r3         @ store out1
    vqrshrun.s16  d28, q6, #5           @ out2 = sat_u8((acc + 16) >> 5)
    vmls.u16      q10, q9, q12          @ out3: acc -= 5 * (R4 + R7)
    vld1.32       d12, [r7], r2         @ averaging row for out2 (q6 is free again)
    vld1.32       d13, [r7], r2         @ averaging row for out3
    vqrshrun.s16  d29, q10, #5          @ out3 = sat_u8((acc + 16) >> 5)
    vrhadd.u8     q14, q6, q14          @ out2/out3 = (filtered + averaging row + 1) >> 1
    vst1.u32      d28, [r1], r3         @ store out2
    vst1.u32      d29, [r1], r3         @ store out3

    subs          r5, r5, #4            @ 4 rows done
    subne         r0, r0, r2, lsl #2    @ r0 advanced 9 rows above; step back 5
    subne         r0, r0, r2            @   so the next pass starts 4 rows lower
    bne           loop_8                @ more rows remaining
    b             end_func
269
loop_4:                                 @ wd == 4: each pass writes 4 rows of 4 pixels
    @ Rn is the n-th row read through r0 in this pass. For output row k:
    @     acc = (R[k] + R[k+5]) + 20*(R[k+2] + R[k+3]) - 5*(R[k+1] + R[k+4])
    @     out = rounding_average(sat_u8((acc + 16) >> 5), row read through r7)
    @ Every row is fetched with a 32-bit lane load, so exactly 4 bytes are read
    @ per row. The arithmetic runs on whole registers, but only lane 0 of each
    @ result is stored, so the contents of the other lanes do not matter.
    @ q11 = 20 and q12 = 5 in every 16-bit lane.

    vld1.u32      d0[0], [r0], r2       @ d0 = R0
    vld1.u32      d1[0], [r0], r2       @ d1 = R1
    vld1.u32      d2[0], [r0], r2       @ d2 = R2
    vld1.u32      d3[0], [r0], r2       @ d3 = R3
    vld1.u32      d4[0], [r0], r2       @ d4 = R4
    vld1.u32      d5[0], [r0], r2       @ d5 = R5

    vaddl.u8      q3, d2, d3            @ out0: R2 + R3 (q3 occupies d6/d7)
    vaddl.u8      q4, d0, d5            @ out0: acc = R0 + R5
    vaddl.u8      q5, d1, d4            @ out0: R1 + R4
    vmla.u16      q4, q3, q11           @ out0: acc += 20 * (R2 + R3)
    vld1.u32      d6[0], [r0], r2       @ d6 = R6; lane load so only 4 bytes are read
    vaddl.u8      q7, d3, d4            @ out1: R3 + R4
    vaddl.u8      q8, d1, d6            @ out1: acc = R1 + R6
    vaddl.u8      q9, d2, d5            @ out1: R2 + R5
    vmls.u16      q4, q5, q12           @ out0: acc -= 5 * (R1 + R4)
    vld1.u32      d7[0], [r0], r2       @ d7 = R7
    vmla.u16      q8, q7, q11           @ out1: acc += 20 * (R3 + R4)
    vaddl.u8      q10, d4, d5           @ out2: R4 + R5
    vaddl.u8      q6, d2, d7            @ out2: acc = R2 + R7
    vaddl.u8      q5, d3, d6            @ out2: R3 + R6
    vmls.u16      q8, q9, q12           @ out1: acc -= 5 * (R2 + R5)
    vqrshrun.s16  d26, q4, #5           @ out0 = sat_u8((acc + 16) >> 5)
    vld1.u32      d8[0], [r7], r2       @ averaging row for out0 (q4 is free again)
    vld1.u32      d9[0], [r7], r2       @ averaging row for out1
    vmla.u16      q6, q10, q11          @ out2: acc += 20 * (R4 + R5)
    vld1.u32      d0[0], [r0], r2       @ d0 = R8 (R0 is no longer needed)
    vaddl.u8      q7, d5, d6            @ out3: R5 + R6
    vqrshrun.s16  d27, q8, #5           @ out1 = sat_u8((acc + 16) >> 5)
    vaddl.u8      q10, d3, d0           @ out3: acc = R3 + R8
    vrhadd.u8     q13, q13, q4          @ out0/out1 = (filtered + averaging row + 1) >> 1
    vmls.u16      q6, q5, q12           @ out2: acc -= 5 * (R3 + R6)
    vst1.u32      d26[0], [r1], r3      @ store out0 (4 bytes)
    vaddl.u8      q9, d4, d7            @ out3: R4 + R7
    vmla.u16      q10, q7, q11          @ out3: acc += 20 * (R5 + R6)
    vst1.u32      d27[0], [r1], r3      @ store out1 (4 bytes)
    vqrshrun.s16  d28, q6, #5           @ out2 = sat_u8((acc + 16) >> 5)
    vld1.u32      d12[0], [r7], r2      @ averaging row for out2 (q6 is free again)
    vld1.u32      d13[0], [r7], r2      @ averaging row for out3

    vmls.u16      q10, q9, q12          @ out3: acc -= 5 * (R4 + R7)
    vqrshrun.s16  d29, q10, #5          @ out3 = sat_u8((acc + 16) >> 5)
    vrhadd.u8     q14, q6, q14          @ out2/out3 = (filtered + averaging row + 1) >> 1
    vst1.u32      d28[0], [r1], r3      @ store out2 (4 bytes)
    vst1.u32      d29[0], [r1], r3      @ store out3 (4 bytes)

    @ r5 is decremented by 8, not 4: the result is zero only when r5 was 8,
    @ i.e. on the first pass of an 8-row block. On that block's second pass r5
    @ goes 0 -> -8, and for a 4-row block 4 -> -4, so both leave the loop.
    subs          r5, r5, #8
    subeq         r0, r0, r2, lsl #2    @ r0 advanced 9 rows above; step back 5
    subeq         r0, r0, r2            @   so the second pass starts 4 rows lower
    beq           loop_4                @ second pass of an 8-row block
                                        @ otherwise fall through to end_func
323
end_func:
    @ Common exit: undo the entry pushes in reverse order; loading pc returns
    @ to the caller.
    vpop          {d8-d15}              @ restore the saved NEON registers
    pop           {r4-r12, pc}          @ restore core registers and return
327
328
329