1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20@**
21@******************************************************************************
22@* @file
23@*  ih264_inter_pred_luma_bilinear_a9q.s
24@*
25@* @brief
26@*  Contains function definitions for inter prediction  interpolation.
27@*
28@* @author
29@* Ittiam
30@*
31@* @par List of Functions:
32@*
33@*  - ih264_inter_pred_luma_bilinear_a9q()
34@*
35@* @remarks
36@*  None
37@*
38@*******************************************************************************
39@*
40
41@* All the functions here are replicated from ih264_inter_pred_filters.c
42@
43
44@**
45@**
46@**
47@ *******************************************************************************
48@ *  function:ih264_inter_pred_luma_bilinear
49@ *
50@* @brief
@*    This routine applies the bilinear filter to the predictors.
52@*    The  filtering operation is described in
53@*    sec 8.4.2.2.1 titled "Luma sample interpolation process"
54@*
55@* @par Description:
56@\note
@*    This function is called to obtain pixels lying at the following
@*    locations: (1/4,1), (3/4,1), (1,1/4), (1,3/4), (1/4,1/2), (3/4,1/2),
@*    (1/2,1/4), (1/2,3/4), (3/4,1/4), (1/4,3/4), (3/4,3/4) and (1/4,1/4).
@*    Each output sample is the rounded average, (a + b + 1) >> 1, of the
@*    co-located samples of the two input arrays.
60@*
61@*
62@* @param[in] pu1_src1:
63@*  UWORD8 Pointer to the buffer containing the first input array.
64@*
65@* @param[in] pu1_src2:
66@*  UWORD8 Pointer to the buffer containing the second input array.
67@*
68@* @param[out] pu1_dst
69@*  UWORD8 pointer to the destination where the output of bilinear filter is stored.
70@*
71@* @param[in] src_strd1
72@*  Stride of the first input buffer
73@*
74@* @param[in] src_strd2
75@*  Stride of the second input buffer
76@*
77@* @param[in] dst_strd
78@*  integer destination stride of pu1_dst
79@*
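@* @param[in] ht
@*  integer height of the array ('height' in the prototype below).
@*  Rows processed: 4 or 8 when wd is 4; 4, 8 or 16 when wd is 8;
@*  8 or 16 otherwise.
@*
@* @param[in] wd
@*  integer width of the array ('width' in the prototype below).
@*  4 and 8 select dedicated paths; any other value is processed as 16.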
85@*
86@* @returns
87@*
88@* @remarks
89@*  None
90@*
91@*******************************************************************************
92@*
93
94@void ih264_inter_pred_luma_bilinear(UWORD8 *pu1_src1,
95@                                   UWORD8 *pu1_src2,
96@                                   UWORD8 *pu1_dst,
97@                                   WORD32 src_strd1,
98@                                   WORD32 src_strd2,
99@                                   WORD32 dst_strd,
100@                                   WORD32 height,
101@                                   WORD32 width)
102@
103@**************Variables Vs Registers*****************************************
104@   r0 => *pu1_src1
105@   r1 => *pu1_src2
106@   r2 => *pu1_dst
107@   r3 =>  src_strd1
108@   r4 =>  src_strd2
109@   r5 =>  dst_strd
110@   r6 =>  height
111@   r7 => width
112@
.text
.p2align 2

    .global ih264_inter_pred_luma_bilinear_a9q

@ -----------------------------------------------------------------------------
@ void ih264_inter_pred_luma_bilinear_a9q(UWORD8 *pu1_src1, UWORD8 *pu1_src2,
@                                         UWORD8 *pu1_dst,  WORD32 src_strd1,
@                                         WORD32 src_strd2, WORD32 dst_strd,
@                                         WORD32 height,    WORD32 width)
@
@ Purpose:
@   dst[y][x] = (src1[y][x] + src2[y][x] + 1) >> 1   (rounded average)
@
@ In:
@   r0 = pu1_src1, r1 = pu1_src2, r2 = pu1_dst, r3 = src_strd1
@   stack (at entry): [sp,#0] src_strd2, [sp,#4] dst_strd,
@                     [sp,#8] height,    [sp,#12] width
@
@ Sizes handled (fully unrolled, no loops):
@   width == 4      : 4 rows, or 8 rows when height != 4
@   width == 8      : 4 rows, 8 rows, or 16 rows when height is neither 4 nor 8
@   any other width : processed as 16 wide; 8 rows, or 16 rows when height != 8
@
@ Register use after the prologue:
@   r4 = src_strd2, r5 = dst_strd, r6 = height, r7 = width
@   r0/r1/r2 are post-incremented by their strides after every row access.
@   q0-q7   : source rows (q4-q7 = d8-d15 are saved/restored here)
@   q8-q13  : 16-bit sums of the widened source bytes
@   q14/q15 : narrowed results waiting to be stored
@
@ Clobbers: r0-r2, q0-q3, q8-q15, condition flags.
@ -----------------------------------------------------------------------------
ih264_inter_pred_luma_bilinear_a9q:

    stmfd         sp!, {r4-r12, r14}    @ save core registers (10 words = 40 bytes)
    vstmdb        sp!, {d8-d15}         @ save NEON d8-d15 (64 bytes)

    @ The stack arguments now sit 40 + 64 = 104 bytes above sp.
    ldr           r4, [sp, #104]        @ r4 = src_strd2
    ldr           r5, [sp, #108]        @ r5 = dst_strd
    ldr           r6, [sp, #112]        @ r6 = height
    ldr           r7, [sp, #116]        @ r7 = width

    cmp           r7, #4                @ width == 4 -> 4-wide path
    beq           loop_4
    cmp           r7, #8                @ width == 8 -> 8-wide path
    beq           loop_8

@ ---------------------------------------------------------------------------
@ 16-wide path: one q register per source row.
@ Loads, widening adds, narrowing shifts and stores of neighbouring rows are
@ interleaved; the statement order below is deliberate.
@ ---------------------------------------------------------------------------
loop_16:                                @ width treated as 16

    @ ---- rows 0-7 ----
    vld1.8        {q0}, [r0], r3        @ load row0, src1
    vld1.8        {q2}, [r1], r4        @ load row0, src2
    vld1.8        {q1}, [r0], r3        @ load row1, src1
    vaddl.u8      q10, d0, d4           @ row0 low half : src1 + src2 (16-bit)
    vld1.8        {q3}, [r1], r4        @ load row1, src2
    vaddl.u8      q11, d1, d5           @ row0 high half
    vld1.8        {q4}, [r0], r3        @ load row2, src1
    vaddl.u8      q12, d2, d6           @ row1 low half
    vld1.8        {q5}, [r0], r3        @ load row3, src1
    vaddl.u8      q13, d3, d7           @ row1 high half
    vld1.8        {q6}, [r1], r4        @ load row2, src2
    vaddl.u8      q8, d8, d12           @ row2 low half
    vld1.8        {q7}, [r1], r4        @ load row3, src2
    vaddl.u8      q9, d9, d13           @ row2 high half
    vqrshrun.s16  d28, q10, #1          @ row0: (sum + 1) >> 1, narrowed to 8 bit
    vqrshrun.s16  d29, q11, #1
    vaddl.u8      q10, d10, d14         @ row3 low half
    vqrshrun.s16  d30, q12, #1          @ row1 result
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q14}, [r2], r5       @ store dest row0
    vaddl.u8      q11, d11, d15         @ row3 high half
    vst1.8        {q15}, [r2], r5       @ store dest row1
    vqrshrun.s16  d28, q8, #1           @ row2 result
    vld1.8        {q0}, [r0], r3        @ load row4, src1
    vqrshrun.s16  d29, q9, #1
    vld1.8        {q1}, [r0], r3        @ load row5, src1
    vqrshrun.s16  d30, q10, #1          @ row3 result
    vld1.8        {q2}, [r1], r4        @ load row4, src2
    vqrshrun.s16  d31, q11, #1
    vld1.8        {q3}, [r1], r4        @ load row5, src2
    vaddl.u8      q10, d0, d4           @ row4 low half
    vst1.8        {q14}, [r2], r5       @ store dest row2
    vaddl.u8      q13, d3, d7           @ row5 high half
    vst1.8        {q15}, [r2], r5       @ store dest row3
    vaddl.u8      q11, d1, d5           @ row4 high half
    vld1.8        {q4}, [r0], r3        @ load row6, src1
    vaddl.u8      q12, d2, d6           @ row5 low half
    vld1.8        {q5}, [r0], r3        @ load row7, src1
    vqrshrun.s16  d28, q10, #1          @ row4 result
    vld1.8        {q6}, [r1], r4        @ load row6, src2
    vqrshrun.s16  d29, q11, #1
    vld1.8        {q7}, [r1], r4        @ load row7, src2
    vaddl.u8      q8, d8, d12           @ row6 low half
    vaddl.u8      q9, d9, d13           @ row6 high half
    vaddl.u8      q10, d10, d14         @ row7 low half
    vqrshrun.s16  d30, q12, #1          @ row5 result
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q14}, [r2], r5       @ store dest row4
    vaddl.u8      q11, d11, d15         @ row7 high half
    vst1.8        {q15}, [r2], r5       @ store dest row5
    vqrshrun.s16  d28, q8, #1           @ row6 result
    vqrshrun.s16  d30, q10, #1          @ row7 result (low half)
    vqrshrun.s16  d29, q9, #1
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q14}, [r2], r5       @ store dest row6
    cmp           r6, #8                @ height == 8 ?
    vst1.8        {q15}, [r2], r5       @ store dest row7 (NEON store keeps flags)

    beq           end_func              @ done when height == 8

    @ ---- rows 8-15 ----
    @ Row 8 of src2 is fetched only after the height == 8 exit above, so a
    @ 16x8 block never reads a ninth source row.
    vld1.8        {q2}, [r1], r4        @ load row8, src2
    vld1.8        {q0}, [r0], r3        @ load row8, src1
    vaddl.u8      q10, d0, d4           @ row8 low half
    vld1.8        {q1}, [r0], r3        @ load row9, src1
    vaddl.u8      q11, d1, d5           @ row8 high half
    vld1.8        {q3}, [r1], r4        @ load row9, src2
    vqrshrun.s16  d28, q10, #1          @ row8 result
    vld1.8        {q4}, [r0], r3        @ load row10, src1
    vqrshrun.s16  d29, q11, #1
    vld1.8        {q5}, [r0], r3        @ load row11, src1
    vaddl.u8      q12, d2, d6           @ row9 low half
    vld1.8        {q6}, [r1], r4        @ load row10, src2
    vaddl.u8      q13, d3, d7           @ row9 high half
    vld1.8        {q7}, [r1], r4        @ load row11, src2
    vaddl.u8      q8, d8, d12           @ row10 low half
    vaddl.u8      q9, d9, d13           @ row10 high half
    vaddl.u8      q10, d10, d14         @ row11 low half
    vqrshrun.s16  d30, q12, #1          @ row9 result
    vst1.8        {q14}, [r2], r5       @ store dest row8
    vqrshrun.s16  d31, q13, #1
    vst1.8        {q15}, [r2], r5       @ store dest row9
    vqrshrun.s16  d28, q8, #1           @ row10 result
    vld1.8        {q0}, [r0], r3        @ load row12, src1
    vaddl.u8      q11, d11, d15         @ row11 high half
    vld1.8        {q1}, [r0], r3        @ load row13, src1
    vqrshrun.s16  d29, q9, #1
    vld1.8        {q2}, [r1], r4        @ load row12, src2
    vqrshrun.s16  d30, q10, #1          @ row11 result
    vld1.8        {q3}, [r1], r4        @ load row13, src2
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q14}, [r2], r5       @ store dest row10
    vaddl.u8      q10, d0, d4           @ row12 low half
    vst1.8        {q15}, [r2], r5       @ store dest row11
    vaddl.u8      q11, d1, d5           @ row12 high half
    vld1.8        {q4}, [r0], r3        @ load row14, src1
    vaddl.u8      q13, d3, d7           @ row13 high half
    vld1.8        {q5}, [r0], r3        @ load row15, src1
    vaddl.u8      q12, d2, d6           @ row13 low half
    vld1.8        {q6}, [r1], r4        @ load row14, src2
    vaddl.u8      q8, d8, d12           @ row14 low half
    vld1.8        {q7}, [r1], r4        @ load row15, src2
    vaddl.u8      q9, d9, d13           @ row14 high half
    vqrshrun.s16  d28, q10, #1          @ row12 result
    vqrshrun.s16  d29, q11, #1
    vaddl.u8      q10, d10, d14         @ row15 low half
    vst1.8        {q14}, [r2], r5       @ store dest row12
    vqrshrun.s16  d30, q12, #1          @ row13 result
    vqrshrun.s16  d31, q13, #1
    vaddl.u8      q11, d11, d15         @ row15 high half
    vst1.8        {q15}, [r2], r5       @ store dest row13
    vqrshrun.s16  d28, q8, #1           @ row14 result
    vqrshrun.s16  d29, q9, #1
    vqrshrun.s16  d30, q10, #1          @ row15 result
    vst1.8        {q14}, [r2], r5       @ store dest row14
    vqrshrun.s16  d31, q11, #1
    vst1.8        {q15}, [r2], r5       @ store dest row15
    b             end_func

@ ---------------------------------------------------------------------------
@ 8-wide path: one d register per source row.
@ ---------------------------------------------------------------------------
loop_8:                                 @ width == 8

    @ ---- rows 0-3 ----
    vld1.8        {d0}, [r0], r3        @ load row0, src1
    vld1.8        {d4}, [r1], r4        @ load row0, src2
    vld1.8        {d1}, [r0], r3        @ load row1, src1
    vaddl.u8      q10, d0, d4           @ row0 sum
    vld1.8        {d5}, [r1], r4        @ load row1, src2
    vld1.8        {d2}, [r0], r3        @ load row2, src1
    vqrshrun.s16  d28, q10, #1          @ row0 result
    vld1.8        {d6}, [r1], r4        @ load row2, src2
    vaddl.u8      q11, d1, d5           @ row1 sum
    vld1.8        {d3}, [r0], r3        @ load row3, src1
    vaddl.u8      q12, d2, d6           @ row2 sum
    vst1.8        {d28}, [r2], r5       @ store dest row0
    vqrshrun.s16  d29, q11, #1          @ row1 result
    vld1.8        {d7}, [r1], r4        @ load row3, src2
    vqrshrun.s16  d30, q12, #1          @ row2 result
    vst1.8        {d29}, [r2], r5       @ store dest row1
    vaddl.u8      q13, d3, d7           @ row3 sum
    vst1.8        {d30}, [r2], r5       @ store dest row2
    vqrshrun.s16  d31, q13, #1          @ row3 result
    cmp           r6, #4                @ height == 4 ?
    vst1.8        {d31}, [r2], r5       @ store dest row3
    beq           end_func              @ done when height == 4

    @ ---- rows 4-7 ----
    vld1.8        {d12}, [r1], r4       @ load row4, src2
    vld1.8        {d8}, [r0], r3        @ load row4, src1
    vld1.8        {d9}, [r0], r3        @ load row5, src1
    vaddl.u8      q8, d8, d12           @ row4 sum
    vld1.8        {d13}, [r1], r4       @ load row5, src2
    vld1.8        {d10}, [r0], r3       @ load row6, src1
    vaddl.u8      q9, d9, d13           @ row5 sum
    vld1.8        {d14}, [r1], r4       @ load row6, src2
    vqrshrun.s16  d28, q8, #1           @ row4 result
    vld1.8        {d11}, [r0], r3       @ load row7, src1
    vqrshrun.s16  d29, q9, #1           @ row5 result
    vst1.8        {d28}, [r2], r5       @ store dest row4
    vaddl.u8      q10, d10, d14         @ row6 sum
    vst1.8        {d29}, [r2], r5       @ store dest row5
    vqrshrun.s16  d30, q10, #1          @ row6 result
    vld1.8        {d15}, [r1], r4       @ load row7, src2
    vaddl.u8      q11, d11, d15         @ row7 sum
    vst1.8        {d30}, [r2], r5       @ store dest row6
    vqrshrun.s16  d31, q11, #1          @ row7 result
    cmp           r6, #8                @ height == 8 ?
    vst1.8        {d31}, [r2], r5       @ store dest row7
    beq           end_func              @ done when height == 8

    @ ---- rows 8-15 ----
    vld1.8        {d0}, [r0], r3        @ load row8, src1
    vld1.8        {d4}, [r1], r4        @ load row8, src2
    vld1.8        {d1}, [r0], r3        @ load row9, src1
    vaddl.u8      q10, d0, d4           @ row8 sum
    vld1.8        {d5}, [r1], r4        @ load row9, src2
    vld1.8        {d2}, [r0], r3        @ load row10, src1
    vaddl.u8      q11, d1, d5           @ row9 sum
    vld1.8        {d6}, [r1], r4        @ load row10, src2
    vqrshrun.s16  d28, q10, #1          @ row8 result
    vld1.8        {d3}, [r0], r3        @ load row11, src1
    vaddl.u8      q12, d2, d6           @ row10 sum
    vld1.8        {d7}, [r1], r4        @ load row11, src2
    vqrshrun.s16  d29, q11, #1          @ row9 result
    vld1.8        {d8}, [r0], r3        @ load row12, src1
    vaddl.u8      q13, d3, d7           @ row11 sum
    vst1.8        {d28}, [r2], r5       @ store dest row8
    vqrshrun.s16  d30, q12, #1          @ row10 result
    vld1.8        {d12}, [r1], r4       @ load row12, src2
    vqrshrun.s16  d31, q13, #1          @ row11 result
    vst1.8        {d29}, [r2], r5       @ store dest row9
    vaddl.u8      q8, d8, d12           @ row12 sum
    vld1.8        {d9}, [r0], r3        @ load row13, src1
    vqrshrun.s16  d28, q8, #1           @ row12 result (row8 already stored)
    vld1.8        {d13}, [r1], r4       @ load row13, src2
    vld1.8        {d10}, [r0], r3       @ load row14, src1
    vaddl.u8      q9, d9, d13           @ row13 sum
    vld1.8        {d11}, [r0], r3       @ load row15, src1
    vld1.8        {d14}, [r1], r4       @ load row14, src2
    vqrshrun.s16  d29, q9, #1           @ row13 result (row9 already stored)
    vld1.8        {d15}, [r1], r4       @ load row15, src2
    vaddl.u8      q10, d10, d14         @ row14 sum
    vst1.8        {d30}, [r2], r5       @ store dest row10
    vaddl.u8      q11, d11, d15         @ row15 sum
    vst1.8        {d31}, [r2], r5       @ store dest row11
    vqrshrun.s16  d30, q10, #1          @ row14 result
    vst1.8        {d28}, [r2], r5       @ store dest row12
    vqrshrun.s16  d31, q11, #1          @ row15 result
    vst1.8        {d29}, [r2], r5       @ store dest row13
    vst1.8        {d30}, [r2], r5       @ store dest row14
    vst1.8        {d31}, [r2], r5       @ store dest row15

    b             end_func

@ ---------------------------------------------------------------------------
@ 4-wide path: each row occupies lane 0 (32 bits) of a d register.
@ The other lane is never loaded; it flows through the arithmetic but is
@ never stored.
@ ---------------------------------------------------------------------------
loop_4:                                 @ width == 4

    @ ---- rows 0-3 ----
    vld1.32       d0[0], [r0], r3       @ load row0, src1
    vld1.32       d4[0], [r1], r4       @ load row0, src2
    vld1.32       d1[0], [r0], r3       @ load row1, src1
    vaddl.u8      q10, d0, d4           @ row0 sum
    vld1.32       d5[0], [r1], r4       @ load row1, src2
    vld1.32       d2[0], [r0], r3       @ load row2, src1
    vqrshrun.s16  d28, q10, #1          @ row0 result
    vld1.32       d6[0], [r1], r4       @ load row2, src2
    vaddl.u8      q11, d1, d5           @ row1 sum
    vld1.32       d3[0], [r0], r3       @ load row3, src1
    vaddl.u8      q12, d2, d6           @ row2 sum
    vst1.32       d28[0], [r2], r5      @ store dest row0
    vqrshrun.s16  d29, q11, #1          @ row1 result
    vld1.32       d7[0], [r1], r4       @ load row3, src2
    vqrshrun.s16  d30, q12, #1          @ row2 result
    vst1.32       d29[0], [r2], r5      @ store dest row1
    vaddl.u8      q13, d3, d7           @ row3 sum
    vst1.32       d30[0], [r2], r5      @ store dest row2
    vqrshrun.s16  d31, q13, #1          @ row3 result
    cmp           r6, #4                @ height == 4 ?
    vst1.32       d31[0], [r2], r5      @ store dest row3
    beq           end_func              @ done when height == 4

    @ ---- rows 4-7 ----
    vld1.32       d12[0], [r1], r4      @ load row4, src2
    vld1.32       d8[0], [r0], r3       @ load row4, src1
    vld1.32       d9[0], [r0], r3       @ load row5, src1
    vaddl.u8      q8, d8, d12           @ row4 sum
    vld1.32       d13[0], [r1], r4      @ load row5, src2
    vld1.32       d10[0], [r0], r3      @ load row6, src1
    vaddl.u8      q9, d9, d13           @ row5 sum
    vld1.32       d14[0], [r1], r4      @ load row6, src2
    vqrshrun.s16  d28, q8, #1           @ row4 result
    vld1.32       d11[0], [r0], r3      @ load row7, src1
    vqrshrun.s16  d29, q9, #1           @ row5 result
    vst1.32       d28[0], [r2], r5      @ store dest row4
    vaddl.u8      q10, d10, d14         @ row6 sum
    vst1.32       d29[0], [r2], r5      @ store dest row5
    vqrshrun.s16  d30, q10, #1          @ row6 result
    vld1.32       d15[0], [r1], r4      @ load row7, src2
    vaddl.u8      q11, d11, d15         @ row7 sum
    vst1.32       d30[0], [r2], r5      @ store dest row6
    vqrshrun.s16  d31, q11, #1          @ row7 result
    vst1.32       d31[0], [r2], r5      @ store dest row7
                                        @ falls through to end_func

end_func:

    vldmia        sp!, {d8-d15}         @ restore NEON d8-d15
    ldmfd         sp!, {r4-r12, pc}     @ restore core registers and return
397
398
399