//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_inter_pred_chroma_av8.s
//*
//* @brief
//*  Contains function definitions for inter prediction interpolation.
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*
//*  - ih264_inter_pred_chroma_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

// All the functions here are replicated from ih264_inter_pred_filters.c

///**
//*******************************************************************************
//*
//* @brief
//*    Inter prediction chroma filter
//*
//* @par Description:
//*   Applies filtering to chroma samples as mentioned in
//*    sec 8.4.2.2.2 titled "chroma sample interpolation process"
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing interleaved U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] u1_dx
//*  dx value where the sample is to be produced (refer sec 8.4.2.2.2)
//*
//* @param[in] u1_dy
//*  dy value where the sample is to be produced (refer sec 8.4.2.2.2)
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  None
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

//void ih264_inter_pred_chroma_av8(UWORD8 *pu1_src,
//                                 UWORD8 *pu1_dst,
//                                 WORD32 src_strd,
//                                 WORD32 dst_strd,
//                                 WORD32 u1_dx,
//                                 WORD32 u1_dy,
//                                 WORD32 ht,
//                                 WORD32 wd)
//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 =>  src_strd
//    w3 =>  dst_strd
//    w4 =>  u1_dx
//    w5 =>  u1_dy
//    w6 =>  ht (height)
//    w7 =>  wd (width)
//
.text
.p2align 2
.include "ih264_neon_macros.s"

    .global ih264_inter_pred_chroma_av8

//------------------------------------------------------------------------------
// ih264_inter_pred_chroma_av8
//
// Syntax : GNU as, AArch64 (A64 + Advanced SIMD)
//
// In     : x0 = pu1_src   source pointer, advanced by src_strd after each row
//          x1 = pu1_dst   destination pointer, advanced by dst_strd per row
//          w2 = src_strd  (sign-extended to x2)
//          w3 = dst_strd  (sign-extended to x3)
//          w4 = u1_dx     horizontal fraction
//          w5 = u1_dy     vertical fraction
//          w6 = ht        rows to produce
//          w7 = wd        row width; each row stores 2 * wd bytes
// Out    : none
// Scratch: x8-x11, flags, v0-v7, v16-v31; v8-v14 are also written, bracketed
//          by push_v_regs / pop_v_regs from ih264_neon_macros.s (presumably
//          these save/restore the callee-saved d8-d15 - confirm in the macro
//          file).
//
// Every output byte is a four-tap blend of the current and the next source
// row:
//
//   dst[i] = (A*cur[i] + B*cur[i+2] + C*nxt[i] + D*nxt[i+2] + 32) >> 6
//
//   A = (8-dx)*(8-dy)  -> v28      B = dx*(8-dy)  -> v29
//   C = (8-dx)*dy      -> v30      D = dx*dy      -> v31
//
// The "+2" tap is built with "ext ..., #2", i.e. a two-byte shift of the row.
// "sqrshrun #6" supplies the rounding (+32), the shift and the narrowing.
//
// Only the low byte of each weight is broadcast, so dx and dy are assumed to
// be in 0..7 (weights 0..64) - not checked here, verify against the caller.
//
// Dispatch (all paths are straight-line code, there is no actual loop):
//   wd == 2            -> loop_2 : ht == 2, otherwise 4 rows
//   wd == 4            -> loop_4 : ht == 2, ht == 4, otherwise 8 rows
//   any other wd       -> loop_8 : ht == 4, otherwise 8 rows
//
// NOTE(review): each path reads ht + 1 source rows and loads 24 / 16 / 8
// bytes per row for wd = 8 / 4 / 2, while the taps consume only 18 / 10 / 6
// bytes. The source buffer must therefore be readable a few bytes past the
// last tap - confirm the caller guarantees that padding.
//------------------------------------------------------------------------------
ih264_inter_pred_chroma_av8:

    push_v_regs                                     // v8-v14 are used below

    // Widen the 32-bit C arguments; x2/x3 are used as post-index strides.
    sxtw      x2, w2
    sxtw      x3, w3
    sxtw      x4, w4
    sxtw      x5, w5
    sxtw      x6, w6
    sxtw      x7, w7

    // Bilinear weights. Only caller-saved scratch registers are used, so no
    // general-purpose register has to be spilled.
    mov       x8, #8
    sub       x8, x8, x4                            // x8 = 8 - dx
    mov       x9, #8
    sub       x9, x9, x5                            // x9 = 8 - dy

    mul       x10, x8, x9                           // A = (8-dx)*(8-dy)
    mul       x11, x4, x9                           // B = dx*(8-dy)

    dup       v28.8b, w10
    dup       v29.8b, w11

    mul       x10, x8, x5                           // C = (8-dx)*dy
    mul       x11, x4, x5                           // D = dx*dy

    dup       v30.8b, w10
    dup       v31.8b, w11

    cmp       x7, #2                                // wd == 2 -> 4-byte rows
    beq       loop_2
    cmp       x7, #4                                // wd == 4 -> 8-byte rows
    beq       loop_4
                                                    // otherwise 16-byte rows

//------------------------------------------------------------------------------
// 16 output bytes per row. Three register sets rotate through the source rows:
//   set A: v0,v1,v2   with shifted copies v3,v4
//   set B: v5,v6,v7   with shifted copies v8,v9
//   set C: v10,v11,v12 with shifted copies v13,v14
// Loads, shifts and multiplies are interleaved by hand; keep the order.
//------------------------------------------------------------------------------
loop_8:
    ld1       {v0.8b, v1.8b, v2.8b}, [x0], x2       // load src row0 (set A)
    ext       v3.8b, v0.8b, v1.8b, #2
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2       // load src row1 (set B)
    umull     v20.8h, v0.8b, v28.8b
    ext       v8.8b, v5.8b, v6.8b, #2
    umlal     v20.8h, v3.8b, v29.8b
    ext       v9.8b, v6.8b, v7.8b, #2
    umlal     v20.8h, v5.8b, v30.8b
    ext       v4.8b, v1.8b, v2.8b, #2
    umlal     v20.8h, v8.8b, v31.8b
    sqrshrun  v26.8b, v20.8h, #6                    // dst row0, bytes 0-7
    umull     v22.8h, v1.8b, v28.8b
    ld1       {v10.8b, v11.8b, v12.8b}, [x0], x2    // load src row2 (set C)
    umlal     v22.8h, v4.8b, v29.8b
    ext       v13.8b, v10.8b, v11.8b, #2
    umlal     v22.8h, v6.8b, v30.8b
    ext       v14.8b, v11.8b, v12.8b, #2
    umlal     v22.8h, v9.8b, v31.8b
    sqrshrun  v27.8b, v22.8h, #6                    // dst row0, bytes 8-15
    umull     v24.8h, v5.8b, v28.8b
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row0
    umlal     v24.8h, v8.8b, v29.8b
    ld1       {v0.8b, v1.8b, v2.8b}, [x0], x2       // load src row3 (set A)
    umlal     v24.8h, v10.8b, v30.8b
    ext       v3.8b, v0.8b, v1.8b, #2
    umlal     v24.8h, v13.8b, v31.8b
    ext       v4.8b, v1.8b, v2.8b, #2
    umull     v16.8h, v6.8b, v28.8b
    sqrshrun  v18.8b, v24.8h, #6
    umlal     v16.8h, v9.8b, v29.8b
    umlal     v16.8h, v11.8b, v30.8b
    umlal     v16.8h, v14.8b, v31.8b
    sqrshrun  v19.8b, v16.8h, #6
    st1       {v18.8b, v19.8b}, [x1], x3            // store dst row1
    umull     v20.8h, v10.8b, v28.8b
    umlal     v20.8h, v13.8b, v29.8b
    umlal     v20.8h, v0.8b, v30.8b
    umlal     v20.8h, v3.8b, v31.8b
    sqrshrun  v26.8b, v20.8h, #6
    umull     v24.8h, v11.8b, v28.8b
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2       // load src row4 (set B)
    umlal     v24.8h, v14.8b, v29.8b
    ext       v8.8b, v5.8b, v6.8b, #2
    umlal     v24.8h, v1.8b, v30.8b
    ext       v9.8b, v6.8b, v7.8b, #2
    umlal     v24.8h, v4.8b, v31.8b
    umull     v20.8h, v0.8b, v28.8b
    sqrshrun  v27.8b, v24.8h, #6
    umlal     v20.8h, v3.8b, v29.8b
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row2
    umlal     v20.8h, v5.8b, v30.8b
    umlal     v20.8h, v8.8b, v31.8b
    umull     v22.8h, v1.8b, v28.8b
    umlal     v22.8h, v4.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    sqrshrun  v26.8b, v20.8h, #6
    umlal     v22.8h, v9.8b, v31.8b
    cmp       x6, #4                                // flags survive the SIMD ops below
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row3

    beq       end_func                              // ht == 4: done

    ld1       {v10.8b, v11.8b, v12.8b}, [x0], x2    // load src row5 (set C)
    ext       v13.8b, v10.8b, v11.8b, #2
    umull     v24.8h, v5.8b, v28.8b
    ext       v14.8b, v11.8b, v12.8b, #2
    ld1       {v0.8b, v1.8b, v2.8b}, [x0], x2       // load src row6 (set A)
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v10.8b, v30.8b
    umlal     v24.8h, v13.8b, v31.8b
    ext       v3.8b, v0.8b, v1.8b, #2
    umull     v16.8h, v6.8b, v28.8b
    sqrshrun  v18.8b, v24.8h, #6
    umlal     v16.8h, v9.8b, v29.8b
    umlal     v16.8h, v11.8b, v30.8b
    umlal     v16.8h, v14.8b, v31.8b
    ext       v4.8b, v1.8b, v2.8b, #2
    sqrshrun  v19.8b, v16.8h, #6
    st1       {v18.8b, v19.8b}, [x1], x3            // store dst row4
    umull     v20.8h, v10.8b, v28.8b
    umlal     v20.8h, v13.8b, v29.8b
    umlal     v20.8h, v0.8b, v30.8b
    umlal     v20.8h, v3.8b, v31.8b
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2       // load src row7 (set B)
    sqrshrun  v26.8b, v20.8h, #6
    umull     v24.8h, v11.8b, v28.8b
    umlal     v24.8h, v14.8b, v29.8b
    ext       v8.8b, v5.8b, v6.8b, #2
    umlal     v24.8h, v1.8b, v30.8b
    umlal     v24.8h, v4.8b, v31.8b
    ext       v9.8b, v6.8b, v7.8b, #2
    sqrshrun  v27.8b, v24.8h, #6
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row5
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v3.8b, v29.8b
    umlal     v20.8h, v5.8b, v30.8b
    umlal     v20.8h, v8.8b, v31.8b
    ld1       {v10.8b, v11.8b, v12.8b}, [x0], x2    // load src row8 (set C)
    sqrshrun  v26.8b, v20.8h, #6
    umull     v22.8h, v1.8b, v28.8b
    umlal     v22.8h, v4.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    ext       v13.8b, v10.8b, v11.8b, #2
    umlal     v22.8h, v9.8b, v31.8b
    ext       v14.8b, v11.8b, v12.8b, #2
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v26.8b, v27.8b}, [x1], x3            // store dst row6
    umull     v24.8h, v5.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v10.8b, v30.8b
    umlal     v24.8h, v13.8b, v31.8b
    umull     v16.8h, v6.8b, v28.8b
    sqrshrun  v18.8b, v24.8h, #6
    umlal     v16.8h, v9.8b, v29.8b
    umlal     v16.8h, v11.8b, v30.8b
    umlal     v16.8h, v14.8b, v31.8b
    sqrshrun  v19.8b, v16.8h, #6
    st1       {v18.8b, v19.8b}, [x1], x3            // store dst row7
    b         end_func

//------------------------------------------------------------------------------
// 8 output bytes per row. Each source row is loaded as two 8-byte halves so
// that "ext #2" can pull the two trailing bytes from the second half.
//------------------------------------------------------------------------------
loop_4:
    ld1       {v0.8b, v1.8b}, [x0], x2              // load src row0
    ext       v2.8b, v0.8b, v1.8b, #2
    ld1       {v3.8b, v4.8b}, [x0], x2              // load src row1
    ext       v5.8b, v3.8b, v4.8b, #2
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v2.8b, v29.8b
    umlal     v20.8h, v3.8b, v30.8b
    umlal     v20.8h, v5.8b, v31.8b
    ld1       {v6.8b, v7.8b}, [x0], x2              // load src row2
    sqrshrun  v26.8b, v20.8h, #6
    ext       v8.8b, v6.8b, v7.8b, #2
    st1       {v26.8b}, [x1], x3                    // store dst row0
    umull     v22.8h, v3.8b, v28.8b
    umlal     v22.8h, v5.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    umlal     v22.8h, v8.8b, v31.8b
    cmp       x6, #2                                // flags survive the SIMD ops below
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v27.8b}, [x1], x3                    // store dst row1
    beq       end_func                              // ht == 2: done

    ld1       {v9.8b, v10.8b}, [x0], x2             // load src row3
    ext       v11.8b, v9.8b, v10.8b, #2
    umull     v24.8h, v6.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v9.8b, v30.8b
    umlal     v24.8h, v11.8b, v31.8b
    ld1       {v0.8b, v1.8b}, [x0], x2              // load src row4
    sqrshrun  v16.8b, v24.8h, #6
    ext       v2.8b, v0.8b, v1.8b, #2
    st1       {v16.8b}, [x1], x3                    // store dst row2
    umull     v18.8h, v9.8b, v28.8b
    umlal     v18.8h, v11.8b, v29.8b
    umlal     v18.8h, v0.8b, v30.8b
    umlal     v18.8h, v2.8b, v31.8b
    cmp       x6, #4                                // flags survive the SIMD ops below
    sqrshrun  v17.8b, v18.8h, #6
    st1       {v17.8b}, [x1], x3                    // store dst row3
    beq       end_func                              // ht == 4: done

    ld1       {v3.8b, v4.8b}, [x0], x2              // load src row5
    ext       v5.8b, v3.8b, v4.8b, #2
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v2.8b, v29.8b
    umlal     v20.8h, v3.8b, v30.8b
    umlal     v20.8h, v5.8b, v31.8b
    ld1       {v6.8b, v7.8b}, [x0], x2              // load src row6
    sqrshrun  v26.8b, v20.8h, #6
    ext       v8.8b, v6.8b, v7.8b, #2
    st1       {v26.8b}, [x1], x3                    // store dst row4
    umull     v22.8h, v3.8b, v28.8b
    umlal     v22.8h, v5.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    umlal     v22.8h, v8.8b, v31.8b
    ld1       {v9.8b, v10.8b}, [x0], x2             // load src row7
    sqrshrun  v27.8b, v22.8h, #6
    ext       v11.8b, v9.8b, v10.8b, #2
    st1       {v27.8b}, [x1], x3                    // store dst row5
    umull     v24.8h, v6.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v9.8b, v30.8b
    umlal     v24.8h, v11.8b, v31.8b
    ld1       {v0.8b, v1.8b}, [x0], x2              // load src row8
    sqrshrun  v16.8b, v24.8h, #6
    ext       v2.8b, v0.8b, v1.8b, #2
    st1       {v16.8b}, [x1], x3                    // store dst row6
    umull     v18.8h, v9.8b, v28.8b
    umlal     v18.8h, v11.8b, v29.8b
    umlal     v18.8h, v0.8b, v30.8b
    umlal     v18.8h, v2.8b, v31.8b
    sqrshrun  v17.8b, v18.8h, #6
    st1       {v17.8b}, [x1], x3                    // store dst row7
    b         end_func

//------------------------------------------------------------------------------
// 4 output bytes per row. A single 8-byte load per row is enough: "ext" of a
// register with itself rotates by two bytes, and only lanes 0-3 of the result
// (which come from source bytes 0-5) are stored.
//------------------------------------------------------------------------------
loop_2:
    ld1       {v0.8b}, [x0], x2                     // load src row0
    ext       v2.8b, v0.8b, v0.8b, #2
    ld1       {v3.8b}, [x0], x2                     // load src row1
    ext       v5.8b, v3.8b, v3.8b, #2
    umull     v20.8h, v0.8b, v28.8b
    umlal     v20.8h, v2.8b, v29.8b
    umlal     v20.8h, v3.8b, v30.8b
    umlal     v20.8h, v5.8b, v31.8b
    ld1       {v6.8b}, [x0], x2                     // load src row2
    sqrshrun  v26.8b, v20.8h, #6
    ext       v8.8b, v6.8b, v6.8b, #2
    st1       {v26.s}[0], [x1], x3                  // store dst row0 (4 bytes)
    umull     v22.8h, v3.8b, v28.8b
    umlal     v22.8h, v5.8b, v29.8b
    umlal     v22.8h, v6.8b, v30.8b
    umlal     v22.8h, v8.8b, v31.8b
    cmp       x6, #2                                // flags survive the SIMD ops below
    sqrshrun  v27.8b, v22.8h, #6
    st1       {v27.s}[0], [x1], x3                  // store dst row1 (4 bytes)
    beq       end_func                              // ht == 2: done

    ld1       {v9.8b}, [x0], x2                     // load src row3
    ext       v11.8b, v9.8b, v9.8b, #2
    umull     v24.8h, v6.8b, v28.8b
    umlal     v24.8h, v8.8b, v29.8b
    umlal     v24.8h, v9.8b, v30.8b
    umlal     v24.8h, v11.8b, v31.8b
    ld1       {v0.8b}, [x0], x2                     // load src row4
    sqrshrun  v16.8b, v24.8h, #6
    ext       v2.8b, v0.8b, v0.8b, #2
    st1       {v16.s}[0], [x1], x3                  // store dst row2 (4 bytes)
    umull     v18.8h, v9.8b, v28.8b
    umlal     v18.8h, v11.8b, v29.8b
    umlal     v18.8h, v0.8b, v30.8b
    umlal     v18.8h, v2.8b, v31.8b
    sqrshrun  v17.8b, v18.8h, #6
    st1       {v17.s}[0], [x1], x3                  // store dst row3 (4 bytes)

end_func:
    pop_v_regs                                      // undo push_v_regs
    ret
