1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21//******************************************************************************
22//* @file
23//*  ih264_inter_pred_luma_horz_qpel_av8.s
24//*
25//* @brief
26//*  Contains function definitions for inter prediction horizontal quarter pel interpolation.
27//*
28//* @author
29//*  Mohit
30//*
31//* @par List of Functions:
32//*
33//*  - ih264_inter_pred_luma_horz_qpel_av8()
34//*
35//* @remarks
36//*  None
37//*
38//*******************************************************************************
39//*/
40
41///* All the functions here are replicated from ih264_inter_pred_filters.c
42//
43
44///**
45///**
46//*******************************************************************************
47//*
48//* @brief
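//*     Quarter pel interprediction luma filter for horizontal input
//*
//* @par Description:
//* Applies a 6 tap horizontal filter and clips the result to 8 bits to get
//* the half-pel sample, then averages it (with rounding) with the full-pel
//* sample selected by the x offset in dydx.
//* See sec 8.4.2.2.1 titled "Luma sample interpolation process"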
54//*
55//* @param[in] pu1_src
56//*  UWORD8 pointer to the source
57//*
58//* @param[out] pu1_dst
59//*  UWORD8 pointer to the destination
60//*
61//* @param[in] src_strd
62//*  integer source stride
63//*
64//* @param[in] dst_strd
65//*  integer destination stride
66//*
67//* @param[in] ht
68//*  integer height of the array
69//*
70//* @param[in] wd
71//*  integer width of the array
72//*
//* @param[in] pu1_tmp
//*  temporary buffer: UNUSED in this function
//*
//* @param[in] dydx
//*  x and y reference offset for qpel calculations
//*
//* @returns
//*
//* @remarks
79//*  None
80//*
81//*******************************************************************************
82//*/
83
//void ih264_inter_pred_luma_horz_qpel_av8 (
//                            UWORD8 *pu1_src,
//                            UWORD8 *pu1_dst,
//                            WORD32 src_strd,
//                            WORD32 dst_strd,
//                            WORD32 ht,
//                            WORD32 wd,
//                            UWORD8 *pu1_tmp,
//                            UWORD32 dydx)
93
//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>  ht
//    x5 =>  wd
//    x6 => *pu1_tmp (unused)
//    x7 =>  dydx
102
103.text
104.p2align 2
105.include "ih264_neon_macros.s"
106
107
108
109
110    .global ih264_inter_pred_luma_horz_qpel_av8
111
//------------------------------------------------------------------------------
// ih264_inter_pred_luma_horz_qpel_av8
//
// Horizontal quarter-pel luma interpolation.
//
// For every output pixel the routine
//   1. computes the 6-tap half-pel value
//          h = (a0 - 5*a1 + 20*a2 + 20*a3 - 5*a4 + a5 + 16) >> 5
//      clipped to 0..255, where a0..a5 are the six consecutive source bytes
//      starting at (pu1_src - 2), and
//   2. averages it with rounding (urhadd) with the full-pel byte at
//          pu1_src + ((dydx & 3) >> 1).
//
// In:   x0 = pu1_src, x1 = pu1_dst, w2 = src_strd, w3 = dst_strd,
//       w4 = ht, w5 = wd, x6 = pu1_tmp (not read), w7 = dydx
//
// Dispatch: wd == 8 -> loop_8, wd == 4 -> loop_4, anything else -> loop_16.
// Row handling as written:
//   loop_16 : 8 rows per pass, a second pass only when ht == 16
//   loop_8  : 4 rows, 8 rows, or (second pass) 16 rows when ht == 16
//   loop_4  : 4 rows per pass, a second pass only when ht == 8
//
// NOTE: the source loads are wider than strictly needed (24/16/16 bytes per
// row for wd 16/8/4, 8 bytes of full-pel data for wd 4), and the wd == 8 path
// has already loaded source rows 4 and 5 before it tests for ht == 4.
//
// Vector register roles:
//   v0 = 5 (tap weight), v1 = 20 (tap weight)
//   v2-v4 / v5-v7       : raw source bytes of the even / odd row of a pair
//   v8, v10, v14, v16   : 16-bit filter accumulators
//   v12, v13            : full-pel bytes used for the final average
//   v18-v21             : narrowed 8-bit results
//   v22-v31             : shifted source windows produced by ext
// v8-v15 are preserved via push_v_regs / pop_v_regs (macros from the included
// ih264_neon_macros.s).
//------------------------------------------------------------------------------
ih264_inter_pred_luma_horz_qpel_av8:

    push_v_regs
    stp       x19, x20, [sp, #-16]!     // x19/x20 are not referenced below; save kept as in the original frame

    // The strides, height and width are 32-bit C arguments. AAPCS64 leaves the
    // upper 32 bits of their registers unspecified, so sign-extend them before
    // using the full x registers for addressing and comparisons.
    sxtw      x2, w2                    // src_strd
    sxtw      x3, w3                    // dst_strd
    sxtw      x4, w4                    // ht
    sxtw      x5, w5                    // wd

    and       x7, x7, #3                // x offset = dydx & 3 (also discards the upper bits of dydx)
    add       x7, x0, x7, lsr #1        // x7 = pu1_src + (x_offset >> 1): full-pel sample to average with
    sub       x0, x0, #2                // x0 = pu1_src - 2: first tap of the 6-tap window
    sub       x14, x4, #16              // x14 = ht - 16, pass counter for loop_16 / loop_8
    movi      v0.16b, #5                // tap weight 5
    cmp       x5, #8                    // wd == 8 ?
    movi      v1.16b, #20               // tap weight 20

    beq       loop_8

    cmp       x5, #4                    // wd == 4 ?
    beq       loop_4

//------------------------------------------------------------------------------
// wd == 16 (and any width other than 4 or 8): 8 rows per pass.
// c1 = output pixels 0..7 (window v2:v3 / v5:v6),
// c2 = output pixels 8..15 (window v3:v4 / v6:v7).
//------------------------------------------------------------------------------
loop_16:
    // ---- rows 0 and 1 ----
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   // load row0
    add       x14, x14, #1                      // advance pass counter
    ext       v31.8b, v2.8b , v3.8b , #5
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   // load row1
    ext       v30.8b, v3.8b , v4.8b , #5
    uaddl     v8.8h, v31.8b, v2.8b      // a0 + a5                      (c1,row0)
    ext       v28.8b, v5.8b , v6.8b , #5
    uaddl     v10.8h, v30.8b, v3.8b     // a0 + a5                      (c2,row0)
    ext       v27.8b, v6.8b , v7.8b , #5
    uaddl     v14.8h, v28.8b, v5.8b     // a0 + a5                      (c1,row1)
    ext       v31.8b, v2.8b , v3.8b , #2
    uaddl     v16.8h, v27.8b, v6.8b     // a0 + a5                      (c2,row1)
    ext       v30.8b, v3.8b , v4.8b , #2
    umlal     v8.8h, v31.8b, v1.8b      // + 20*a2                      (c1,row0)
    ext       v28.8b, v5.8b , v6.8b , #2
    umlal     v10.8h, v30.8b, v1.8b     // + 20*a2                      (c2,row0)
    ext       v27.8b, v6.8b , v7.8b , #2
    umlal     v14.8h, v28.8b, v1.8b     // + 20*a2                      (c1,row1)
    ext       v31.8b, v2.8b , v3.8b , #3
    umlal     v16.8h, v27.8b, v1.8b     // + 20*a2                      (c2,row1)
    ext       v30.8b, v3.8b , v4.8b , #3
    umlal     v8.8h, v31.8b, v1.8b      // + 20*a3                      (c1,row0)
    ext       v28.8b, v5.8b , v6.8b , #3
    umlal     v10.8h, v30.8b, v1.8b     // + 20*a3                      (c2,row0)
    ext       v27.8b, v6.8b , v7.8b , #3
    umlal     v14.8h, v28.8b, v1.8b     // + 20*a3                      (c1,row1)
    ext       v31.8b, v2.8b , v3.8b , #1
    umlal     v16.8h, v27.8b, v1.8b     // + 20*a3                      (c2,row1)
    ext       v30.8b, v3.8b , v4.8b , #1
    umlsl     v8.8h, v31.8b, v0.8b      // - 5*a1                       (c1,row0)
    ext       v28.8b, v5.8b , v6.8b , #1
    umlsl     v10.8h, v30.8b, v0.8b     // - 5*a1                       (c2,row0)
    ext       v27.8b, v6.8b , v7.8b , #1
    umlsl     v14.8h, v28.8b, v0.8b     // - 5*a1                       (c1,row1)
    ext       v31.8b, v2.8b , v3.8b , #4
    umlsl     v16.8h, v27.8b, v0.8b     // - 5*a1                       (c2,row1)
    ext       v30.8b, v3.8b , v4.8b , #4
    umlsl     v8.8h, v31.8b, v0.8b      // - 5*a4                       (c1,row0)
    ext       v28.8b, v5.8b , v6.8b , #4
    umlsl     v10.8h, v30.8b, v0.8b     // - 5*a4                       (c2,row0)
    ext       v27.8b, v6.8b , v7.8b , #4
    umlsl     v14.8h, v28.8b, v0.8b     // - 5*a4                       (c1,row1)
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   // load row2
    umlsl     v16.8h, v27.8b, v0.8b     // - 5*a4                       (c2,row1)

    ld1       {v12.2s, v13.2s}, [x7], x2        // full-pel bytes, row0
    sqrshrun  v20.8b, v8.8h, #5         // (sum + 16) >> 5, clip to u8  (c1,row0)
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   // load row3
    sqrshrun  v21.8b, v10.8h, #5        // (sum + 16) >> 5, clip to u8  (c2,row0)
    ext       v31.8b, v2.8b , v3.8b , #5
    urhadd    v20.16b, v12.16b , v20.16b        // qpel: rounded average with full-pel
    urhadd    v21.16b, v13.16b , v21.16b

    sqrshrun  v18.8b, v14.8h, #5        // (sum + 16) >> 5, clip to u8  (c1,row1)
    st1       {v20.8b, v21.8b}, [x1], x3        // store dest row0
    ext       v30.8b, v3.8b , v4.8b , #5
    sqrshrun  v19.8b, v16.8h, #5        // (sum + 16) >> 5, clip to u8  (c2,row1)

    // ---- rows 2 and 3 ----
    ld1       {v12.2s, v13.2s}, [x7], x2        // full-pel bytes, row1
    ext       v28.8b, v5.8b , v6.8b , #5
    urhadd    v18.16b, v12.16b , v18.16b        // qpel average, row1
    urhadd    v19.16b, v13.16b , v19.16b

    uaddl     v8.8h, v31.8b, v2.8b      // a0 + a5                      (c1,row2)
    st1       {v18.8b, v19.8b}, [x1], x3        // store dest row1
    uaddl     v10.8h, v30.8b, v3.8b     // a0 + a5                      (c2,row2)
    ext       v27.8b, v6.8b , v7.8b , #5
    uaddl     v14.8h, v28.8b, v5.8b     // a0 + a5                      (c1,row3)
    ext       v31.8b, v2.8b , v3.8b , #2
    uaddl     v16.8h, v27.8b, v6.8b     // a0 + a5                      (c2,row3)
    ext       v30.8b, v3.8b , v4.8b , #2
    umlal     v8.8h, v31.8b, v1.8b      // + 20*a2                      (c1,row2)
    ext       v27.8b, v6.8b , v7.8b , #2
    umlal     v10.8h, v30.8b, v1.8b     // + 20*a2                      (c2,row2)
    ext       v28.8b, v5.8b , v6.8b , #2
    umlal     v14.8h, v28.8b, v1.8b     // + 20*a2                      (c1,row3)
    ext       v31.8b, v2.8b , v3.8b , #3
    umlal     v16.8h, v27.8b, v1.8b     // + 20*a2                      (c2,row3)
    ext       v30.8b, v3.8b , v4.8b , #3
    umlal     v8.8h, v31.8b, v1.8b      // + 20*a3                      (c1,row2)
    ext       v28.8b, v5.8b , v6.8b , #3
    umlal     v10.8h, v30.8b, v1.8b     // + 20*a3                      (c2,row2)
    ext       v27.8b, v6.8b , v7.8b , #3
    umlal     v14.8h, v28.8b, v1.8b     // + 20*a3                      (c1,row3)
    ext       v31.8b, v2.8b , v3.8b , #1
    umlal     v16.8h, v27.8b, v1.8b     // + 20*a3                      (c2,row3)
    ext       v30.8b, v3.8b , v4.8b , #1
    umlsl     v8.8h, v31.8b, v0.8b      // - 5*a1                       (c1,row2)
    ext       v28.8b, v5.8b , v6.8b , #1
    umlsl     v10.8h, v30.8b, v0.8b     // - 5*a1                       (c2,row2)
    ext       v27.8b, v6.8b , v7.8b , #1
    umlsl     v14.8h, v28.8b, v0.8b     // - 5*a1                       (c1,row3)
    ext       v31.8b, v2.8b , v3.8b , #4
    umlsl     v16.8h, v27.8b, v0.8b     // - 5*a1                       (c2,row3)
    ext       v30.8b, v3.8b , v4.8b , #4
    umlsl     v8.8h, v31.8b, v0.8b      // - 5*a4                       (c1,row2)
    ext       v28.8b, v5.8b , v6.8b , #4
    umlsl     v10.8h, v30.8b, v0.8b     // - 5*a4                       (c2,row2)
    ext       v27.8b, v6.8b , v7.8b , #4
    umlsl     v14.8h, v28.8b, v0.8b     // - 5*a4                       (c1,row3)
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   // load row4
    umlsl     v16.8h, v27.8b, v0.8b     // - 5*a4                       (c2,row3)

    ld1       {v12.2s, v13.2s}, [x7], x2        // full-pel bytes, row2
    sqrshrun  v20.8b, v8.8h, #5         // (sum + 16) >> 5, clip to u8  (c1,row2)
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   // load row5
    sqrshrun  v21.8b, v10.8h, #5        // (sum + 16) >> 5, clip to u8  (c2,row2)
    ext       v31.8b, v2.8b , v3.8b , #5
    urhadd    v20.16b, v12.16b , v20.16b        // qpel average, row2
    urhadd    v21.16b, v13.16b , v21.16b

    sqrshrun  v18.8b, v14.8h, #5        // (sum + 16) >> 5, clip to u8  (c1,row3)
    ext       v30.8b, v3.8b , v4.8b , #5
    st1       {v20.8b, v21.8b}, [x1], x3        // store dest row2
    sqrshrun  v19.8b, v16.8h, #5        // (sum + 16) >> 5, clip to u8  (c2,row3)
    ld1       {v12.2s, v13.2s}, [x7], x2        // full-pel bytes, row3

    // ---- rows 4 and 5 ----
    ext       v28.8b, v5.8b , v6.8b , #5
    urhadd    v18.16b, v12.16b , v18.16b        // qpel average, row3
    urhadd    v19.16b, v13.16b , v19.16b

    uaddl     v8.8h, v31.8b, v2.8b      // a0 + a5                      (c1,row4)
    st1       {v18.8b, v19.8b}, [x1], x3        // store dest row3
    uaddl     v10.8h, v30.8b, v3.8b     // a0 + a5                      (c2,row4)
    ext       v27.8b, v6.8b , v7.8b , #5
    uaddl     v14.8h, v28.8b, v5.8b     // a0 + a5                      (c1,row5)
    ext       v31.8b, v2.8b , v3.8b , #2
    uaddl     v16.8h, v27.8b, v6.8b     // a0 + a5                      (c2,row5)
    ext       v30.8b, v3.8b , v4.8b , #2
    umlal     v8.8h, v31.8b, v1.8b      // + 20*a2                      (c1,row4)
    ext       v27.8b, v6.8b , v7.8b , #2
    umlal     v10.8h, v30.8b, v1.8b     // + 20*a2                      (c2,row4)
    ext       v28.8b, v5.8b , v6.8b , #2
    umlal     v14.8h, v28.8b, v1.8b     // + 20*a2                      (c1,row5)
    ext       v31.8b, v2.8b , v3.8b , #3
    umlal     v16.8h, v27.8b, v1.8b     // + 20*a2                      (c2,row5)
    ext       v30.8b, v3.8b , v4.8b , #3
    umlal     v8.8h, v31.8b, v1.8b      // + 20*a3                      (c1,row4)
    ext       v28.8b, v5.8b , v6.8b , #3
    umlal     v10.8h, v30.8b, v1.8b     // + 20*a3                      (c2,row4)
    ext       v27.8b, v6.8b , v7.8b , #3
    umlal     v14.8h, v28.8b, v1.8b     // + 20*a3                      (c1,row5)
    ext       v31.8b, v2.8b , v3.8b , #1
    umlal     v16.8h, v27.8b, v1.8b     // + 20*a3                      (c2,row5)
    ext       v30.8b, v3.8b , v4.8b , #1
    umlsl     v8.8h, v31.8b, v0.8b      // - 5*a1                       (c1,row4)
    ext       v28.8b, v5.8b , v6.8b , #1
    umlsl     v10.8h, v30.8b, v0.8b     // - 5*a1                       (c2,row4)
    ext       v27.8b, v6.8b , v7.8b , #1
    umlsl     v14.8h, v28.8b, v0.8b     // - 5*a1                       (c1,row5)
    ext       v31.8b, v2.8b , v3.8b , #4
    umlsl     v16.8h, v27.8b, v0.8b     // - 5*a1                       (c2,row5)
    ext       v30.8b, v3.8b , v4.8b , #4
    umlsl     v8.8h, v31.8b, v0.8b      // - 5*a4                       (c1,row4)
    ext       v28.8b, v5.8b , v6.8b , #4
    umlsl     v10.8h, v30.8b, v0.8b     // - 5*a4                       (c2,row4)
    ext       v27.8b, v6.8b , v7.8b , #4
    umlsl     v14.8h, v28.8b, v0.8b     // - 5*a4                       (c1,row5)
    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   // load row6
    umlsl     v16.8h, v27.8b, v0.8b     // - 5*a4                       (c2,row5)
    ld1       {v12.2s, v13.2s}, [x7], x2        // full-pel bytes, row4
    sqrshrun  v20.8b, v8.8h, #5         // (sum + 16) >> 5, clip to u8  (c1,row4)
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   // load row7
    sqrshrun  v21.8b, v10.8h, #5        // (sum + 16) >> 5, clip to u8  (c2,row4)
    ext       v31.8b, v2.8b , v3.8b , #5
    urhadd    v20.16b, v12.16b , v20.16b        // qpel average, row4
    urhadd    v21.16b, v13.16b , v21.16b

    sqrshrun  v18.8b, v14.8h, #5        // (sum + 16) >> 5, clip to u8  (c1,row5)
    st1       {v20.8b, v21.8b}, [x1], x3        // store dest row4
    ext       v30.8b, v3.8b , v4.8b , #5
    sqrshrun  v19.8b, v16.8h, #5        // (sum + 16) >> 5, clip to u8  (c2,row5)
    ld1       {v12.2s, v13.2s}, [x7], x2        // full-pel bytes, row5

    // ---- rows 6 and 7 ----
    ext       v28.8b, v5.8b , v6.8b , #5
    urhadd    v18.16b, v12.16b , v18.16b        // qpel average, row5
    urhadd    v19.16b, v13.16b , v19.16b

    uaddl     v8.8h, v31.8b, v2.8b      // a0 + a5                      (c1,row6)
    st1       {v18.8b, v19.8b}, [x1], x3        // store dest row5
    uaddl     v10.8h, v30.8b, v3.8b     // a0 + a5                      (c2,row6)
    ext       v27.8b, v6.8b , v7.8b , #5
    uaddl     v14.8h, v28.8b, v5.8b     // a0 + a5                      (c1,row7)
    ext       v31.8b, v2.8b , v3.8b , #2
    uaddl     v16.8h, v27.8b, v6.8b     // a0 + a5                      (c2,row7)
    ext       v30.8b, v3.8b , v4.8b , #2
    umlal     v8.8h, v31.8b, v1.8b      // + 20*a2                      (c1,row6)
    ext       v27.8b, v6.8b , v7.8b , #2
    umlal     v10.8h, v30.8b, v1.8b     // + 20*a2                      (c2,row6)
    ext       v28.8b, v5.8b , v6.8b , #2
    umlal     v14.8h, v28.8b, v1.8b     // + 20*a2                      (c1,row7)
    ext       v31.8b, v2.8b , v3.8b , #3
    umlal     v16.8h, v27.8b, v1.8b     // + 20*a2                      (c2,row7)
    ext       v30.8b, v3.8b , v4.8b , #3
    umlal     v8.8h, v31.8b, v1.8b      // + 20*a3                      (c1,row6)
    ext       v28.8b, v5.8b , v6.8b , #3
    umlal     v10.8h, v30.8b, v1.8b     // + 20*a3                      (c2,row6)
    ext       v27.8b, v6.8b , v7.8b , #3
    umlal     v14.8h, v28.8b, v1.8b     // + 20*a3                      (c1,row7)
    ext       v31.8b, v2.8b , v3.8b , #1
    umlal     v16.8h, v27.8b, v1.8b     // + 20*a3                      (c2,row7)
    ext       v30.8b, v3.8b , v4.8b , #1
    umlsl     v8.8h, v31.8b, v0.8b      // - 5*a1                       (c1,row6)
    ext       v28.8b, v5.8b , v6.8b , #1
    umlsl     v10.8h, v30.8b, v0.8b     // - 5*a1                       (c2,row6)
    ext       v27.8b, v6.8b , v7.8b , #1
    umlsl     v14.8h, v28.8b, v0.8b     // - 5*a1                       (c1,row7)
    ext       v31.8b, v2.8b , v3.8b , #4
    umlsl     v16.8h, v27.8b, v0.8b     // - 5*a1                       (c2,row7)
    ext       v30.8b, v3.8b , v4.8b , #4
    umlsl     v8.8h, v31.8b, v0.8b      // - 5*a4                       (c1,row6)
    ext       v28.8b, v5.8b , v6.8b , #4
    umlsl     v10.8h, v30.8b, v0.8b     // - 5*a4                       (c2,row6)
    ext       v27.8b, v6.8b , v7.8b , #4
    ld1       {v12.2s, v13.2s}, [x7], x2        // full-pel bytes, row6
    sqrshrun  v20.8b, v8.8h, #5         // (sum + 16) >> 5, clip to u8  (c1,row6)
    umlsl     v14.8h, v28.8b, v0.8b     // - 5*a4                       (c1,row7)
    sqrshrun  v21.8b, v10.8h, #5        // (sum + 16) >> 5, clip to u8  (c2,row6)
    umlsl     v16.8h, v27.8b, v0.8b     // - 5*a4                       (c2,row7)
    urhadd    v20.16b, v12.16b , v20.16b        // qpel average, row6
    urhadd    v21.16b, v13.16b , v21.16b

    ld1       {v12.2s, v13.2s}, [x7], x2        // full-pel bytes, row7
    sqrshrun  v18.8b, v14.8h, #5        // (sum + 16) >> 5, clip to u8  (c1,row7)
    st1       {v20.8b, v21.8b}, [x1], x3        // store dest row6
    sqrshrun  v19.8b, v16.8h, #5        // (sum + 16) >> 5, clip to u8  (c2,row7)
    urhadd    v18.16b, v12.16b , v18.16b        // qpel average, row7
    urhadd    v19.16b, v13.16b , v19.16b

    cmp       x14, #1                   // x14 == 1 only after the first pass with ht == 16
    st1       {v18.8b, v19.8b}, [x1], x3        // store dest row7

    beq       loop_16                   // second pass for rows 8..15
    b         end_func

//------------------------------------------------------------------------------
// wd == 8: one 8-pixel column, rows handled in pairs.
// In every pair the v5:v6 window and the v2:v3 window hold consecutive rows;
// for the first pair v5:v6 is row0 and v2:v3 is row1, afterwards v2:v3 is the
// even row and v5:v6 the odd row.
//------------------------------------------------------------------------------
loop_8:
    // ---- rows 0 and 1 ----
    ld1       {v5.8b, v6.8b}, [x0], x2  // load row0
    add       x14, x14, #1              // advance pass counter
    ext       v28.8b, v5.8b , v6.8b , #5
    ld1       {v2.8b, v3.8b}, [x0], x2  // load row1
    ext       v25.8b, v5.8b , v6.8b , #2
    ext       v31.8b, v2.8b , v3.8b , #5
    ext       v24.8b, v5.8b , v6.8b , #3
    ext       v23.8b, v5.8b , v6.8b , #1
    ext       v22.8b, v5.8b , v6.8b , #4
    uaddl     v14.8h, v28.8b, v5.8b     // a0 + a5                      (row0)
    ext       v29.8b, v2.8b , v3.8b , #3
    umlal     v14.8h, v25.8b, v1.8b     // + 20*a2                      (row0)
    umlal     v14.8h, v24.8b, v1.8b     // + 20*a3                      (row0)
    umlsl     v14.8h, v23.8b, v0.8b     // - 5*a1                       (row0)
    umlsl     v14.8h, v22.8b, v0.8b     // - 5*a4                       (row0)
    ext       v30.8b, v2.8b , v3.8b , #2
    uaddl     v8.8h, v31.8b, v2.8b      // a0 + a5                      (row1)
    ext       v27.8b, v2.8b , v3.8b , #1
    ext       v26.8b, v2.8b , v3.8b , #4
    ld1       {v2.8b, v3.8b}, [x0], x2  // load row2
    umlal     v8.8h, v29.8b, v1.8b      // + 20*a3                      (row1)
    umlal     v8.8h, v30.8b, v1.8b      // + 20*a2                      (row1)
    umlsl     v8.8h, v27.8b, v0.8b      // - 5*a1                       (row1)
    umlsl     v8.8h, v26.8b, v0.8b      // - 5*a4                       (row1)
    ld1       {v5.8b, v6.8b}, [x0], x2  // load row3
    sqrshrun  v18.8b, v14.8h, #5        // (sum + 16) >> 5, clip to u8  (row0)

    // ---- rows 2 and 3 ----
    ext       v28.8b, v5.8b , v6.8b , #5
    ext       v25.8b, v5.8b , v6.8b , #2
    ext       v31.8b, v2.8b , v3.8b , #5
    uaddl     v14.8h, v28.8b, v5.8b     // a0 + a5                      (row3)
    ld1       {v12.2s}, [x7], x2        // full-pel bytes, row0
    ld1       {v13.2s}, [x7], x2        // full-pel bytes, row1
    ext       v24.8b, v5.8b , v6.8b , #3
    ext       v23.8b, v5.8b , v6.8b , #1
    sqrshrun  v19.8b, v8.8h, #5         // (sum + 16) >> 5, clip to u8  (row1)
    ext       v22.8b, v5.8b , v6.8b , #4
    ext       v29.8b, v2.8b , v3.8b , #3
    umlal     v14.8h, v25.8b, v1.8b     // + 20*a2                      (row3)
    umlal     v14.8h, v24.8b, v1.8b     // + 20*a3                      (row3)
    umlsl     v14.8h, v23.8b, v0.8b     // - 5*a1                       (row3)
    umlsl     v14.8h, v22.8b, v0.8b     // - 5*a4                       (row3)
    urhadd    v18.16b, v12.16b , v18.16b        // qpel average, row0
    urhadd    v19.16b, v13.16b , v19.16b        // qpel average, row1

    st1       {v18.8b}, [x1], x3        // store dest row0
    st1       {v19.8b}, [x1], x3        // store dest row1
    uaddl     v8.8h, v31.8b, v2.8b      // a0 + a5                      (row2)
    ext       v30.8b, v2.8b , v3.8b , #2
    ext       v27.8b, v2.8b , v3.8b , #1
    ext       v26.8b, v2.8b , v3.8b , #4
    ld1       {v2.8b, v3.8b}, [x0], x2  // load row4 (read even when ht == 4)
    umlal     v8.8h, v29.8b, v1.8b      // + 20*a3                      (row2)
    umlal     v8.8h, v30.8b, v1.8b      // + 20*a2                      (row2)
    umlsl     v8.8h, v27.8b, v0.8b      // - 5*a1                       (row2)
    umlsl     v8.8h, v26.8b, v0.8b      // - 5*a4                       (row2)
    ld1       {v5.8b, v6.8b}, [x0], x2  // load row5 (read even when ht == 4)
    cmp       x4, #4                    // ht == 4 ? flags are consumed by the beq below;
                                        // the NEON instructions in between do not touch them
    sqrshrun  v19.8b, v14.8h, #5        // (sum + 16) >> 5, clip to u8  (row3)
    ld1       {v12.2s}, [x7], x2        // full-pel bytes, row2
    ld1       {v13.2s}, [x7], x2        // full-pel bytes, row3
    ext       v28.8b, v5.8b , v6.8b , #5
    ext       v25.8b, v5.8b , v6.8b , #2
    ext       v31.8b, v2.8b , v3.8b , #5
    uaddl     v14.8h, v28.8b, v5.8b     // a0 + a5                      (row5)
    ext       v24.8b, v5.8b , v6.8b , #3
    sqrshrun  v18.8b, v8.8h, #5         // (sum + 16) >> 5, clip to u8  (row2)
    ext       v22.8b, v5.8b , v6.8b , #4
    ext       v29.8b, v2.8b , v3.8b , #3
    urhadd    v18.16b, v12.16b , v18.16b        // qpel average, row2
    urhadd    v19.16b, v13.16b , v19.16b        // qpel average, row3

    st1       {v18.8b}, [x1], x3        // store dest row2
    ext       v30.8b, v2.8b , v3.8b , #2
    uaddl     v8.8h, v31.8b, v2.8b      // a0 + a5                      (row4)
    st1       {v19.8b}, [x1], x3        // store dest row3
    beq       end_func                  // done when ht == 4

    // ---- rows 4 and 5 ----
    ext       v23.8b, v5.8b , v6.8b , #1
    umlal     v14.8h, v25.8b, v1.8b     // + 20*a2                      (row5)
    umlal     v14.8h, v24.8b, v1.8b     // + 20*a3                      (row5)
    umlsl     v14.8h, v23.8b, v0.8b     // - 5*a1                       (row5)
    umlsl     v14.8h, v22.8b, v0.8b     // - 5*a4                       (row5)
    ext       v27.8b, v2.8b , v3.8b , #1
    ext       v26.8b, v2.8b , v3.8b , #4
    ld1       {v2.8b, v3.8b}, [x0], x2  // load row6
    umlal     v8.8h, v29.8b, v1.8b      // + 20*a3                      (row4)
    umlal     v8.8h, v30.8b, v1.8b      // + 20*a2                      (row4)
    umlsl     v8.8h, v27.8b, v0.8b      // - 5*a1                       (row4)
    umlsl     v8.8h, v26.8b, v0.8b      // - 5*a4                       (row4)
    sqrshrun  v19.8b, v14.8h, #5        // (sum + 16) >> 5, clip to u8  (row5)
    ld1       {v5.8b, v6.8b}, [x0], x2  // load row7
    ext       v31.8b, v2.8b , v3.8b , #5
    ext       v28.8b, v5.8b , v6.8b , #5
    ld1       {v12.2s}, [x7], x2        // full-pel bytes, row4
    ld1       {v13.2s}, [x7], x2        // full-pel bytes, row5
    ext       v25.8b, v5.8b , v6.8b , #2
    uaddl     v14.8h, v28.8b, v5.8b     // a0 + a5                      (row7)
    ext       v24.8b, v5.8b , v6.8b , #3
    ext       v22.8b, v5.8b , v6.8b , #4
    sqrshrun  v18.8b, v8.8h, #5         // (sum + 16) >> 5, clip to u8  (row4)
    ext       v29.8b, v2.8b , v3.8b , #3
    ext       v30.8b, v2.8b , v3.8b , #2
    urhadd    v18.16b, v12.16b , v18.16b        // qpel average, row4
    urhadd    v19.16b, v13.16b , v19.16b        // qpel average, row5

    st1       {v18.8b}, [x1], x3        // store dest row4
    ext       v27.8b, v2.8b , v3.8b , #1
    uaddl     v8.8h, v31.8b, v2.8b      // a0 + a5                      (row6)
    ext       v26.8b, v2.8b , v3.8b , #4
    umlal     v8.8h, v29.8b, v1.8b      // + 20*a3                      (row6)
    umlal     v8.8h, v30.8b, v1.8b      // + 20*a2                      (row6)
    umlsl     v8.8h, v27.8b, v0.8b      // - 5*a1                       (row6)
    umlsl     v8.8h, v26.8b, v0.8b      // - 5*a4                       (row6)

    // ---- rows 6 and 7 ----
    st1       {v19.8b}, [x1], x3        // store dest row5
    ext       v23.8b, v5.8b , v6.8b , #1
    umlal     v14.8h, v25.8b, v1.8b     // + 20*a2                      (row7)
    umlal     v14.8h, v24.8b, v1.8b     // + 20*a3                      (row7)
    umlsl     v14.8h, v23.8b, v0.8b     // - 5*a1                       (row7)
    umlsl     v14.8h, v22.8b, v0.8b     // - 5*a4                       (row7)
    ld1       {v12.2s}, [x7], x2        // full-pel bytes, row6
    ld1       {v13.2s}, [x7], x2        // full-pel bytes, row7
    sqrshrun  v18.8b, v8.8h, #5         // (sum + 16) >> 5, clip to u8  (row6)
    cmp       x14, #1                   // x14 == 1 only after the first pass with ht == 16
    sqrshrun  v19.8b, v14.8h, #5        // (sum + 16) >> 5, clip to u8  (row7)
    urhadd    v18.16b, v12.16b , v18.16b        // qpel average, row6
    urhadd    v19.16b, v13.16b , v19.16b        // qpel average, row7

    st1       {v18.8b}, [x1], x3        // store dest row6
    st1       {v19.8b}, [x1], x3        // store dest row7

    beq       loop_8                    // second pass for rows 8..15

    b         end_func

//------------------------------------------------------------------------------
// wd == 4: same arithmetic as the wd == 8 path, but only the low 4 result
// bytes of each row are stored. Four rows per pass.
//------------------------------------------------------------------------------
loop_4:
    // ---- rows 0 and 1 ----
    ld1       {v5.8b, v6.8b}, [x0], x2  // load row0
    ext       v28.8b, v5.8b , v6.8b , #5
    ld1       {v2.8b, v3.8b}, [x0], x2  // load row1
    ext       v25.8b, v5.8b , v6.8b , #2
    ext       v31.8b, v2.8b , v3.8b , #5
    uaddl     v14.8h, v28.8b, v5.8b     // a0 + a5                      (row0)
    ext       v24.8b, v5.8b , v6.8b , #3
    ext       v23.8b, v5.8b , v6.8b , #1
    ext       v22.8b, v5.8b , v6.8b , #4
    ext       v29.8b, v2.8b , v3.8b , #3
    umlal     v14.8h, v25.8b, v1.8b     // + 20*a2                      (row0)
    umlal     v14.8h, v24.8b, v1.8b     // + 20*a3                      (row0)
    umlsl     v14.8h, v23.8b, v0.8b     // - 5*a1                       (row0)
    umlsl     v14.8h, v22.8b, v0.8b     // - 5*a4                       (row0)
    uaddl     v8.8h, v31.8b, v2.8b      // a0 + a5                      (row1)
    ext       v30.8b, v2.8b , v3.8b , #2
    ld1       {v12.2s}, [x7], x2        // full-pel bytes, row0
    ld1       {v13.2s}, [x7], x2        // full-pel bytes, row1
    ext       v27.8b, v2.8b , v3.8b , #1
    ext       v26.8b, v2.8b , v3.8b , #4
    ld1       {v2.8b, v3.8b}, [x0], x2  // load row2
    umlal     v8.8h, v29.8b, v1.8b      // + 20*a3                      (row1)
    umlal     v8.8h, v30.8b, v1.8b      // + 20*a2                      (row1)
    umlsl     v8.8h, v27.8b, v0.8b      // - 5*a1                       (row1)
    umlsl     v8.8h, v26.8b, v0.8b      // - 5*a4                       (row1)
    ld1       {v5.8b, v6.8b}, [x0], x2  // load row3
    ext       v28.8b, v5.8b , v6.8b , #5
    ext       v25.8b, v5.8b , v6.8b , #2
    sqrshrun  v18.8b, v14.8h, #5        // (sum + 16) >> 5, clip to u8  (row0)
    ext       v31.8b, v2.8b , v3.8b , #5
    ext       v24.8b, v5.8b , v6.8b , #3

    ext       v23.8b, v5.8b , v6.8b , #1
    ext       v22.8b, v5.8b , v6.8b , #4
    ext       v29.8b, v2.8b , v3.8b , #3
    sqrshrun  v19.8b, v8.8h, #5         // (sum + 16) >> 5, clip to u8  (row1)
    ext       v30.8b, v2.8b , v3.8b , #2
    ext       v27.8b, v2.8b , v3.8b , #1

    // ---- rows 2 and 3 ----
    urhadd    v18.16b, v12.16b , v18.16b        // qpel average, row0
    urhadd    v19.16b, v13.16b , v19.16b        // qpel average, row1

    st1       {v18.s}[0], [x1], x3      // store 4 bytes of dest row0
    st1       {v19.s}[0], [x1], x3      // store 4 bytes of dest row1
    uaddl     v14.8h, v28.8b, v5.8b     // a0 + a5                      (row3)
    ext       v26.8b, v2.8b , v3.8b , #4
    ld1       {v12.2s}, [x7], x2        // full-pel bytes, row2
    ld1       {v13.2s}, [x7], x2        // full-pel bytes, row3

    umlal     v14.8h, v25.8b, v1.8b     // + 20*a2                      (row3)
    umlal     v14.8h, v24.8b, v1.8b     // + 20*a3                      (row3)
    umlsl     v14.8h, v23.8b, v0.8b     // - 5*a1                       (row3)
    umlsl     v14.8h, v22.8b, v0.8b     // - 5*a4                       (row3)
    uaddl     v8.8h, v31.8b, v2.8b      // a0 + a5                      (row2)
    umlal     v8.8h, v29.8b, v1.8b      // + 20*a3                      (row2)
    umlal     v8.8h, v30.8b, v1.8b      // + 20*a2                      (row2)
    umlsl     v8.8h, v27.8b, v0.8b      // - 5*a1                       (row2)
    umlsl     v8.8h, v26.8b, v0.8b      // - 5*a4                       (row2)
    sqrshrun  v19.8b, v14.8h, #5        // (sum + 16) >> 5, clip to u8  (row3)
    sqrshrun  v18.8b, v8.8h, #5         // (sum + 16) >> 5, clip to u8  (row2)
    urhadd    v18.16b, v12.16b , v18.16b        // qpel average, row2
    urhadd    v19.16b, v13.16b , v19.16b        // qpel average, row3

    st1       {v18.s}[0], [x1], x3      // store 4 bytes of dest row2
    subs      x4, x4, #8                // x4 becomes 0 only on the first pass with ht == 8
    st1       {v19.s}[0], [x1], x3      // store 4 bytes of dest row3

    beq       loop_4                    // second pass for rows 4..7

end_func:

    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
595
596
597
598