1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21//******************************************************************************
22//* @file
23//*  ih264_inter_pred_luma_vert_qpel_av8.s
24//*
25//* @brief
26//*  Contains function definitions for inter prediction vertical quarter pel interpolation.
27//*
28//* @author
29//*  Mohit
30//*
31//* @par List of Functions:
32//*
33//*  - ih264_inter_pred_luma_vert_qpel_av8()
34//*
35//* @remarks
36//*  None
37//*
38//*******************************************************************************
39//*/
40
41///* All the functions here are replicated from ih264_inter_pred_filters.c
42//
43
44///**
45///**
46//*******************************************************************************
47//*
48//* @brief
49//*     Quarter pel interprediction luma filter for vertical input
50//*
51//* @par Description:
52//* Applies a 6 tap vertical filter, clips the result to 8 bits and averages it
//* (with rounding) with the nearest full-pel row to obtain the quarter-pel sample.
53//* sec 8.4.2.2.1 titled "Luma sample interpolation process"
54//*
55//* @param[in] pu1_src
56//*  UWORD8 pointer to the source
57//*
58//* @param[out] pu1_dst
59//*  UWORD8 pointer to the destination
60//*
61//* @param[in] src_strd
62//*  integer source stride
63//*
64//* @param[in] dst_strd
65//*  integer destination stride
66//*
67//* @param[in] ht
68//*  integer height of the array
69//*
70//* @param[in] wd
71//*  integer width of the array
72//*
73//* @param[in] pu1_tmp: temporary buffer: UNUSED in this function
74//*
75//* @param[in] dydx: x and y reference offset for qpel calculations.
76//* @returns
77//*
78// @remarks
79//*  None
80//*
81//*******************************************************************************
82//*/
83
84//void ih264_inter_pred_luma_vert_qpel_av8 (
85//                            UWORD8 *pu1_src,
86//                            UWORD8 *pu1_dst,
87//                            WORD32 src_strd,
88//                            WORD32 dst_strd,
89//                            WORD32 ht,
90//                            WORD32 wd,
91//                              UWORD8* pu1_tmp,
92//                             UWORD32 dydx)
93
94//**************Variables Vs Registers*****************************************
95//    x0 => *pu1_src
96//    x1 => *pu1_dst
97//    w2 =>  src_strd
98//    w3 =>  dst_strd
99//    w4 =>  ht
100//    w5 =>  wd
101//    w7 =>  dydx
102
103.text                                   // emit into the code section
104.p2align 2                              // align to 2^2 = 4 bytes
105.include "ih264_neon_macros.s"          // project macro file; push_v_regs, pop_v_regs and swp used below are not defined in this file
106
107
108
109    .global ih264_inter_pred_luma_vert_qpel_av8 // export the entry point
110
//------------------------------------------------------------------------------
// ih264_inter_pred_luma_vert_qpel_av8
//
// Vertical quarter-pel luma interpolation. Each output row n is
//   CLIP_U8((r[n] - 5*r[n+1] + 20*r[n+2] + 20*r[n+3] - 5*r[n+4] + r[n+5] + 16) >> 5)
// rounding-averaged (urhadd) with one row read through x7, where r[] are
// consecutive source rows starting at pu1_src - 2*src_strd.
//
// Register use after set-up:
//   x0      - source read pointer, post-incremented by src_strd per row
//   x1      - destination write pointer, post-incremented by dst_strd per row
//   x2, x3  - src_strd, dst_strd sign-extended to 64 bits
//   x4, x5  - ht, wd sign-extended to 64 bits
//   x7      - pointer to the rows averaged with the filter output:
//             pu1_src + ((dydx & 12) >> 3) * src_strd
//   x14     - ht - 16 plus the number of 8-row passes started (loop control)
//   v22/v24 - 20 / 5 in every halfword lane (filter weights)
//
// wd == 8 and wd == 4 branch to loop_8_start / loop_4_start; any other width
// falls through to the 16-column path below. In the row comments, rN is the
// N-th row loaded through x0 for the current pass (the six rows already held
// on loop entry are r0-r5); lo/hi are columns 0-7 / 8-15.
//------------------------------------------------------------------------------
111ih264_inter_pred_luma_vert_qpel_av8:
112
113    push_v_regs                         // macro from the included file; presumably saves the callee-saved SIMD registers - confirm there
114    stp       x19, x20, [sp, #-16]!     // saved here and restored at end_func; not referenced anywhere else in this function
115    sxtw      x2, w2                    // src_strd -> 64 bit
116    sxtw      x3, w3                    // dst_strd -> 64 bit
117    sxtw      x4, w4                    // ht -> 64 bit
118    sxtw      x5, w5                    // wd -> 64 bit
119
120
121    and       x7, x7, #12               // keep bits 3:2 of dydx (the y-offset bits)
122    lsr       x7, x7, #3                // (dydx & 12) >> 3 = y_offset >> 1
123    mul       x7, x2, x7                // (y_offset >> 1) * src_strd
124    add       x7, x0, x7                //pu1_src + (y_offset>>1)*src_strd
125    sub       x14, x4, #16              // x14 = ht - 16, loop-control counter
126    movi      v22.8h, #20               // v22 = 20: weight of the two centre taps
127    sub       x0, x0, x2, lsl #1        //pu1_src-2*src_strd
128    subs      x12, x5, #8               //if wd=8 branch to loop_8
129    movi      v24.8h, #5                // v24 = 5: weight subtracted for the two inner taps
130    beq       loop_8_start              // taken on the flags of the subs above (wd == 8)
131
132    subs      x12, x5, #4               //if wd=4 branch to loop_4
133    beq       loop_4_start
134
135
136    ld1       {v0.2s, v1.2s}, [x0], x2  // r0: v0 = lo, v1 = hi
137    ld1       {v2.2s, v3.2s}, [x0], x2  // r1
138    ld1       {v4.2s, v5.2s}, [x0], x2  // r2
139    ld1       {v6.2s, v7.2s}, [x0], x2  // r3
140    add       x14, x14, #1              // x14 = ht - 15 while the first pass runs
141    ld1       {v8.2s, v9.2s}, [x0], x2  // r4
142    uaddl     v12.8h, v4.8b, v6.8b      // row 0 lo: centre pair r2 + r3
143    ld1       {v10.2s, v11.2s}, [x0], x2 // r5
144
145loop_16:                                // 16-column path: eight output rows per pass
146
147    uaddl     v14.8h, v0.8b, v10.8b     // row 0 lo: outer pair r0 + r5
148    uaddl     v16.8h, v2.8b, v8.8b      // row 0 lo: inner pair r1 + r4
149    mla       v14.8h, v12.8h , v22.8h   // row 0 lo += 20 * centre
150    uaddl     v20.8h, v1.8b, v11.8b     // row 0 hi: outer pair r0 + r5
151    uaddl     v18.8h, v5.8b, v7.8b      // row 0 hi: centre pair r2 + r3
152    mla       v20.8h, v18.8h , v22.8h   // row 0 hi += 20 * centre
153    ld1       {v0.2s, v1.2s}, [x0], x2  // r6 (overwrites r0)
154    uaddl     v26.8h, v3.8b, v9.8b      // row 0 hi: inner pair r1 + r4
155    uaddl     v12.8h, v6.8b, v8.8b      // row 1 lo: centre pair r3 + r4
156    mls       v14.8h, v16.8h , v24.8h   // row 0 lo -= 5 * inner
157    uaddl     v16.8h, v2.8b, v0.8b      // row 1 lo: outer pair r1 + r6
158    uaddl     v18.8h, v4.8b, v10.8b     // row 1 lo: inner pair r2 + r5
159    mla       v16.8h, v12.8h , v22.8h   // row 1 lo += 20 * centre
160    mls       v20.8h, v26.8h , v24.8h   // row 0 hi -= 5 * inner
161    uaddl     v26.8h, v5.8b, v11.8b     // row 1 hi: inner pair r2 + r5
162    uaddl     v12.8h, v7.8b, v9.8b      // row 1 hi: centre pair r3 + r4
163    sqrshrun  v30.8b, v14.8h, #5        // row 0 lo = CLIP_U8((sum + 16) >> 5)
164    uaddl     v14.8h, v3.8b, v1.8b      // row 1 hi: outer pair r1 + r6
165    ld1       {v2.2s, v3.2s}, [x0], x2  // r7 (overwrites r1)
166    mla       v14.8h, v12.8h , v22.8h   // row 1 hi += 20 * centre
167    mls       v16.8h, v18.8h , v24.8h   // row 1 lo -= 5 * inner
168    sqrshrun  v31.8b, v20.8h, #5        // row 0 hi = CLIP_U8((sum + 16) >> 5)
169    ld1       {v20.2s, v21.2s}, [x7], x2 // row read through x7 to average with output row 0
170    urhadd    v30.16b, v20.16b , v30.16b // rounding average -> qpel value, lo
171    urhadd    v31.16b, v21.16b , v31.16b // rounding average -> qpel value, hi
172    uaddl     v18.8h, v4.8b, v2.8b      // row 2 lo: outer pair r2 + r7
173    uaddl     v12.8h, v8.8b, v10.8b     // row 2 lo: centre pair r4 + r5
174    st1       {v30.2s, v31.2s}, [x1], x3 // store row 0
175    mla       v18.8h, v12.8h , v22.8h   // row 2 lo += 20 * centre
176    uaddl     v20.8h, v6.8b, v0.8b      // row 2 lo: inner pair r3 + r6
177    mls       v14.8h, v26.8h , v24.8h   // row 1 hi -= 5 * inner
178    sqrshrun  v30.8b, v16.8h, #5        // row 1 lo, rounded and clipped
179    uaddl     v12.8h, v9.8b, v11.8b     // row 2 hi: centre pair r4 + r5
180    uaddl     v16.8h, v5.8b, v3.8b      // row 2 hi: outer pair r2 + r7
181    uaddl     v26.8h, v7.8b, v1.8b      // row 2 hi: inner pair r3 + r6
182    mla       v16.8h, v12.8h , v22.8h   // row 2 hi += 20 * centre
183    mls       v18.8h, v20.8h , v24.8h   // row 2 lo -= 5 * inner
184    ld1       {v4.2s, v5.2s}, [x0], x2  // r8 (overwrites r2)
185    sqrshrun  v31.8b, v14.8h, #5        // row 1 hi, rounded and clipped
186    ld1       {v14.2s, v15.2s}, [x7], x2 // row read through x7 to average with output row 1
187    uaddl     v12.8h, v10.8b, v0.8b     // row 3 lo: centre pair r5 + r6
188    urhadd    v30.16b, v14.16b , v30.16b // rounding average -> qpel value, lo
189    urhadd    v31.16b, v15.16b , v31.16b // rounding average -> qpel value, hi
190    uaddl     v14.8h, v6.8b, v4.8b      // row 3 lo: outer pair r3 + r8
191    uaddl     v20.8h, v8.8b, v2.8b      // row 3 lo: inner pair r4 + r7
192    mla       v14.8h, v12.8h , v22.8h   // row 3 lo += 20 * centre
193    mls       v16.8h, v26.8h , v24.8h   // row 2 hi -= 5 * inner
194    st1       {v30.2s, v31.2s}, [x1], x3 //store row 1
195    sqrshrun  v30.8b, v18.8h, #5        // row 2 lo, rounded and clipped
196    uaddl     v18.8h, v7.8b, v5.8b      // row 3 hi: outer pair r3 + r8
197    uaddl     v12.8h, v11.8b, v1.8b     // row 3 hi: centre pair r5 + r6
198    mla       v18.8h, v12.8h , v22.8h   // row 3 hi += 20 * centre
199    uaddl     v26.8h, v9.8b, v3.8b      // row 3 hi: inner pair r4 + r7
200    mls       v14.8h, v20.8h , v24.8h   // row 3 lo -= 5 * inner
201    ld1       {v6.2s, v7.2s}, [x0], x2  // r9 (overwrites r3)
202    sqrshrun  v31.8b, v16.8h, #5        // row 2 hi, rounded and clipped
203    ld1       {v16.2s, v17.2s}, [x7], x2 // row read through x7 to average with output row 2
204    mls       v18.8h, v26.8h , v24.8h   // row 3 hi -= 5 * inner
205    urhadd    v30.16b, v16.16b , v30.16b // rounding average -> qpel value, lo
206    urhadd    v31.16b, v17.16b , v31.16b // rounding average -> qpel value, hi
207    uaddl     v12.8h, v0.8b, v2.8b      // row 4 lo: centre pair r6 + r7
208    st1       {v30.2s, v31.2s}, [x1], x3 //store row 2
209    uaddl     v16.8h, v10.8b, v4.8b     // row 4 lo: inner pair r5 + r8
210    uaddl     v20.8h, v9.8b, v7.8b      // row 4 hi: outer pair r4 + r9
211    sqrshrun  v30.8b, v14.8h, #5        // row 3 lo, rounded and clipped
212    uaddl     v26.8h, v5.8b, v11.8b     // row 4 hi: inner pair r5 + r8
213    uaddl     v14.8h, v8.8b, v6.8b      // row 4 lo: outer pair r4 + r9
214    sqrshrun  v31.8b, v18.8h, #5        // row 3 hi, rounded and clipped
215    ld1       {v18.2s, v19.2s}, [x7], x2 // row read through x7 to average with output row 3
216    mla       v14.8h, v12.8h , v22.8h   // row 4 lo += 20 * centre
217    urhadd    v30.16b, v18.16b , v30.16b // rounding average -> qpel value, lo
218    urhadd    v31.16b, v19.16b , v31.16b // rounding average -> qpel value, hi
219    uaddl     v18.8h, v1.8b, v3.8b      // row 4 hi: centre pair r6 + r7
220    st1       {v30.2s, v31.2s}, [x1], x3 //store row 3
221    // 4 rows processed
222    mla       v20.8h, v18.8h , v22.8h   // row 4 hi += 20 * centre
223    ld1       {v8.2s, v9.2s}, [x0], x2  // r10 (overwrites r4)
224    uaddl     v12.8h, v2.8b, v4.8b      // row 5 lo: centre pair r7 + r8
225    uaddl     v18.8h, v3.8b, v5.8b      // row 5 hi: centre pair r7 + r8
226    mls       v14.8h, v16.8h , v24.8h   // row 4 lo -= 5 * inner
227    uaddl     v28.8h, v9.8b, v11.8b     // row 5 hi: outer pair r5 + r10
228    uaddl     v16.8h, v6.8b, v0.8b      // row 5 lo: inner pair r6 + r9
229    mla       v28.8h, v18.8h , v22.8h   // row 5 hi += 20 * centre
230    mls       v20.8h, v26.8h , v24.8h   // row 4 hi -= 5 * inner
231    uaddl     v26.8h, v1.8b, v7.8b      // row 5 hi: inner pair r6 + r9
232    uaddl     v18.8h, v5.8b, v7.8b      // row 6 hi: centre pair r8 + r9
233    sqrshrun  v30.8b, v14.8h, #5        // row 4 lo, rounded and clipped
234    uaddl     v14.8h, v8.8b, v10.8b     // row 5 lo: outer pair r5 + r10
235    sqrshrun  v31.8b, v20.8h, #5        // row 4 hi, rounded and clipped
236    ld1       {v20.2s, v21.2s}, [x7], x2 // row read through x7 to average with output row 4
237    ld1       {v10.2s, v11.2s}, [x0], x2 // r11 (overwrites r5)
238    urhadd    v30.16b, v20.16b , v30.16b // rounding average -> qpel value, lo
239    urhadd    v31.16b, v21.16b , v31.16b // rounding average -> qpel value, hi
240    mls       v28.8h, v26.8h , v24.8h   // row 5 hi -= 5 * inner
241    st1       {v30.2s, v31.2s}, [x1], x3 //  store row 4
242    mla       v14.8h, v12.8h , v22.8h   // row 5 lo += 20 * centre
243    uaddl     v20.8h, v11.8b, v1.8b     // row 6 hi: outer pair r6 + r11
244    uaddl     v26.8h, v3.8b, v9.8b      // row 6 hi: inner pair r7 + r10
245    mla       v20.8h, v18.8h , v22.8h   // row 6 hi += 20 * centre
246    uaddl     v12.8h, v6.8b, v4.8b      // row 6 lo: centre pair r8 + r9
247    uaddl     v18.8h, v7.8b, v9.8b      // row 7 hi: centre pair r9 + r10
248    sqrshrun  v31.8b, v28.8h, #5        // row 5 hi, rounded and clipped
249    mls       v14.8h, v16.8h , v24.8h   // row 5 lo -= 5 * inner
250    uaddl     v16.8h, v8.8b, v2.8b      // row 6 lo: inner pair r7 + r10
251    sqrshrun  v30.8b, v14.8h, #5        // row 5 lo, rounded and clipped
252    ld1       {v14.2s, v15.2s}, [x7], x2 // row read through x7 to average with output row 5
253    mls       v20.8h, v26.8h , v24.8h   // row 6 hi -= 5 * inner
254    urhadd    v30.16b, v14.16b , v30.16b // rounding average -> qpel value, lo
255    urhadd    v31.16b, v15.16b , v31.16b // rounding average -> qpel value, hi
256    uaddl     v14.8h, v10.8b, v0.8b     // row 6 lo: outer pair r6 + r11
257    st1       {v30.2s, v31.2s}, [x1], x3 //  store row 5
258    mla       v14.8h, v12.8h , v22.8h   // row 6 lo += 20 * centre
259    ld1       {v0.2s, v1.2s}, [x0], x2  // r12 (overwrites r6)
260    uaddl     v26.8h, v5.8b, v11.8b     // row 7 hi: inner pair r8 + r11
261    uaddl     v12.8h, v8.8b, v6.8b      // row 7 lo: centre pair r9 + r10
262    uaddl     v28.8h, v0.8b, v2.8b      // row 7 lo: outer pair r7 + r12
263    sqrshrun  v31.8b, v20.8h, #5        // row 6 hi, rounded and clipped
264    mla       v28.8h, v12.8h , v22.8h   // row 7 lo += 20 * centre
265    uaddl     v20.8h, v1.8b, v3.8b      // row 7 hi: outer pair r7 + r12
266    mls       v14.8h, v16.8h , v24.8h   // row 6 lo -= 5 * inner
267    mla       v20.8h, v18.8h , v22.8h   // row 7 hi += 20 * centre
268    uaddl     v16.8h, v10.8b, v4.8b     // row 7 lo: inner pair r8 + r11
269    sqrshrun  v30.8b, v14.8h, #5        // row 6 lo, rounded and clipped
270    ld1       {v14.2s, v15.2s}, [x7], x2 // row read through x7 to average with output row 6
271    mov       v2.8b, v6.8b              // rotate for the next pass: v2 = r9 lo
272    mov       v3.8b, v7.8b              // rotate for the next pass: v3 = r9 hi
273    urhadd    v30.16b, v14.16b , v30.16b // rounding average -> qpel value, lo
274    urhadd    v31.16b, v15.16b , v31.16b // rounding average -> qpel value, hi
275
276    mls       v28.8h, v16.8h , v24.8h   // row 7 lo -= 5 * inner
277    st1       {v30.2s, v31.2s}, [x1], x3 //  store row 6
278    sqrshrun  v30.8b, v28.8h, #5        // row 7 lo, rounded and clipped
279    swp       v0.8b, v4.8b              // swp is a macro from the included file; assuming it exchanges its operands: v0 = r8 lo, v4 = r12 lo
280    swp       v1.8b, v5.8b              // same for the hi halves: v1 = r8 hi, v5 = r12 hi
281
282    mls       v20.8h, v26.8h , v24.8h   // row 7 hi -= 5 * inner
283    mov       v6.8b, v10.8b             // rotate for the next pass: v6 = r11 lo
284    mov       v7.8b, v11.8b             // rotate for the next pass: v7 = r11 hi
285    subs      x12, x14, #1              // Z is set only when x14 == 1, i.e. first pass with ht == 16
286    swp       v4.8b, v8.8b              // assuming swp exchanges its operands: v4 = r10 lo, v8 = r12 lo
287    swp       v5.8b, v9.8b              // same for the hi halves: v5 = r10 hi, v9 = r12 hi
288    sqrshrun  v31.8b, v20.8h, #5        // row 7 hi, rounded and clipped
289    ld1       {v20.2s, v21.2s}, [x7], x2 // row read through x7 to average with output row 7
290    urhadd    v30.16b, v20.16b , v30.16b // rounding average -> qpel value, lo
291    urhadd    v31.16b, v21.16b , v31.16b // rounding average -> qpel value, hi
292    st1       {v30.2s, v31.2s}, [x1], x3 //  store row 7
293    bne       end_func                  // leave unless x14 was 1 (first pass with ht == 16)
294    add       x14, x14, #1              // x14 becomes 2, so the test after the second pass exits
295    ld1       {v10.2s, v11.2s}, [x0], x2 // r13: together with the rotated registers this is r0-r5 of the next pass
296    uaddl     v12.8h, v4.8b, v6.8b      // centre pair for row 0 lo of the next pass
297
298    b         loop_16                   // second 8-row pass for ht == 16
299
//------------------------------------------------------------------------------
// 8-column path (entered when wd == 8). Same 6-tap vertical filter as the
// 16-column path, one 64-bit register per source row. rN is the N-th row loaded
// through x0 for the current pass (r0-r5 are held in v0-v5 on loop entry).
// Each pass writes four rows, exits if ht == 4, otherwise writes four more and
// repeats once when ht == 16.
//------------------------------------------------------------------------------
300loop_8_start:
301//// Load the six rows needed for the first output row
302
303    ld1       {v0.2s}, [x0], x2         // r0
304    ld1       {v1.2s}, [x0], x2         // r1
305    ld1       {v2.2s}, [x0], x2         // r2
306    ld1       {v3.2s}, [x0], x2         // r3
307    add       x14, x14, #1              // x14 = ht - 15 while the first pass runs
308    ld1       {v4.2s}, [x0], x2         // r4
309    ld1       {v5.2s}, [x0], x2         // r5
310
311loop_8:
312                                        // eight output rows per pass
313    uaddl     v6.8h, v2.8b, v3.8b       // row 0: centre pair r2 + r3
314    uaddl     v8.8h, v0.8b, v5.8b       // row 0: outer pair r0 + r5
315    uaddl     v10.8h, v1.8b, v4.8b      // row 0: inner pair r1 + r4
316    mla       v8.8h, v6.8h , v22.8h     // row 0 += 20 * centre
317    ld1       {v6.2s}, [x0], x2         // r6
318    uaddl     v14.8h, v3.8b, v4.8b      // row 1: centre pair r3 + r4
319    uaddl     v16.8h, v1.8b, v6.8b      // row 1: outer pair r1 + r6
320    uaddl     v18.8h, v2.8b, v5.8b      // row 1: inner pair r2 + r5
321    mls       v8.8h, v10.8h , v24.8h    // row 0 -= 5 * inner
322    mla       v16.8h, v14.8h , v22.8h   // row 1 += 20 * centre
323    ld1       {v7.2s}, [x0], x2         // r7
324    uaddl     v20.8h, v4.8b, v5.8b      // row 2: centre pair r4 + r5
325    uaddl     v12.8h, v2.8b, v7.8b      // row 2: outer pair r2 + r7
326    uaddl     v10.8h, v3.8b, v6.8b      // row 2: inner pair r3 + r6
327    mls       v16.8h, v18.8h , v24.8h   // row 1 -= 5 * inner
328    sqrshrun  v26.8b, v8.8h, #5         // row 0 = CLIP_U8( (sum + 16) >> 5)
329    mla       v12.8h, v20.8h , v22.8h   // row 2 += 20 * centre
330    ld1       {v8.2s}, [x7], x2         // row read through x7 to average with output row 0
331    ld1       {v9.2s}, [x7], x2         // row read through x7 to average with output row 1
332    ld1       {v0.2s}, [x0], x2         // r8 (overwrites r0)
333    uaddl     v14.8h, v5.8b, v6.8b      // row 3: centre pair r5 + r6
334    sqrshrun  v27.8b, v16.8h, #5        // row 1, rounded and clipped
335    urhadd    v26.16b, v8.16b , v26.16b // rounding average -> qpel value, row 0
336    urhadd    v27.16b, v9.16b , v27.16b // rounding average -> qpel value, row 1
337
338    uaddl     v20.8h, v3.8b, v0.8b      // row 3: outer pair r3 + r8
339    mls       v12.8h, v10.8h , v24.8h   // row 2 -= 5 * inner
340    st1       {v26.2s}, [x1], x3        // store row 0
341    uaddl     v18.8h, v4.8b, v7.8b      // row 3: inner pair r4 + r7
342    mla       v20.8h, v14.8h , v22.8h   // row 3 += 20 * centre
343    st1       {v27.2s}, [x1], x3        // store row 1
344    sqrshrun  v28.8b, v12.8h, #5        // row 2, rounded and clipped
345    mls       v20.8h, v18.8h , v24.8h   // row 3 -= 5 * inner
346    ld1       {v12.2s}, [x7], x2        // row read through x7 to average with output row 2
347    ld1       {v13.2s}, [x7], x2        // row read through x7 to average with output row 3
348    ld1       {v1.2s}, [x0], x2         // r9 (overwrites r1)
349    sqrshrun  v29.8b, v20.8h, #5        // row 3, rounded and clipped
350    subs      x9, x4, #4                // Z set when ht == 4
351    urhadd    v28.16b, v12.16b , v28.16b // rounding average -> qpel value, row 2
352    urhadd    v29.16b, v13.16b , v29.16b // rounding average -> qpel value, row 3
353    st1       {v28.2s}, [x1], x3        //store row 2
354    st1       {v29.2s}, [x1], x3        //store row 3
355    beq       end_func                  // Branch if height==4
356    uaddl     v14.8h, v6.8b, v7.8b      // row 4: centre pair r6 + r7
357    uaddl     v16.8h, v0.8b, v5.8b      // row 4: inner pair r5 + r8
358    uaddl     v18.8h, v1.8b, v4.8b      // row 4: outer pair r4 + r9
359    mla       v18.8h, v14.8h , v22.8h   // row 4 += 20 * centre
360    ld1       {v2.2s}, [x0], x2         // r10 (overwrites r2)
361    mls       v18.8h, v16.8h , v24.8h   // row 4 -= 5 * inner
362    uaddl     v8.8h, v0.8b, v7.8b       // row 5: centre pair r7 + r8
363    uaddl     v10.8h, v1.8b, v6.8b      // row 5: inner pair r6 + r9
364    uaddl     v12.8h, v2.8b, v5.8b      // row 5: outer pair r5 + r10
365    sqrshrun  v26.8b, v18.8h, #5        // row 4, rounded and clipped
366    mla       v12.8h, v8.8h , v22.8h    // row 5 += 20 * centre
367    ld1       {v18.2s}, [x7], x2        // row read through x7 to average with output row 4
368    ld1       {v19.2s}, [x7], x2        // row read through x7 to average with output row 5
369    ld1       {v3.2s}, [x0], x2         // r11 (overwrites r3)
370    mls       v12.8h, v10.8h , v24.8h   // row 5 -= 5 * inner
371    sqrshrun  v27.8b, v12.8h, #5        // row 5, rounded and clipped
372    urhadd    v26.16b, v18.16b , v26.16b // rounding average -> qpel value, row 4
373    urhadd    v27.16b, v19.16b , v27.16b // rounding average -> qpel value, row 5
374
375    st1       {v26.2s}, [x1], x3        // store row 4
376    st1       {v27.2s}, [x1], x3        // store row 5
377    uaddl     v14.8h, v0.8b, v1.8b      // row 6: centre pair r8 + r9
378    uaddl     v16.8h, v2.8b, v7.8b      // row 6: inner pair r7 + r10
379    uaddl     v18.8h, v3.8b, v6.8b      // row 6: outer pair r6 + r11
380    mla       v18.8h, v14.8h , v22.8h   // row 6 += 20 * centre
381    ld1       {v4.2s}, [x0], x2         // r12 (overwrites r4)
382    mls       v18.8h, v16.8h , v24.8h   // row 6 -= 5 * inner
383    uaddl     v8.8h, v2.8b, v1.8b       // row 7: centre pair r9 + r10
384    uaddl     v10.8h, v3.8b, v0.8b      // row 7: inner pair r8 + r11
385    uaddl     v12.8h, v4.8b, v7.8b      // row 7: outer pair r7 + r12
386    sqrshrun  v26.8b, v18.8h, #5        // row 6, rounded and clipped
387    mla       v12.8h, v8.8h , v22.8h    // row 7 += 20 * centre
388    ld1       {v18.2s}, [x7], x2        // row read through x7 to average with output row 6
389    ld1       {v19.2s}, [x7], x2        // row read through x7 to average with output row 7
390    ld1       {v5.2s}, [x0], x2         // r13 (overwrites r5); v0-v5 now hold r8-r13, the r0-r5 of a second pass
391    mls       v12.8h, v10.8h , v24.8h   // row 7 -= 5 * inner
392    sqrshrun  v27.8b, v12.8h, #5        // row 7, rounded and clipped
393    urhadd    v26.16b, v18.16b , v26.16b // rounding average -> qpel value, row 6
394    urhadd    v27.16b, v19.16b , v27.16b // rounding average -> qpel value, row 7
395
396    subs      x12, x14, #1              // Z is set only when x14 == 1, i.e. first pass with ht == 16
397    st1       {v26.2s}, [x1], x3        // store row 6
398    st1       {v27.2s}, [x1], x3        // store row 7
399    add       x14, x14, #1              // plain add: leaves the flags of the subs above intact
400    beq       loop_8                    // second 8-row pass only on the first pass with ht == 16
401
402    b         end_func
403
404
//------------------------------------------------------------------------------
// 4-column path (entered when wd == 4). Same 6-tap vertical filter as the wider
// paths; every source row and every averaging row is loaded as a single 32-bit
// lane and every output row is stored as a single 32-bit lane, so only byte
// lanes 0-3 of any vector are meaningful - the remaining lanes hold don't-care
// values that are computed but never stored.
// rN is the N-th row loaded through x0. Writes four rows, exits if ht == 4,
// otherwise writes four more and falls through to end_func.
//
// Row r6 is loaded with a 32-bit lane load like every other row here; an
// earlier 64-bit load read four bytes beyond the 4-pixel row.
//------------------------------------------------------------------------------
405loop_4_start:
406//// Load the six rows needed for the first output row
407
408
409    ld1       {v0.s}[0], [x0], x2       // r0
410    ld1       {v1.s}[0], [x0], x2       // r1
411    ld1       {v2.s}[0], [x0], x2       // r2
412    ld1       {v3.s}[0], [x0], x2       // r3
413    ld1       {v4.s}[0], [x0], x2       // r4
414    ld1       {v5.s}[0], [x0], x2       // r5
415
416    uaddl     v6.8h, v2.8b, v3.8b       // row 0: centre pair r2 + r3
417    uaddl     v8.8h, v0.8b, v5.8b       // row 0: outer pair r0 + r5
418    uaddl     v10.8h, v1.8b, v4.8b      // row 0: inner pair r1 + r4
419    mla       v8.8h, v6.8h , v22.8h     // row 0 += 20 * centre
420    ld1       {v6.s}[0], [x0], x2       // r6: 4 bytes only, lanes above byte 3 keep don't-care data
421    uaddl     v14.8h, v3.8b, v4.8b      // row 1: centre pair r3 + r4
422    uaddl     v16.8h, v1.8b, v6.8b      // row 1: outer pair r1 + r6
423    uaddl     v18.8h, v2.8b, v5.8b      // row 1: inner pair r2 + r5
424    mls       v8.8h, v10.8h , v24.8h    // row 0 -= 5 * inner
425    ld1       {v7.s}[0], [x0], x2       // r7
426    mla       v16.8h, v14.8h , v22.8h   // row 1 += 20 * centre
427    uaddl     v20.8h, v4.8b, v5.8b      // row 2: centre pair r4 + r5
428    uaddl     v12.8h, v2.8b, v7.8b      // row 2: outer pair r2 + r7
429    uaddl     v10.8h, v3.8b, v6.8b      // row 2: inner pair r3 + r6
430    mls       v16.8h, v18.8h , v24.8h   // row 1 -= 5 * inner
431    sqrshrun  v26.8b, v8.8h, #5         // row 0 = CLIP_U8( (sum + 16) >> 5)
432    ld1       {v8.s}[0], [x7], x2       // row read through x7 to average with output row 0
433    ld1       {v9.s}[0], [x7], x2       // row read through x7 to average with output row 1
434    mla       v12.8h, v20.8h , v22.8h   // row 2 += 20 * centre
435    ld1       {v0.s}[0], [x0], x2       // r8 (overwrites r0)
436    uaddl     v14.8h, v5.8b, v6.8b      // row 3: centre pair r5 + r6
437    sqrshrun  v27.8b, v16.8h, #5        // row 1, rounded and clipped
438    uaddl     v20.8h, v3.8b, v0.8b      // row 3: outer pair r3 + r8
439    urhadd    v26.16b, v26.16b , v8.16b // rounding average -> qpel value, row 0
440    urhadd    v27.16b, v27.16b , v9.16b // rounding average -> qpel value, row 1
441
442    mls       v12.8h, v10.8h , v24.8h   // row 2 -= 5 * inner
443    st1       {v26.s}[0], [x1], x3      // store row 0
444    uaddl     v18.8h, v4.8b, v7.8b      // row 3: inner pair r4 + r7
445    mla       v20.8h, v14.8h , v22.8h   // row 3 += 20 * centre
446    st1       {v27.s}[0], [x1], x3      // store row 1
447    sqrshrun  v28.8b, v12.8h, #5        // row 2, rounded and clipped
448    ld1       {v12.s}[0], [x7], x2      // row read through x7 to average with output row 2
449    ld1       {v13.s}[0], [x7], x2      // row read through x7 to average with output row 3
450
451    mls       v20.8h, v18.8h , v24.8h   // row 3 -= 5 * inner
452    ld1       {v1.s}[0], [x0], x2       // r9 (overwrites r1)
453    sqrshrun  v29.8b, v20.8h, #5        // row 3, rounded and clipped
454    urhadd    v28.16b, v12.16b , v28.16b // rounding average -> qpel value, row 2
455    urhadd    v29.16b, v13.16b , v29.16b // rounding average -> qpel value, row 3
456
457    st1       {v28.s}[0], [x1], x3      //store row 2
458    st1       {v29.s}[0], [x1], x3      //store row 3
459
460    subs      x9, x4, #4                // Z set when ht == 4
461    beq       end_func                  // Branch if height==4
462
463
464    uaddl     v14.8h, v6.8b, v7.8b      // row 4: centre pair r6 + r7
465    uaddl     v16.8h, v0.8b, v5.8b      // row 4: inner pair r5 + r8
466    uaddl     v18.8h, v1.8b, v4.8b      // row 4: outer pair r4 + r9
467    mla       v18.8h, v14.8h , v22.8h   // row 4 += 20 * centre
468    ld1       {v2.s}[0], [x0], x2       // r10 (overwrites r2)
469    mls       v18.8h, v16.8h , v24.8h   // row 4 -= 5 * inner
470    uaddl     v8.8h, v0.8b, v7.8b       // row 5: centre pair r7 + r8
471    uaddl     v10.8h, v1.8b, v6.8b      // row 5: inner pair r6 + r9
472    uaddl     v12.8h, v2.8b, v5.8b      // row 5: outer pair r5 + r10
473    sqrshrun  v26.8b, v18.8h, #5        // row 4, rounded and clipped
474    ld1       {v18.s}[0], [x7], x2      // row read through x7 to average with output row 4
475    ld1       {v19.s}[0], [x7], x2      // row read through x7 to average with output row 5
476    mla       v12.8h, v8.8h , v22.8h    // row 5 += 20 * centre
477    ld1       {v3.s}[0], [x0], x2       // r11 (overwrites r3)
478    mls       v12.8h, v10.8h , v24.8h   // row 5 -= 5 * inner
479    sqrshrun  v27.8b, v12.8h, #5        // row 5, rounded and clipped
480    urhadd    v26.16b, v18.16b , v26.16b // rounding average -> qpel value, row 4
481    urhadd    v27.16b, v27.16b , v19.16b // rounding average -> qpel value, row 5
482
483    st1       {v26.s}[0], [x1], x3      //store row 4
484    st1       {v27.s}[0], [x1], x3      // store row 5
485    uaddl     v14.8h, v0.8b, v1.8b      // row 6: centre pair r8 + r9
486    uaddl     v16.8h, v2.8b, v7.8b      // row 6: inner pair r7 + r10
487    uaddl     v18.8h, v3.8b, v6.8b      // row 6: outer pair r6 + r11
488    mla       v18.8h, v14.8h , v22.8h   // row 6 += 20 * centre
489    ld1       {v4.s}[0], [x0], x2       // r12 (overwrites r4)
490    mls       v18.8h, v16.8h , v24.8h   // row 6 -= 5 * inner
491    uaddl     v8.8h, v2.8b, v1.8b       // row 7: centre pair r9 + r10
492    uaddl     v10.8h, v3.8b, v0.8b      // row 7: inner pair r8 + r11
493    uaddl     v12.8h, v4.8b, v7.8b      // row 7: outer pair r7 + r12
494    sqrshrun  v26.8b, v18.8h, #5        // row 6, rounded and clipped
495    ld1       {v18.s}[0], [x7], x2      // row read through x7 to average with output row 6
496    ld1       {v19.s}[0], [x7], x2      // row read through x7 to average with output row 7
497    mla       v12.8h, v8.8h , v22.8h    // row 7 += 20 * centre
498    ld1       {v5.s}[0], [x0], x2       // r13: loaded but not used by any later instruction in this path
499    mls       v12.8h, v10.8h , v24.8h   // row 7 -= 5 * inner
500    sqrshrun  v27.8b, v12.8h, #5        // row 7, rounded and clipped
501    urhadd    v26.16b, v18.16b , v26.16b // rounding average -> qpel value, row 6
502    urhadd    v27.16b, v19.16b , v27.16b // rounding average -> qpel value, row 7
503
504    st1       {v26.s}[0], [x1], x3      // store row 6
505    st1       {v27.s}[0], [x1], x3      // store row 7
506
507
508end_func:
509    // Common exit for all three width paths: undo the entry-time saves in reverse order
510    ldp       x19, x20, [sp], #16       // restore the pair stored by the stp at function entry
511    pop_v_regs                          // macro from the included file; counterpart of push_v_regs at entry
512    ret
513
514
515
516