1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21// *******************************************************************************
22// * @file
23// *  ih264e_half_pel.s
24// *
25// * @brief
26// *
27// *
28// * @author
29// *  Ittiam
30// *
31// * @par List of Functions:
32// *  ih264e_sixtapfilter_horz
33// *  ih264e_sixtap_filter_2dvh_vert
34//
35// *
36// * @remarks
37// *  None
38// *
39// *******************************************************************************
40// */
41
42
43.text
44.p2align 2
45.include "ih264_neon_macros.s"
46
47///*******************************************************************************
48//*
49//* @brief
//*     Inter-prediction luma filter, horizontal direction (filter run for width = 17 and height = 16)
51//*
52//* @par Description:
//*    Applies a 6-tap horizontal filter. The output is clipped to 8 bits.
//*    See sec 8.4.2.2.1, titled "Luma sample interpolation process".
55//*
56//* @param[in] pu1_src
57//*  UWORD8 pointer to the source
58//*
59//* @param[out] pu1_dst
60//*  UWORD8 pointer to the destination
61//*
62//* @param[in] src_strd
63//*  integer source stride
64//*
65//* @param[in] dst_strd
66//*  integer destination stride
67//*
68//*
69//* @returns
70//*
71//* @remarks
72//*  None
73//*
74//*******************************************************************************
75//*/
76//void ih264e_sixtapfilter_horz(UWORD8 *pu1_src,
77//                                UWORD8 *pu1_dst,
78//                                WORD32 src_strd,
79//                                WORD32 dst_strd);
80
81
.equ halfpel_width, 17 + 1              // 17 columns rounded up to an even 18 (each row below is stored as 16 + 2 bytes)


//------------------------------------------------------------------------------
// ih264e_sixtapfilter_horz_av8
//
// Horizontal 6-tap half-pel luma filter (taps 1, -5, 20, 20, -5, 1), AArch64
// NEON. Filters 16 rows, two rows per loop iteration, and writes 18 bytes per
// destination row (one 16-byte store followed by one 2-byte store).
//
// In:    x0 = pu1_src   (UWORD8 *; the first byte read is pu1_src - 2)
//        x1 = pu1_dst   (UWORD8 *)
//        w2 = src_strd  (WORD32; sign-extended into x2 on entry)
//        w3 = dst_strd  (WORD32; sign-extended into x3 on entry)
// Out:   none
// Reads: 24 bytes per source row, starting at pu1_src - 2
// Clobb: x0-x3, x14, v0-v7, v16, v18, v20-v31, flags
//        v8/v10/v12/v14 are also written; they are callee-saved under AAPCS64,
//        hence push_v_regs/pop_v_regs (macros from ih264_neon_macros.s,
//        assumed to save/restore d8-d15 - confirm against that file).
//
// Per output pixel, with a0..a5 the six neighbouring source bytes:
//        out = clip8((a0 - 5*a1 + 20*a2 + 20*a3 - 5*a4 + a5 + 16) >> 5)
// The 16-bit accumulators are built with unsigned widening ops and may wrap
// below zero; the sum always fits a signed 16-bit lane (-2550 .. 10710), so
// the final signed saturating narrow (sqrshrun) yields the clipped result.
//------------------------------------------------------------------------------
        .global ih264e_sixtapfilter_horz_av8
ih264e_sixtapfilter_horz_av8:
    push_v_regs
    // The strides are 32-bit C ints: AAPCS64 leaves bits [63:32] of x2/x3
    // unspecified, and both registers are used below as 64-bit post-index
    // increments, so widen them explicitly before first use.
    sxtw      x2, w2                    // x2 = (int64) src_strd
    sxtw      x3, w3                    // x3 = (int64) dst_strd
    stp       x19, x20, [sp, #-16]!     // NOTE(review): x19/x20 are not referenced in this routine; save kept as-is

    movi      v0.8b, #5                 // v0 = tap magnitude 5  (applied to a1, a4, subtracted)
    sub       x0, x0, #2                // filter window starts two pixels left of the output pixel
    sub       x3, x3, #16               // x3 = dst_strd - 16: advance left after the 16-byte store
    movi      v1.8b, #20                // v1 = tap magnitude 20 (applied to a2, a3, added)
    mov       x14, #16                  // x14 = rows remaining (always even, > 0 inside the loop)

filter_horz_loop:

    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2   // row0: src bytes 0-7, 8-15, 16-23
    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2   // row1: src bytes 0-7, 8-15, 16-23

    // Accumulators (8 x u16 each):
    //   row0: v8  = cols 0-7, v10 = cols 8-15, v12 = cols 16-23
    //   row1: v14 = cols 0-7, v16 = cols 8-15, v18 = cols 16-23
    // Only the first two lanes of v12/v18 are stored, so the wrap-around of
    // "ext vN, vN, vN" in the third column never reaches a stored pixel.
    // The ext for the next tap is interleaved with the arithmetic of the
    // current tap; v26-v31 are the shifted-window temporaries.

    //// a5 windows (shift by 5) and a0 + a5
    ext       v31.8b, v2.8b , v3.8b , #5
    ext       v30.8b, v3.8b , v4.8b , #5

    uaddl     v8.8h, v31.8b, v2.8b      // row0 c0-7  : a0 + a5
    ext       v29.8b, v4.8b , v4.8b , #5
    uaddl     v10.8h, v30.8b, v3.8b     // row0 c8-15 : a0 + a5
    ext       v28.8b, v5.8b , v6.8b , #5
    uaddl     v12.8h, v29.8b, v4.8b     // row0 c16-23: a0 + a5
    ext       v27.8b, v6.8b , v7.8b , #5
    uaddl     v14.8h, v28.8b, v5.8b     // row1 c0-7  : a0 + a5
    ext       v26.8b, v7.8b , v7.8b , #5

    //// a2 windows (shift by 2): += 20 * a2
    uaddl     v16.8h, v27.8b, v6.8b     // row1 c8-15 : a0 + a5
    ext       v31.8b, v2.8b , v3.8b , #2
    uaddl     v18.8h, v26.8b, v7.8b     // row1 c16-23: a0 + a5
    ext       v30.8b, v3.8b , v4.8b , #2
    umlal     v8.8h, v31.8b, v1.8b      // row0 c0-7  : + 20a2
    ext       v29.8b, v4.8b , v4.8b , #2
    umlal     v10.8h, v30.8b, v1.8b     // row0 c8-15 : + 20a2
    ext       v28.8b, v5.8b , v6.8b , #2
    umlal     v12.8h, v29.8b, v1.8b     // row0 c16-23: + 20a2
    ext       v27.8b, v6.8b , v7.8b , #2
    umlal     v14.8h, v28.8b, v1.8b     // row1 c0-7  : + 20a2
    ext       v26.8b, v7.8b , v7.8b , #2

    //// a3 windows (shift by 3): += 20 * a3
    umlal     v16.8h, v27.8b, v1.8b     // row1 c8-15 : + 20a2
    ext       v31.8b, v2.8b , v3.8b , #3
    umlal     v18.8h, v26.8b, v1.8b     // row1 c16-23: + 20a2
    ext       v30.8b, v3.8b , v4.8b , #3
    umlal     v8.8h, v31.8b, v1.8b      // row0 c0-7  : + 20a3
    ext       v29.8b, v4.8b , v4.8b , #3
    umlal     v10.8h, v30.8b, v1.8b     // row0 c8-15 : + 20a3
    ext       v28.8b, v5.8b , v6.8b , #3
    umlal     v12.8h, v29.8b, v1.8b     // row0 c16-23: + 20a3
    ext       v27.8b, v6.8b , v7.8b , #3
    umlal     v14.8h, v28.8b, v1.8b     // row1 c0-7  : + 20a3
    ext       v26.8b, v7.8b , v7.8b , #3

    //// a1 windows (shift by 1): -= 5 * a1
    umlal     v16.8h, v27.8b, v1.8b     // row1 c8-15 : + 20a3
    ext       v31.8b, v2.8b , v3.8b , #1
    umlal     v18.8h, v26.8b, v1.8b     // row1 c16-23: + 20a3
    ext       v30.8b, v3.8b , v4.8b , #1
    umlsl     v8.8h, v31.8b, v0.8b      // row0 c0-7  : - 5a1
    ext       v29.8b, v4.8b , v4.8b , #1
    umlsl     v10.8h, v30.8b, v0.8b     // row0 c8-15 : - 5a1
    ext       v28.8b, v5.8b , v6.8b , #1
    umlsl     v12.8h, v29.8b, v0.8b     // row0 c16-23: - 5a1
    ext       v27.8b, v6.8b , v7.8b , #1
    umlsl     v14.8h, v28.8b, v0.8b     // row1 c0-7  : - 5a1
    ext       v26.8b, v7.8b , v7.8b , #1

    //// a4 windows (shift by 4): -= 5 * a4
    umlsl     v16.8h, v27.8b, v0.8b     // row1 c8-15 : - 5a1
    ext       v31.8b, v2.8b , v3.8b , #4
    umlsl     v18.8h, v26.8b, v0.8b     // row1 c16-23: - 5a1
    ext       v30.8b, v3.8b , v4.8b , #4
    umlsl     v8.8h, v31.8b, v0.8b      // row0 c0-7  : - 5a4
    ext       v29.8b, v4.8b , v4.8b , #4
    umlsl     v10.8h, v30.8b, v0.8b     // row0 c8-15 : - 5a4
    ext       v28.8b, v5.8b , v6.8b , #4
    umlsl     v12.8h, v29.8b, v0.8b     // row0 c16-23: - 5a4
    ext       v27.8b, v6.8b , v7.8b , #4
    umlsl     v14.8h, v28.8b, v0.8b     // row1 c0-7  : - 5a4
    ext       v26.8b, v7.8b , v7.8b , #4

    umlsl     v16.8h, v27.8b, v0.8b     // row1 c8-15 : - 5a4
    umlsl     v18.8h, v26.8b, v0.8b     // row1 c16-23: - 5a4

    //// Round, shift and clip: (sum + 16) >> 5, saturated to 0..255
    sqrshrun  v20.8b, v8.8h, #5         // row0 c0-7
    sqrshrun  v21.8b, v10.8h, #5        // row0 c8-15
    sqrshrun  v22.8b, v12.8h, #5        // row0 c16-23 (only lanes 0-1 are stored)
    sqrshrun  v23.8b, v14.8h, #5        // row1 c0-7
    sqrshrun  v24.8b, v16.8h, #5        // row1 c8-15
    sqrshrun  v25.8b, v18.8h, #5        // row1 c16-23 (only lanes 0-1 are stored)

    //// Store 16 + 2 bytes per row; x3 (= dst_strd - 16) moves x1 to the next row
    st1       {v20.8b, v21.8b}, [x1], #16       // dst row0, cols 0-15
    st1       {v22.h}[0], [x1], x3              // dst row0, cols 16-17
    st1       {v23.8b, v24.8b}, [x1], #16       // dst row1, cols 0-15
    st1       {v25.h}[0], [x1], x3              // dst row1, cols 16-17

    subs      x14, x14, #2              // two rows done

    bne       filter_horz_loop


    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
193
194
195
196
197
198
199
200
201
202///**
203//*******************************************************************************
204//*
205//* @brief
206//*   This function implements a two stage cascaded six tap filter. It
207//*    applies the six tap filter in the vertical direction on the
208//*    predictor values, followed by applying the same filter in the
209//*    horizontal direction on the output of the first stage. The six tap
210//*    filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
211//*    interpolation process"
212//*    (Filter run for width = 17 and height =17)
213//* @par Description:
214//*    The function interpolates
215//*    the predictors first in the vertical direction and then in the
216//*    horizontal direction to output the (1/2,1/2). The output of the first
217//*    stage of the filter is stored in the buffer pointed to by pi16_pred1(only in C)
218//*    in 16 bit precision.
219//*
220//*
221//* @param[in] pu1_src
222//*  UWORD8 pointer to the source
223//*
224//* @param[out] pu1_dst1
225//*  UWORD8 pointer to the destination(vertical filtered output)
226//*
227//* @param[out] pu1_dst2
//*  UWORD8 pointer to the destination (output after applying the horizontal filter to the intermediate vertical output)
229//*
230//* @param[in] src_strd
231//*  integer source stride
232//*
233//* @param[in] dst_strd
234//*  integer destination stride of pu1_dst
235//*
//* @param[in] pi16_pred1
//*  Pointer to 16bit intermediate buffer (used only in c)
238//*
239//* @param[in] pi16_pred1_strd
240//*  integer destination stride of pi16_pred1
241//*
242//*
243//* @returns
244//*
245//* @remarks
246//*  None
247//*
248//*******************************************************************************
249//*/
250//void ih264e_sixtap_filter_2dvh_vert(UWORD8 *pu1_src,
251//                                UWORD8 *pu1_dst1,
252//                                UWORD8 *pu1_dst2,
253//                                WORD32 src_strd,
254//                                WORD32 dst_strd,
//                                WORD32 *pi16_pred1,/* Pointer to 16bit intermediate buffer (used only in c)*/
256//                                WORD32 pi16_pred1_strd)
257
258
259
260
261        .global ih264e_sixtap_filter_2dvh_vert_av8
262
263ih264e_sixtap_filter_2dvh_vert_av8:
264    // STMFD sp!,{x10,x11,x12,x14}
265    push_v_regs
266    stp       x19, x20, [sp, #-16]!
267
268////x0 - pu1_ref
269////x3 - u4_ref_width
270
271    //// Load six rows for vertical interpolation
272    lsl       x12, x3, #1
273    sub       x0, x0, x12
274    sub       x0, x0, #2
275    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x3
276    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x3
277    ld1       {v8.8b, v9.8b, v10.8b}, [x0], x3
278    mov       x12, #5
279    ld1       {v11.8b, v12.8b, v13.8b}, [x0], x3
280    mov       x14, #20
281    ld1       {v14.8b, v15.8b, v16.8b}, [x0], x3
282    mov       v0.h[0], w12
283    mov       v0.h[1], w14
284    ld1       {v17.8b, v18.8b, v19.8b}, [x0], x3
285    movi      v1.8b, #20
286
287//// x12 - u2_buff1_width
288//// x14 - u2_buff2_width
289    mov       x12, x4
290    add       x11, x1, #16
291
292    mov       x14, x12
293
294    mov       x10, #3 //loop counter
295    sub       x16 , x12, #8
296    sub       x19, x14, #16
297filter_2dvh_loop:
298
299    //// ////////////// ROW 1 ///////////////////////
300
301//// Process first vertical interpolated row
302//// each column is
303    uaddl     v20.8h, v2.8b, v17.8b     //// a0 + a5                             (column1,row0)
304    movi      v31.8b, #5
305    umlal     v20.8h, v8.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
306    umlal     v20.8h, v11.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
307    umlsl     v20.8h, v5.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
308    umlsl     v20.8h, v14.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
309    mov       v21.d[0], v20.d[1]
310
311    uaddl     v22.8h, v3.8b, v18.8b     //// a0 + a5                                (column2,row0)
312    umlal     v22.8h, v9.8b, v1.8b      //// a0 + a5 + 20a2                        (column2,row0)
313    umlal     v22.8h, v12.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column2,row0)
314    umlsl     v22.8h, v6.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
315    umlsl     v22.8h, v15.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
316    ext       v30.8b, v20.8b , v21.8b , #4
317    mov       v23.d[0], v22.d[1]
318
319
320    uaddl     v24.8h, v4.8b, v19.8b     //// a0 + a5                                (column3,row0)
321    ext       v29.8b, v20.8b , v21.8b , #6
322    umlal     v24.8h, v10.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
323    umlal     v24.8h, v13.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
324    umlsl     v24.8h, v7.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
325    umlsl     v24.8h, v16.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
326    mov       v25.d[0], v24.d[1]
327
328    sqrshrun  v2.8b, v20.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
329    ext       v31.8b, v21.8b , v22.8b , #2
330    sqrshrun  v3.8b, v22.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
331    ext       v28.8b, v20.8b , v21.8b , #2
332
333    saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
334    ext       v31.8b, v22.8b , v23.8b , #2
335    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
336    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
337    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
338    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
339    ext       v30.8b, v21.8b , v22.8b , #4
340
341    sqrshrun  v4.8b, v24.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
342    ext       v29.8b, v21.8b , v22.8b , #6
343
344    ext       v28.8b, v21.8b , v22.8b , #2
345    saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
346    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
347    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
348    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
349    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
350    ext       v31.8b, v23.8b , v24.8b , #2
351    mov       v21.d[0], v20.d[1]
352    ext       v2.8b, v2.8b , v3.8b , #2
353    ext       v3.8b, v3.8b , v4.8b , #2
354    ext       v4.8b, v4.8b , v4.8b , #2
355
356    st1       {v2.8b, v3.8b}, [x1], x12 //// store row1 - 1,1/2 grid
357    st1       {v4.h}[0], [x11], x12     //// store row1 - 1,1/2 grid
358
359    ext       v30.8b, v22.8b , v23.8b , #4
360    ext       v29.8b, v22.8b , v23.8b , #6
361
362    saddl     v2.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
363    ext       v28.8b, v22.8b , v23.8b , #2
364    smlal     v2.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
365    smlal     v2.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
366    smlsl     v2.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
367    smlsl     v2.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
368    ext       v31.8b, v24.8b , v25.8b , #2
369
370    shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
371    ext       v30.8b, v23.8b , v24.8b , #4
372    shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
373    ext       v29.8b, v23.8b , v24.8b , #6
374
375    saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
376    ext       v28.8b, v23.8b , v24.8b , #2
377    ext       v31.8b, v25.8b , v25.8b , #2
378    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
379    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
380    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
381    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
382    ext       v30.8b, v24.8b , v25.8b , #4
383
384    saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
385    ext       v29.8b, v24.8b , v25.8b , #6
386
387    ext       v31.8b, v24.8b , v25.8b , #2
388    shrn      v28.4h, v2.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)
389
390    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x3 //// Load next Row data
391    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
392    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
393    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
394    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
395    shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
396    mov       v20.d[1], v21.d[0]
397    sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
398
399
400    ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half gird set3,4
401    ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)
402
403    ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half gird set5
404
405    ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1,2 grif values
406    //// ////////////// ROW 2 ///////////////////////
407
408//// Process first vertical interpolated row
409//// each column is
410    uaddl     v20.8h, v5.8b, v2.8b      //// a0 + a5                             (column1,row0)
411    movi      v31.8b, #5
412    umlal     v20.8h, v11.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
413    umlal     v20.8h, v14.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
414    umlsl     v20.8h, v8.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
415    umlsl     v20.8h, v17.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
416    mov       v21.d[0], v20.d[1]
417
418    mov       v28.d[1], v29.d[0]
419    sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
420
421    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
422
423    uaddl     v22.8h, v6.8b, v3.8b      //// a0 + a5                                (column2,row0)
424    umlal     v22.8h, v12.8b, v1.8b     //// a0 + a5 + 20a2                        (column2,row0)
425    umlal     v22.8h, v15.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column2,row0)
426    umlsl     v22.8h, v9.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
427    umlsl     v22.8h, v18.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
428    mov       v23.d[0], v22.d[1]
429
430    sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
431    ext       v30.8b, v20.8b , v21.8b , #4
432
433    uaddl     v24.8h, v7.8b, v4.8b      //// a0 + a5                                (column3,row0)
434    ext       v29.8b, v20.8b , v21.8b , #6
435    umlal     v24.8h, v13.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
436    umlal     v24.8h, v16.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
437    umlsl     v24.8h, v10.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
438    umlsl     v24.8h, v19.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
439    mov       v25.d[0], v24.d[1]
440
441    st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
442    st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
443
444    sqrshrun  v5.8b, v20.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
445    ext       v31.8b, v21.8b , v22.8b , #2
446    sqrshrun  v6.8b, v22.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
447    ext       v28.8b, v20.8b , v21.8b , #2
448
449    saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
450    ext       v31.8b, v22.8b , v23.8b , #2
451    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
452    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
453    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
454    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
455    ext       v30.8b, v21.8b , v22.8b , #4
456
457    sqrshrun  v7.8b, v24.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
458    ext       v29.8b, v21.8b , v22.8b , #6
459
460    ext       v28.8b, v21.8b , v22.8b , #2
461    saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
462    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
463    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
464    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
465    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
466    ext       v31.8b, v23.8b , v24.8b , #2
467
468    ext       v5.8b, v5.8b , v6.8b , #2
469    ext       v6.8b, v6.8b , v7.8b , #2
470    ext       v7.8b, v7.8b , v7.8b , #2
471
472    st1       {v5.8b, v6.8b}, [x1], x12 //// store row1 - 1,1/2 grid
473    st1       {v7.h}[0], [x11], x12     //// store row1 - 1,1/2 grid
474
475    ext       v30.8b, v22.8b , v23.8b , #4
476    ext       v29.8b, v22.8b , v23.8b , #6
477
478    saddl     v6.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
479    ext       v28.8b, v22.8b , v23.8b , #2
480    smlal     v6.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
481    smlal     v6.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
482    smlsl     v6.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
483    smlsl     v6.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
484    ext       v31.8b, v24.8b , v25.8b , #2
485
486    shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
487    ext       v30.8b, v23.8b , v24.8b , #4
488    shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
489    ext       v29.8b, v23.8b , v24.8b , #6
490
491    saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
492    ext       v28.8b, v23.8b , v24.8b , #2
493    ext       v31.8b, v25.8b , v25.8b , #2
494    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
495    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
496    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
497    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
498    ext       v30.8b, v24.8b , v25.8b , #4
499
500    saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
501    ext       v29.8b, v24.8b , v25.8b , #6
502
503    ext       v31.8b, v24.8b , v25.8b , #2
504    shrn      v28.4h, v6.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)
505
506    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x3 //// Load next Row data
507    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
508    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
509    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
510    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
511    shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
512    mov       v20.d[1], v21.d[0]
513    sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
514
515
516    ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half gird set3,4
517    ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)
518
519    ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half gird set5
520
521    ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1,2 grif values
522    //// ////////////// ROW 3 ///////////////////////
523
524//// Process first vertical interpolated row
525//// each column is
526    uaddl     v20.8h, v8.8b, v5.8b      //// a0 + a5                             (column1,row0)
527    movi      v31.8b, #5
528    umlal     v20.8h, v14.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
529    umlal     v20.8h, v17.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
530    umlsl     v20.8h, v11.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
531    umlsl     v20.8h, v2.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
532    mov       v21.d[0], v20.d[1]
533
534    mov       v28.d[1], v29.d[0]
535    sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
536    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
537
538    uaddl     v22.8h, v9.8b, v6.8b      //// a0 + a5                                (column2,row0)
539    umlal     v22.8h, v15.8b, v1.8b     //// a0 + a5 + 20a2                        (column2,row0)
540    umlal     v22.8h, v18.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column2,row0)
541    umlsl     v22.8h, v12.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
542    umlsl     v22.8h, v3.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
543    mov       v23.d[0], v22.d[1]
544
545    sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
546    ext       v30.8b, v20.8b , v21.8b , #4
547
548    uaddl     v24.8h, v10.8b, v7.8b     //// a0 + a5                                (column3,row0)
549    ext       v29.8b, v20.8b , v21.8b , #6
550    umlal     v24.8h, v16.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
551    umlal     v24.8h, v19.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
552    umlsl     v24.8h, v13.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
553    umlsl     v24.8h, v4.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
554    mov       v25.d[0], v24.d[1]
555
556    st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
557    st1       { v28.h}[0], [x2], x19    //// store 1/2,1,2 grif values
558
559    sqrshrun  v8.8b, v20.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
560    ext       v31.8b, v21.8b , v22.8b , #2
561    sqrshrun  v9.8b, v22.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
562    ext       v28.8b, v20.8b , v21.8b , #2
563
564    saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
565    ext       v31.8b, v22.8b , v23.8b , #2
566    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
567    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
568    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
569    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
570    ext       v30.8b, v21.8b , v22.8b , #4
571
572    sqrshrun  v10.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
573    ext       v29.8b, v21.8b , v22.8b , #6
574
575    ext       v28.8b, v21.8b , v22.8b , #2
576    saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
577    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
578    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
579    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
580    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
581    ext       v31.8b, v23.8b , v24.8b , #2
582
583    ext       v8.8b, v8.8b , v9.8b , #2
584    ext       v9.8b, v9.8b , v10.8b , #2
585    ext       v10.8b, v10.8b , v10.8b , #2
586
587    st1       {v8.8b, v9.8b}, [x1], x12 //// store row1 - 1,1/2 grid
588    st1       {v10.h}[0], [x11], x12    //// store row1 - 1,1/2 grid
589
590    ext       v30.8b, v22.8b , v23.8b , #4
591    ext       v29.8b, v22.8b , v23.8b , #6
592
593    saddl     v8.4s, v31.4h, v22.4h     //// a0 + a5                             (set3)
594    ext       v28.8b, v22.8b , v23.8b , #2
595    smlal     v8.4s, v30.4h, v0.h[1]    //// a0 + a5 + 20a2                         (set3)
596    smlal     v8.4s, v29.4h, v0.h[1]    //// a0 + a5 + 20a2 + 20a3                  (set3)
597    smlsl     v8.4s, v28.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
598    smlsl     v8.4s, v23.4h, v0.h[0]    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
599    ext       v31.8b, v24.8b , v25.8b , #2
600
601    shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
602    ext       v30.8b, v23.8b , v24.8b , #4
603    shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
604    ext       v29.8b, v23.8b , v24.8b , #6
605
606    saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
607    ext       v28.8b, v23.8b , v24.8b , #2
608    ext       v31.8b, v25.8b , v25.8b , #2
609    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
610    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
611    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
612    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
613    ext       v30.8b, v24.8b , v25.8b , #4
614
615    saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
616    ext       v29.8b, v24.8b , v25.8b , #6
617
618    ext       v31.8b, v24.8b , v25.8b , #2
619    shrn      v28.4h, v8.4s, #8         //// shift by 8 and later we will shift by 2 more with rounding     (set3)
620
621    ld1       {v8.8b, v9.8b, v10.8b}, [x0], x3 //// Load next Row data
622    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
623    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
624    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
625    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
626    shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
627    mov       v20.d[1], v21.d[0]
628    sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
629
630
    //// ARMv7 reference for the rest of this half,half row. In this port these steps are
    //// deferred and interleaved with the vertical filter of the next row below.
    ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half grid set3,4
    ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)

    ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half grid set5

    ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1/2 grid values
    //// ////////////// ROW 4 ///////////////////////

//// Vertical 6-tap filter for this row.
//// Each column is a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (16-bit sums kept in v20, v22, v24).
641    uaddl     v20.8h, v11.8b, v8.8b     //// a0 + a5                             (column1,row0)
642    movi      v31.8b, #5
643    umlal     v20.8h, v17.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
644    umlal     v20.8h, v2.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
645    umlsl     v20.8h, v14.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
646    umlsl     v20.8h, v5.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
647    mov       v21.d[0], v20.d[1]
648    mov       v28.d[1], v29.d[0]
649    sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
650    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
651
652    uaddl     v22.8h, v12.8b, v9.8b     //// a0 + a5                                (column2,row0)
653    umlal     v22.8h, v18.8b, v1.8b     //// a0 + a5 + 20a2                        (column2,row0)
654    umlal     v22.8h, v3.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column2,row0)
655    umlsl     v22.8h, v15.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
656    umlsl     v22.8h, v6.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
657    mov       v23.d[0], v22.d[1]
658
659    sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
660    ext       v30.8b, v20.8b , v21.8b , #4
661
662    uaddl     v24.8h, v13.8b, v10.8b    //// a0 + a5                                (column3,row0)
663    ext       v29.8b, v20.8b , v21.8b , #6
664    umlal     v24.8h, v19.8b, v1.8b     //// a0 + a5 + 20a2                        (column3,row0)
665    umlal     v24.8h, v4.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column3,row0)
666    umlsl     v24.8h, v16.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
667    umlsl     v24.8h, v7.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
668    mov       v25.d[0], v24.d[1]
669
670    st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
671    st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
672
673    sqrshrun  v11.8b, v20.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
674    ext       v31.8b, v21.8b , v22.8b , #2
675    sqrshrun  v12.8b, v22.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
676    ext       v28.8b, v20.8b , v21.8b , #2
677
678    saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
679    ext       v31.8b, v22.8b , v23.8b , #2
680    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
681    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
682    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
683    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
684    ext       v30.8b, v21.8b , v22.8b , #4
685
686    sqrshrun  v13.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
687    ext       v29.8b, v21.8b , v22.8b , #6
688
689    ext       v28.8b, v21.8b , v22.8b , #2
690    saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
691    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
692    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
693    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
694    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
695    ext       v31.8b, v23.8b , v24.8b , #2
696
697    ext       v11.8b, v11.8b , v12.8b , #2
698    ext       v12.8b, v12.8b , v13.8b , #2
699    ext       v13.8b, v13.8b , v13.8b , #2
700
701    st1       {v11.8b, v12.8b}, [x1], x12 //// store row1 - 1,1/2 grid
702    st1       {v13.h}[0], [x11], x12    //// store row1 - 1,1/2 grid
703
704    ext       v30.8b, v22.8b , v23.8b , #4
705    ext       v29.8b, v22.8b , v23.8b , #6
706
707    saddl     v12.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
708    ext       v28.8b, v22.8b , v23.8b , #2
709    smlal     v12.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
710    smlal     v12.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
711    smlsl     v12.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
712    smlsl     v12.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
713    ext       v31.8b, v24.8b , v25.8b , #2
714
715    shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
716    ext       v30.8b, v23.8b , v24.8b , #4
717    shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
718    ext       v29.8b, v23.8b , v24.8b , #6
719
720    saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
721    ext       v28.8b, v23.8b , v24.8b , #2
722    ext       v31.8b, v25.8b , v25.8b , #2
723    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
724    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
725    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
726    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
727    ext       v30.8b, v24.8b , v25.8b , #4
728
729    saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
730    ext       v29.8b, v24.8b , v25.8b , #6
731
732    ext       v31.8b, v24.8b , v25.8b , #2
733    shrn      v28.4h, v12.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)
734
735    ld1       {v11.8b, v12.8b, v13.8b}, [x0], x3 //// Load next Row data
736    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
737    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
738    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
739    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
740    shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
741    mov       v20.d[1], v21.d[0]
742    sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
743
744
    //// ARMv7 reference for the rest of this half,half row. In this port these steps are
    //// deferred and interleaved with the vertical filter of the next row below.
    ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half grid set3,4
    ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)

    ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half grid set5

    ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1/2 grid values
    //// ////////////// ROW 5 ///////////////////////

//// Vertical 6-tap filter for this row.
//// Each column is a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (16-bit sums kept in v20, v22, v24).
755    uaddl     v20.8h, v14.8b, v11.8b    //// a0 + a5                             (column1,row0)
756    movi      v31.8b, #5
757    umlal     v20.8h, v2.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
758    umlal     v20.8h, v5.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
759    umlsl     v20.8h, v17.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
760    umlsl     v20.8h, v8.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
761    mov       v21.d[0], v20.d[1]
762    mov       v28.d[1], v29.d[0]
763    sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
764    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
765
766    uaddl     v22.8h, v15.8b, v12.8b    //// a0 + a5                                (column2,row0)
767    umlal     v22.8h, v3.8b, v1.8b      //// a0 + a5 + 20a2                        (column2,row0)
768    umlal     v22.8h, v6.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column2,row0)
769    umlsl     v22.8h, v18.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
770    umlsl     v22.8h, v9.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
771    mov       v23.d[0], v22.d[1]
772
773    sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
774    ext       v30.8b, v20.8b , v21.8b , #4
775
776    uaddl     v24.8h, v16.8b, v13.8b    //// a0 + a5                                (column3,row0)
777    ext       v29.8b, v20.8b , v21.8b , #6
778    umlal     v24.8h, v4.8b, v1.8b      //// a0 + a5 + 20a2                        (column3,row0)
779    umlal     v24.8h, v7.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column3,row0)
780    umlsl     v24.8h, v19.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
781    umlsl     v24.8h, v10.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
782    mov       v25.d[0], v24.d[1]
783
784    st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
785    st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
786
787    sqrshrun  v14.8b, v20.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
788    ext       v31.8b, v21.8b , v22.8b , #2
789    sqrshrun  v15.8b, v22.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
790    ext       v28.8b, v20.8b , v21.8b , #2
791
792    saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
793    ext       v31.8b, v22.8b , v23.8b , #2
794    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
795    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
796    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
797    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
798    ext       v30.8b, v21.8b , v22.8b , #4
799
800    sqrshrun  v16.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
801    ext       v29.8b, v21.8b , v22.8b , #6
802
803    ext       v28.8b, v21.8b , v22.8b , #2
804    saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
805    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
806    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
807    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
808    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
809    ext       v31.8b, v23.8b , v24.8b , #2
810
811    ext       v14.8b, v14.8b , v15.8b , #2
812    ext       v15.8b, v15.8b , v16.8b , #2
813    ext       v16.8b, v16.8b , v16.8b , #2
814
815    st1       {v14.8b, v15.8b}, [x1], x12 //// store row1 - 1,1/2 grid
816    st1       {v16.h}[0], [x11], x12    //// store row1 - 1,1/2 grid
817
818    ext       v30.8b, v22.8b , v23.8b , #4
819    ext       v29.8b, v22.8b , v23.8b , #6
820
821    saddl     v14.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
822    ext       v28.8b, v22.8b , v23.8b , #2
823    smlal     v14.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
824    smlal     v14.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
825    smlsl     v14.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
826    smlsl     v14.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
827    ext       v31.8b, v24.8b , v25.8b , #2
828
829    shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
830    ext       v30.8b, v23.8b , v24.8b , #4
831    shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
832    ext       v29.8b, v23.8b , v24.8b , #6
833
834    saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
835    ext       v28.8b, v23.8b , v24.8b , #2
836    ext       v31.8b, v25.8b , v25.8b , #2
837    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
838    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
839    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
840    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
841    ext       v30.8b, v24.8b , v25.8b , #4
842
843    saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
844    ext       v29.8b, v24.8b , v25.8b , #6
845
846    ext       v31.8b, v24.8b , v25.8b , #2
847    shrn      v28.4h, v14.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)
848
849    ld1       {v14.8b, v15.8b, v16.8b}, [x0], x3 //// Load next Row data
850    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
851    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
852    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
853    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
854    shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
855    mov       v20.d[1], v21.d[0]
856    sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
857
858
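    //// ARMv7 reference for the rest of this half,half row. In this port these steps are
    //// deferred: they are issued either interleaved with the vertical filter below or,
    //// when the x10 check below branches away, in filter_2dvh_skip_row.
    ////VQRSHRUN.s16    D27,Q14,#2            ;// half,half grid set3,4
    ////VSHRN.s32        D28,Q11,#8            ;// shift by 8 and later we will shift by 2 more with rounding     (set5)

    ////VQRSHRUN.s16    D28,Q14,#2            ;// half,half grid set5

    ////VST1.8        {D26,D27,D28},[x2],x14    ;// store 1/2,1/2 grid values
    //// ////////////// ROW 6 ///////////////////////

//// Vertical 6-tap filter for this row (skipped when x10 == 1).
//// Each column is a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 (16-bit sums kept in v20, v22, v24).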
869
870    cmp       x10, #1                   //// if it 17 rows are complete skip
871    beq       filter_2dvh_skip_row
872    uaddl     v20.8h, v17.8b, v14.8b    //// a0 + a5                             (column1,row0)
873    movi      v31.8b, #5
874    umlal     v20.8h, v5.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
875    umlal     v20.8h, v8.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
876    umlsl     v20.8h, v2.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
877    umlsl     v20.8h, v11.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
878    mov       v21.d[0], v20.d[1]
879    mov       v28.d[1], v29.d[0]
880    sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
881    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
882
883    uaddl     v22.8h, v18.8b, v15.8b    //// a0 + a5                                (column2,row0)
884    umlal     v22.8h, v6.8b, v1.8b      //// a0 + a5 + 20a2                        (column2,row0)
885    umlal     v22.8h, v9.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                (column2,row0)
886    umlsl     v22.8h, v3.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column2,row0)
887    umlsl     v22.8h, v12.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column2,row0)
888    mov       v23.d[0], v22.d[1]
889
890    sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
891    ext       v30.8b, v20.8b , v21.8b , #4
892
893    uaddl     v24.8h, v19.8b, v16.8b    //// a0 + a5                                (column3,row0)
894    ext       v29.8b, v20.8b , v21.8b , #6
895    umlal     v24.8h, v7.8b, v1.8b      //// a0 + a5 + 20a2                        (column3,row0)
896    umlal     v24.8h, v10.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                (column3,row0)
897    umlsl     v24.8h, v4.8b, v31.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1            (column3,row0)
898    umlsl     v24.8h, v13.8b, v31.8b    //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (column3,row0)
899    mov       v25.d[0], v24.d[1]
900
901    st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
902    st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
903
904    sqrshrun  v17.8b, v20.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
905    ext       v31.8b, v21.8b , v22.8b , #2
906    sqrshrun  v18.8b, v22.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
907    ext       v28.8b, v20.8b , v21.8b , #2
908
909    saddl     v26.4s, v31.4h, v20.4h    //// a0 + a5                             (set1)
910    ext       v31.8b, v22.8b , v23.8b , #2
911    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set1)
912    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set1)
913    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set1)
914    smlsl     v26.4s, v21.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set1)
915    ext       v30.8b, v21.8b , v22.8b , #4
916
917    sqrshrun  v19.8b, v24.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column3,row0)
918    ext       v29.8b, v21.8b , v22.8b , #6
919
920    ext       v28.8b, v21.8b , v22.8b , #2
921    saddl     v20.4s, v31.4h, v21.4h    //// a0 + a5                             (set2)
922    smlal     v20.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set2)
923    smlal     v20.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set2)
924    smlsl     v20.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set2)
925    smlsl     v20.4s, v22.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set2)
926    ext       v31.8b, v23.8b , v24.8b , #2
927
928    ext       v17.8b, v17.8b , v18.8b , #2
929    ext       v18.8b, v18.8b , v19.8b , #2
930    ext       v19.8b, v19.8b , v19.8b , #2
931
932    st1       {v17.8b, v18.8b}, [x1], x12 //// store row1 - 1,1/2 grid
933    st1       {v19.h}[0], [x11], x12    //// store row1 - 1,1/2 grid
934
935    ext       v30.8b, v22.8b , v23.8b , #4
936    ext       v29.8b, v22.8b , v23.8b , #6
937
938    saddl     v18.4s, v31.4h, v22.4h    //// a0 + a5                             (set3)
939    ext       v28.8b, v22.8b , v23.8b , #2
940    smlal     v18.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set3)
941    smlal     v18.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set3)
942    smlsl     v18.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set3)
943    smlsl     v18.4s, v23.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set3)
944    ext       v31.8b, v24.8b , v25.8b , #2
945
946    shrn      v21.4h, v20.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set2)
947    ext       v30.8b, v23.8b , v24.8b , #4
948    shrn      v20.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set1)
949    ext       v29.8b, v23.8b , v24.8b , #6
950
951    saddl     v26.4s, v31.4h, v23.4h    //// a0 + a5                             (set4)
952    ext       v28.8b, v23.8b , v24.8b , #2
953    ext       v31.8b, v25.8b , v25.8b , #2
954    smlal     v26.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set4)
955    smlal     v26.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set4)
956    smlsl     v26.4s, v28.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set4)
957    smlsl     v26.4s, v24.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set4)
958    ext       v30.8b, v24.8b , v25.8b , #4
959
960    saddl     v22.4s, v31.4h, v24.4h    //// a0 + a5                             (set5)
961    ext       v29.8b, v24.8b , v25.8b , #6
962
963    ext       v31.8b, v24.8b , v25.8b , #2
964    shrn      v28.4h, v18.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set3)
965
966    ld1       {v17.8b, v18.8b, v19.8b}, [x0], x3 //// Load next Row data
967    smlal     v22.4s, v30.4h, v0.h[1]   //// a0 + a5 + 20a2                         (set5)
968    smlal     v22.4s, v29.4h, v0.h[1]   //// a0 + a5 + 20a2 + 20a3                  (set5)
969    smlsl     v22.4s, v31.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1          (set5)
970    smlsl     v22.4s, v25.4h, v0.h[0]   //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4    (set5)
971    shrn      v29.4h, v26.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set4)
972    mov       v20.d[1], v21.d[0]
973    sqrshrun  v26.8b, v20.8h, #2        //// half,half gird set1,2
974
975    mov       v28.d[1], v29.d[0]
976    sqrshrun  v27.8b, v28.8h, #2        //// half,half gird set3,4
977    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)
978
979    sqrshrun  v28.8b, v28.8h, #2        //// half,half gird set5
980
981    st1       {v26.8b, v27.8b}, [x2], #16 //// store 1/2,1,2 grif values
982    st1       {v28.h}[0], [x2], x19     //// store 1/2,1,2 grif values
983
984    subs      x10, x10, #1              ////decrement loop counter
985
986    bne       filter_2dvh_loop
987
988
//// Loop exit: reached when the loop counter x10 has been decremented to zero.
//// Nothing is left to filter here; restore x19/x20 from the stack,
//// run pop_v_regs and return.

    // ARMv7 original of this epilogue: LDMFD sp!,{x10,x11,x12,pc}
997    ldp       x19, x20, [sp], #16
998    pop_v_regs
999    ret
1000
//// filter_2dvh_skip_row: exit path taken from the "cmp x10, #1 / beq" check at the
//// start of the ROW 6 stage, i.e. when no further vertical row is filtered.
//// It finishes the half,half row that is still pending in registers at that point:
////   v26     - set1,2 already narrowed to 8 bits
////   v28.4h  - set3 accumulator >> 8
////   v29.4h  - set4 accumulator >> 8
////   v22.4s  - set5 32-bit accumulator
//// The row is stored through x2, then the saved registers are restored and the
//// function returns. The arithmetic is the same sequence that ends the loop body.
filter_2dvh_skip_row:
    //// pack set3 (low half) and set4 (high half) into one 8 x 16-bit vector
    mov       v28.d[1], v29.d[0]
    sqrshrun  v27.8b, v28.8h, #2        //// half,half grid set3,4: rounding shift by 2 more, saturate to unsigned 8 bit
    shrn      v28.4h, v22.4s, #8        //// shift by 8 and later we will shift by 2 more with rounding     (set5)

    sqrshrun  v28.8b, v28.8h, #2        //// half,half grid set5

    st1       {v26.8b, v27.8b}, [x2], #16 //// store 16 bytes of half,half grid values (set1..set4), x2 += 16
    st1       {v28.h}[0], [x2], x19     //// store 2 more bytes (set5), then x2 += x19 (x19 is set up outside this block)
    // ARMv7 original of this epilogue: LDMFD sp!,{x10,x11,x12,pc}
    //// restore x19/x20 and release their 16-byte stack slot
    ldp       x19, x20, [sp], #16
    //// NOTE(review): pop_v_regs is not defined in this file section; presumably a macro from
    //// ih264_neon_macros.s that restores the saved SIMD registers - confirm against the prologue
    pop_v_regs
    ret
1014
1015
1016///*****************************************
1017