1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_intra_pred_luma_mode_3_to_9.s
22//*
23//* @brief
24//*  contains the function definition for luma intra prediction, angular
25//* modes 3 to 9. the function is coded in aarch64 (armv8) neon assembly
26
27//* using gnu assembler syntax
28//*
29//* @author
30//*  parthiban v
31//*
32//* @par list of functions:
33//*
34//*
35//* @remarks
36//*  none
37//*
38//*******************************************************************************
39//*/
40///**
41//*******************************************************************************
42//*
43//* @brief
44//*    luma intra prediction for angular modes 3 to 9
45//*
46//* @par description:
47//*
48//* @param[in] pu1_ref
49//*  uword8 pointer to the source
50//*
51//* @param[out] pu1_dst
52//*  uword8 pointer to the destination
53//*
54//* @param[in] src_strd
55//*  integer source stride
56//*
57//* @param[in] dst_strd
58//*  integer destination stride
59//*
60//* @param[in] nt
61//*  size of transform block
62//*
63//* @param[in] mode
64//*  intra prediction mode; indexes gai4_ihevc_ang_table, and (mode - 3) selects the idx_neg_idx_3_9 row
65//*
66//* @returns
67//*
68//* @remarks
69//*  none
70//*
71//*******************************************************************************
72//*/
73
74//void ihevc_intra_pred_luma_mode_3_to_9(uword8* pu1_ref,
75//                               word32 src_strd,
76//                               uword8* pu1_dst,
77//                               word32 dst_strd,
78//                               word32 nt,
79//                               word32 mode)
80//
81//**************variables vs registers*****************************************
82//x0 => *pu1_ref
83//x1 => src_strd
84//x2 => *pu1_dst
85//x3 => dst_strd
86
87//x4 => nt
88//x5 => mode
89//(in this aarch64 version nt and mode are read from x4 and x5; nothing is loaded from the stack)
90
91.text
// .align 4: presumably 2^4 = 16-byte alignment under GNU as for AArch64 —
// confirm for the assembler in use.
92.align 4
// Project macro file; no macro from it is visibly invoked by the function
// below.
93.include "ihevc_neon_macros.s"
94
95
96
// Exported entry point.
97.globl ihevc_intra_pred_luma_mode_3_to_9_av8
// Tables defined elsewhere, as used by the function below:
//   gai4_ihevc_ang_table     - 32-bit entries, indexed by mode
//   gai4_ihevc_inv_ang_table - address is fetched but never read
//   col_for_intra_luma       - read 8 bytes at a time as column multipliers
//   idx_neg_idx_3_9          - 16 bytes per mode starting at mode 3,
//                              read as 32-bit words
98.extern gai4_ihevc_ang_table
99.extern gai4_ihevc_inv_ang_table
100.extern col_for_intra_luma
101.extern idx_neg_idx_3_9
102
103
104.type ihevc_intra_pred_luma_mode_3_to_9_av8, %function
105
106ihevc_intra_pred_luma_mode_3_to_9_av8:
// ---------------------------------------------------------------------------
// Luma angular intra prediction, modes 3..9 (per the symbol name), AArch64 NEON.
//
// Every output row is produced 8 pixels at a time as
//     out = (ref_a * (32 - fract) + ref_b * fract + 16) >> 5
// using umull / umlal / rshrn #5, where ref_a and ref_b are two tbl lookups
// into the 16 reference bytes held in v0, and
//     fract = ((col + 1) * intra_pred_ang) & 31
//     idx   = ((col + 1) * intra_pred_ang) >> 5
// The tbl index vectors decrease by one per row; even rows use the v1/v19
// pair and odd rows the v4/v5 pair, each stepped by 2 (v3).
//
// Register use as read by the code below:
//   x0   in: pu1_ref. On the 8/16/32 path it is reused (mov x0, #1) as the
//        running row value folded into the table index base.
//   x1   in: src_strd (never read). Reused as pu1_ref + 2*nt - 9.
//   x2   pu1_dst write pointer (rows 0..3 of a block inside the kernel).
//   x3   dst_strd, the post-index step of every store.
//   x4   nt; reduced to nt - 8 just before kernel_8_16_32.
//   x5   mode; reused as a second dst pointer in the kernel and epilogue.
//   x6   address of the 16 reference bytes loaded into v0.
//   x7   intra_pred_ang, then mode - 3, then 8 - 8*dst_strd.
//   x8   cursor into the idx_neg_idx_3_9 row; x12 = start of that row.
//   x9   word loaded from [x8] (plus the row adjustment in the kernel).
//   x10  block counter = (nt >> 3) * nt, decremented by 8 per 8x8 block.
//   x11  column counter, decremented by 8, reloaded with nt on wrap.
//   x14  cursor into col_for_intra_luma.
//   x20  scratch for the add/sub + csel pairs.
//   v0   reference bytes (tbl table)      v30  intra_pred_ang x 8 bytes
//   v2 / v3 / v16  constants 1 / 2 / 8    v31  column multipliers
//   v28 = 32, v29 = 31                    v6 / v7  fract / 32 - fract
//   v27  row offset (7 in the prologue, dup of x0 in the kernel)
//
// Saved and restored: d12-d15 (v12-v15 are used as temporaries) and
// x19/x20 (only x20 is written in the code shown here).
// ---------------------------------------------------------------------------
107
108    // stmfd sp!, {x4-x12, x14}        //stack stores the values of the arguments
109
110    stp         d12,d13,[sp,#-16]!
111    stp         d14,d15,[sp,#-16]!
112    stp         x19, x20,[sp,#-16]!
113
    // Table addresses are fetched through the GOT.
114    adrp        x7,  :got:gai4_ihevc_ang_table
115    ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]
116
    // NOTE(review): x8 is overwritten by `mov x8, x12` on both the 4x4 and
    // the 8/16/32 path before it is ever read, so this load appears unused.
117    adrp        x8,  :got:gai4_ihevc_inv_ang_table
118    ldr         x8,  [x8, #:got_lo12:gai4_ihevc_inv_ang_table]
119
120    add         x7, x7, x5, lsl #2          //gai4_ihevc_ang_table[mode]
121    ldr         w7,  [x7]                   //intra_pred_ang
122    sxtw        x7,w7
123    dup         v30.8b,w7                   //intra_pred_ang (low byte replicated to 8 lanes)
124
125    adrp        x14,  :got:col_for_intra_luma
126    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
127
    // nt == 4 takes the dedicated 4x4 path; every other size is processed
    // as a sequence of 8x8 blocks.
128    cmp         x4, #4
129
130    beq         sz_4_proc
    // NOTE(review): this branch targets the label immediately below.
131    b           prologue_8_16_32
132
// ---------------------------------------------------------------------------
// First 8x8 block, computed and stored without pipelining.
// ---------------------------------------------------------------------------
133prologue_8_16_32:
134    lsr         x10, x4, #3
135    ld1         {v31.8b},[x14],#8
136    mul         x10, x4, x10                //block counter (dec by #8)
137
138    mov         x11, x4                     //col counter to be inc/dec by #8
139    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
140
141    sub         x7, x5, #3                  //mode - 3: row selector for idx_neg_idx_3_9
142    movi        v2.8b, #1                   //contains #1 for adding to get ref_main_idx + 1
143    adrp        x12, :got:idx_neg_idx_3_9   //load least idx table
144    ldr         x12, [x12, #:got_lo12:idx_neg_idx_3_9]
145    movi        v3.8b, #2
146
147    add         x12, x12, x7, lsl #4        //16 bytes per mode
148    mov         x8, x12
149
150    mov         x7, #8
151    sub         x7, x7, x3, lsl #3          //x7 = 8 - 8*dst_strd: up 8 rows, right 8 columns
152
153    ldr         w9,  [x8]
154    sxtw        x9,w9
155    add         x1, x0, x4, lsl #1          //pu1_ref + 2*nt
156
157    xtn         v6.8b,  v22.8h
158    dup         v26.8b,w9                   //least idx added to final idx values
159    sub         x1, x1, #9                  //ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
160
161    sub         x6, x1, x9
162
163    ld1         {v0.16b}, [x6]              //16 reference bytes used as the tbl table (from least idx)
164    sshr        v22.8h, v22.8h,#5
165
166    movi        v29.8b, #31                 //contains #31 for vand operation
167
168    movi        v28.8b, #32
169
170    sqxtn       v1.8b,  v22.8h
171
172    and         v6.8b,  v6.8b ,  v29.8b     //fract = low byte of product & 31
173
174    mov         x0, #1                      //x0 reused from here on: row value, starts at 1
175
176    movi        v27.8b, #7                  //row 0 to 7
177
178    sub         v1.8b,  v1.8b ,  v2.8b      //ref_main_idx (sub row)
179    sub         v1.8b,  v26.8b ,  v1.8b     //ref_main_idx (row 0)
180    add         v1.8b,  v1.8b ,  v27.8b     //to compensate the pu1_src idx offset (v27 = 7 here)
181    sub         v19.8b,  v1.8b ,  v2.8b     //ref_main_idx + 1 (row 0)
182    tbl         v12.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 0)
183    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
184
185    tbl         v13.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 0)
186    sub         v4.8b,  v1.8b ,  v2.8b      //ref_main_idx (row 1)
187    sub         v5.8b,  v19.8b ,  v2.8b     //ref_main_idx + 1 (row 1)
188
189    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 1)
190    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
191    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
192
193    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
194    sub         v1.8b,  v1.8b ,  v3.8b      //ref_main_idx (row 2)
195    sub         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx + 1 (row 2)
196
197    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
198
199    tbl         v14.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 2)
200    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
201    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
202
203    tbl         v15.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 2)
204    sub         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 3)
205    sub         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 3)
206
207    st1         {v24.8b},[x2], x3           //st (row 0)
208    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
209
210    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
211    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
212    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
213
214    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
215    sub         v1.8b,  v1.8b ,  v3.8b      //ref_main_idx (row 4)
216    sub         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx + 1 (row 4)
217
218    st1         {v22.8b},[x2], x3           //st (row 1)
219    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
220
221    tbl         v12.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 4)
222    umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
223    umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)
224
225    tbl         v13.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 4)
226    sub         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 5)
227    sub         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 5)
228
229    st1         {v20.8b},[x2], x3           //st (row 2)
230    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
231
232    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 5)
233    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
234    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
235
236    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 5)
237    sub         v1.8b,  v1.8b ,  v3.8b      //ref_main_idx (row 6)
238    sub         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx + 1 (row 6)
239
240    st1         {v18.8b},[x2], x3           //st (row 3)
241    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
242
243    tbl         v14.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 6)
244    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
245    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
246
247    tbl         v15.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 6)
248    sub         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 7)
249    sub         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx + 1 (row 7)
250
251    st1         {v24.8b},[x2], x3           //st (row 4)
252    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)
253
254    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
255    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
256    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
257
258    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
259    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
260    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)
261
262    st1         {v22.8b},[x2], x3           //st (row 5)
263    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
264    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)
265
266    st1         {v20.8b},[x2], x3           //st (row 6)
267
268    subs        x10, x10, #8                //subtract 8 and go to end if 8x8
269
270    st1         {v18.8b},[x2], x3           //st (row 7)
271
272    beq         end_func
273
    // Advance to the next 8x8 block. The flags set by `subs x11` stay live
    // through every csel and the bgt below (nothing in between writes NZCV):
    //   gt (columns remain in this block row): x8 += 4, x2 += 8 - 8*dst_strd
    //   le (block row finished): x8 = x12, x2 = x2 - nt + 8, x11 = nt,
    //      x14 is reset to col_for_intra_luma and x0 += 8
274    subs        x11, x11, #8
275    add         x20, x8, #4
276    csel        x8, x20, x8,gt
277    add         x20, x2, x7
278    csel        x2, x20, x2,gt
279    csel        x8, x12, x8,le
280    sub         x20, x2, x4
281    csel        x2, x20, x2,le
282    add         x20, x2, #8
283    csel        x2, x20, x2,le
284    csel        x11, x4, x11,le
285    bgt         lbl284
286    adrp        x14,  :got:col_for_intra_luma
287    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
288lbl284:
289    add         x20, x0, #8
290    csel        x0, x20, x0,le
291
    // Pre-compute the index/fract inputs of the next block so the kernel
    // can start from them: v25 = idx, v23 = low product bytes (fract source),
    // v26 = base index, v16 = 8.
292    mov         x5,x2                       //x5 reused: second dst pointer
293    ld1         {v31.8b},[x14],#8
294    smull       v12.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
295    xtn         v23.8b,  v12.8h
296    sshr        v12.8h, v12.8h,#5
297    sqxtn       v25.8b,  v12.8h
298    ldr         w9,  [x8]
299    sxtw        x9,w9
300    add         x9, x0, x9
301    sub         x9, x9, #1
302    dup         v26.8b,w9
303    movi        v16.8b, #8
304
305    sub         x4,x4,#8                    //x4 = nt - 8 from here on
306
// ---------------------------------------------------------------------------
// Software-pipelined loop, one pass per remaining 8x8 block. Each pass
//   - finishes rows 6 and 7 of the previous block and stores rows 4..7
//     through x5,
//   - computes and stores rows 0..3 of the current block through x2,
//   - computes rows 4 and 5 of the current block and leaves rows 6/7 inputs
//     in v14/v15 and v4/v5 for the next pass (or the epilogue),
//   - prepares idx/fract/base index for the block after that.
// The flags from `subs x11` at the top must survive until `subs x10` at the
// bottom; only add/sub/csel/ldr/sxtw and NEON ops sit in between.
// NOTE(review): on the first pass x5 == x2 (mov x5, x2 above), so the four
// x5 stores appear to land on rows 0..3 of the new block and are then
// overwritten by the x2 stores of the same pass; on later passes
// x5 = previous block start + 4*dst_strd.
// ---------------------------------------------------------------------------
307kernel_8_16_32:
308
309    sub         v1.8b,  v26.8b ,  v25.8b    //ref_main_idx
310    mov         v26.8b, v23.8b              //v26 now carries the fract source bytes
311
312    subs        x11, x11, #8
313    sub         x6, x1, x9
314    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
315    add         v1.8b,  v1.8b ,  v16.8b     //to compensate the pu1_src idx incremented by 8
316
317    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
318    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx - 1 (row 7)
319    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
320
321    sub         v19.8b,  v1.8b ,  v2.8b     //ref_main_idx - 1
322    add         x20, x0, #8
323    csel        x0, x20, x0,le
324    add         x20, x8, #4
325    csel        x8, x20, x8,gt
326    ld1         {v0.16b}, [x6]              //16 reference bytes used as the tbl table (from least idx)
327
328    st1         {v24.8b},[x5], x3           //st (row 4)
329    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)
330
331    bgt         lbl323
332    adrp        x14,  :got:col_for_intra_luma
333    ldr         x14,  [x14, #:got_lo12:col_for_intra_luma]
334lbl323:
335    csel        x8, x12, x8,le
336    dup         v27.8b,w0                   //row value inc or reset accordingly
337
338    sub         v4.8b,  v1.8b ,  v2.8b      //ref_main_idx (row 1)
339    tbl         v12.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 0)
340    sub         v5.8b,  v19.8b ,  v2.8b     //ref_main_idx - 1 (row 1)
341
342
343    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
344    tbl         v13.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 0)
345    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)
346
347    ld1         {v31.8b},[x14],#8
348    and         v6.8b,  v29.8b ,  v26.8b    //fract for the current block = fract source & 31
349
350    st1         {v22.8b},[x5], x3           //(from previous loop)st (row 5)
351    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
352
353    sub         v1.8b,  v1.8b ,  v3.8b      //ref_main_idx (row 2)
354    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 1)
355    sub         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx - 1 (row 2)
356
357    add         x20, x4, #8
358    csel        x11, x20, x11,le            //le: column counter reloaded with (nt - 8) + 8
359    ldr         w9,  [x8]
360    sxtw        x9,w9
361    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
362
363    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
364    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
365    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
366
367    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
368    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
369
370    sub         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 3)
371    tbl         v14.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 2)
372    sub         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx - 1 (row 3)
373
374    umull       v22.8h, v23.8b, v7.8b       //mul (row 1)
375    tbl         v15.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 2)
376    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
377
378    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
379    st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)
380
381    sub         v1.8b,  v1.8b ,  v3.8b      //ref_main_idx (row 4)
382    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
383    sub         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx - 1 (row 4)
384
385    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
386    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
387    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
388
389    smull       v14.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
390    add         x5,x2,x3,lsl#2              //x5 = current block start + 4*dst_strd (rows 4..7)
391    add         x9, x0, x9
392
393    st1         {v24.8b},[x2], x3           //st (row 0)
394    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
395
396    sub         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 5)
397    tbl         v12.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 4)
398    sub         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx - 1 (row 5)
399
400    umull       v18.8h, v23.8b, v7.8b       //mul (row 3)
401    tbl         v13.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 4)
402    umlal       v18.8h, v25.8b, v6.8b       //mul (row 3)
403
404    st1         {v22.8b},[x2], x3           //st (row 1)
405    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
406
407    xtn         v23.8b,  v14.8h             //fract source bytes for the next block
408    sshr        v14.8h, v14.8h,#5
409
410    sub         v1.8b,  v1.8b ,  v3.8b      //ref_main_idx (row 6)
411    tbl         v21.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 5)
412    sub         v19.8b,  v19.8b ,  v3.8b    //ref_main_idx - 1 (row 6)
413
414    umull       v24.8h, v12.8b, v7.8b       //mul (row 4)
415    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 5)
416    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
417
418    st1         {v20.8b},[x2], x3           //st (row 2)
419    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
420
421    sub         x9, x9, #1
422    sqxtn       v25.8b,  v14.8h             //idx for the next block
423
424    sub         v4.8b,  v4.8b ,  v3.8b      //ref_main_idx (row 7)
425    tbl         v14.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 6)
426    sub         v5.8b,  v5.8b ,  v3.8b      //ref_main_idx - 1 (row 7)
427
428    umull       v22.8h, v21.8b, v7.8b       //mul (row 5)
429    tbl         v15.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 6)
430    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
431
432    add         v25.8b,  v27.8b ,  v25.8b   //ref_main_idx (add row)
433    dup         v26.8b,w9
434
435    st1         {v18.8b},[x2], x3           //st (row 3)
436    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
437
    // x2 has advanced 4 rows; skip the 4 rows that x5 will store, then move
    // to the next block: gt -> x2 += 8 - 8*dst_strd, le -> x2 -= nt - 8.
438    add         x2, x2, x3, lsl #2
439    sub         v25.8b,  v25.8b ,  v2.8b    //ref_main_idx -1 (sub 1)
440    add         x20, x7, x2
441    csel        x2, x20, x2,gt
442
443    sub         x20, x2, x4
444    csel        x2, x20, x2,le
445
446    subs        x10, x10, #8                //subtract 8 and go to end if 8x8
447
448    bne         kernel_8_16_32
449
// ---------------------------------------------------------------------------
// Drain the pipeline: finish and store rows 4..7 of the last block via x5.
// Row 5 is narrowed into v24 here, after v24 (row 4) has been stored.
// ---------------------------------------------------------------------------
450epil_8_16_32:
451    tbl         v23.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 7)
452
453    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
454    tbl         v25.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 7)
455    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
456
457    st1         {v24.8b},[x5], x3           //st (row 4)
458    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)
459
460    umull       v18.8h, v23.8b, v7.8b       //mul (row 7)
461    umlal       v18.8h, v25.8b, v6.8b       //mul (row 7)
462
463    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 5)
464    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
465
466    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
467    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
468
469    st1         {v18.8b},[x5], x3           //st (row 7)
470
471    b           end_func
472
// ---------------------------------------------------------------------------
// nt == 4: same arithmetic as the 8x8 prologue, but only four rows are
// produced and only the low 32 bits (4 pixels) of each result are stored.
// ---------------------------------------------------------------------------
473sz_4_proc:
474    ld1         {v31.8b},[x14]
475    movi        v2.8b, #1                   //contains #1 for adding to get ref_main_idx - 1
476
477    movi        v3.8b, #2
478    adrp        x12, :got:idx_neg_idx_3_9   //load least idx table
479    ldr         x12, [x12, #:got_lo12:idx_neg_idx_3_9]
480
481    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
482    sub         x7, x5, #3                  //mode - 3: row selector for idx_neg_idx_3_9
483
484    add         x12, x12, x7, lsl #4        //16 bytes per mode
485    mov         x8, x12
486
487    ldr         w9,  [x8]
488    sxtw        x9,w9
489
490    dup         v26.8b,w9                   //least idx added to final idx values
491    add         x6, x0, x4, lsl #1          //pu1_ref + 2nt
492
493    xtn         v6.8b,  v22.8h
494    sub         x6, x6, #9                  //ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row
495    sub         x6, x6, x9
496
497    ld1         {v0.16b}, [x6]              //16 reference bytes used as the tbl table (from least idx)
498
499    movi        v29.8b, #31                 //contains #31 for vand operation
500
501    movi        v28.8b, #32
502
503    sshr        v22.8h, v22.8h,#5
504    sqxtn       v1.8b,  v22.8h
505
506    and         v6.8b,  v6.8b ,  v29.8b     //fract = low byte of product & 31
507    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
508
509    movi        v27.8b, #7                  //row 0 to 7(row-1)
510    sub         v1.8b,  v1.8b ,  v2.8b      //ref_main_idx (add 1)
511    sub         v1.8b,  v26.8b ,  v1.8b     //ref_main_idx
512    add         v1.8b,  v1.8b ,  v27.8b     //to compensate the pu1_src idx offset (v27 = 7 here)
513    sub         v19.8b,  v1.8b ,  v2.8b     //ref_main_idx - 1
514
515    sub         v4.8b,  v1.8b ,  v2.8b      //row 1 ref_main_idx
516    sub         v5.8b,  v19.8b ,  v2.8b
517
518    tbl         v12.8b, {v0.16b},v1.8b      //load from ref_main_idx (row 0)
519    tbl         v13.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 0)
520
521
522    umull       v24.8h, v12.8b, v7.8b       //mul (row 0)
523    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx    (row 1)
524    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
525
526    sub         v1.8b,  v1.8b ,  v3.8b      //idx (row 2)
527    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 1)
528    sub         v19.8b,  v19.8b ,  v3.8b    //idx+1 (row 2)
529
530    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
531    tbl         v12.8b, {v0.16b},v1.8b      //load from ref_main_idx    (row 2)
532    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
533
534    rshrn       v24.8b, v24.8h,#5           //round shift (row 0)
535
536    sub         v4.8b,  v4.8b ,  v3.8b      //idx (row 3)
537    tbl         v13.8b, {v0.16b},v19.8b     //load from ref_main_idx + 1 (row 2)
538    sub         v5.8b,  v5.8b ,  v3.8b      //idx+1 (row 3)
539
540    umull       v20.8h, v12.8b, v7.8b       //mul (row 2)
541    tbl         v16.8b, {v0.16b},v4.8b      //load from ref_main_idx (row 3)
542    umlal       v20.8h, v13.8b, v6.8b       //mul (row 2)
543
544    st1         {v24.s}[0],[x2], x3         //st row 0
545    rshrn       v22.8b, v22.8h,#5           //round shift (row 1)
546
547    tbl         v17.8b, {v0.16b},v5.8b      //load from ref_main_idx + 1 (row 3)
548
549    umull       v18.8h, v16.8b, v7.8b       //mul (row 3)
550    umlal       v18.8h, v17.8b, v6.8b       //mul (row 3)
551
552    st1         {v22.s}[0],[x2], x3         //st row 1
553    rshrn       v20.8b, v20.8h,#5           //round shift (row 2)
554
555    st1         {v20.s}[0],[x2], x3         //st row 2
556
557    rshrn       v18.8b, v18.8h,#5           //round shift (row 3)
558
559    st1         {v18.s}[0],[x2], x3         //st (row 3)
560
// Common exit: restore registers in the reverse order of the entry stp's.
561end_func:
562    // ldmfd sp!,{x4-x12,x15}          //reload the registers from sp
563    ldp         x19, x20,[sp],#16
564    ldp         d14,d15,[sp],#16
565    ldp         d12,d13,[sp],#16
566    ret
567
568
569
570
571