1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_intra_pred_chroma_mode_3_to_9.s
22//*
23//* @brief
24//*  contains function definitions for chroma intra prediction in angular
25//* modes 3 to 9. functions are hand-coded in armv8 (aarch64) neon assembly
26
27//* using gnu assembler syntax
28//*
29//* @author
30//*  parthiban v
31//*
32//* @par list of functions:
33//*
34//*
35//* @remarks
36//*  none
37//*
38//*******************************************************************************
39//*/
40///**
41//*******************************************************************************
42//*
43//* @brief
44//*    chroma intra prediction for angular modes 3 to 9
45//*
46//* @par description:
47//*
48//* @param[in] pu1_ref
49//*  uword8 pointer to the source
50//*
51//* @param[out] pu1_dst
52//*  uword8 pointer to the destination
53//*
54//* @param[in] src_strd
55//*  integer source stride
56//*
57//* @param[in] dst_strd
58//*  integer destination stride
59//*
60//* @param[in] nt
61//*  size of transform block
62//*
63//* @param[in] mode
64//*  intra prediction mode (used to index the angle and index tables)
65//*
66//* @returns
67//*
68//* @remarks
69//*  none
70//*
71//*******************************************************************************
72//*/
73//void ihevc_intra_pred_chroma_mode_3_to_9(uword8 *pu1_ref,
74//                                       word32 src_strd,
75//                                       uword8 *pu1_dst,
76//                                       word32 dst_strd,
77//                                       word32 nt,
78//                                       word32 mode)
79//**************variables vs registers*****************************************
80//x0 => *pu1_ref
81//x1 => src_strd
82//x2 => *pu1_dst
83//x3 => dst_strd
84
85//nt and mode are read from registers in this armv8 version (no stack arguments):
86//x4 => nt
87//x5 => mode
88
// Place the routine in the code section. With GNU as on AArch64 the operand of
// .align is a power of two, so ".align 4" requests 16-byte alignment.
89.text
90.align 4
91
// NOTE(review): the contents of this include are not visible here; the function
// below appears to use only plain A64 instructions - confirm whether any macro
// from it is actually needed.
92.include "ihevc_neon_macros.s"
93
94
95
// Exported entry point.
96.globl ihevc_intra_pred_chroma_mode_3_to_9_av8
// Tables defined in other translation units; the function fetches their
// addresses through the GOT (adrp :got: / ldr :got_lo12:).
// NOTE(review): gai4_ihevc_inv_ang_table is loaded by the function but the
// register holding it is overwritten before it is read.
97.extern gai4_ihevc_ang_table
98.extern gai4_ihevc_inv_ang_table
99.extern col_for_intra_chroma
100.extern idx_neg_idx_chroma_3_9
101
// Mark the symbol as a function in the object file's symbol table.
102.type ihevc_intra_pred_chroma_mode_3_to_9_av8, %function
103
//------------------------------------------------------------------------------
// ihevc_intra_pred_chroma_mode_3_to_9_av8
//
// Registers on entry, as the code below uses them:
//   x0 = pu1_ref    x2 = pu1_dst    x3 = dst_strd    x4 = nt    x5 = mode
//   x1 (src_strd) is never read; x1 is overwritten and reused as a pointer
//   into the reference array.
//
// Output is written in tiles of 8 bytes x 8 rows. For each output byte two
// reference bytes are picked with tbl from a 32-byte window {v0, v1} and
// blended:
//   out = (first * (32 - fract) + second * fract), rounded, >> 5
// (umull + umlal + rshrn #5).
//
// Long-lived register roles:
//   x7  = 8 - 8 * dst_strd (moves pu1_dst up 8 rows and right 8 bytes)
//   x8  = pointer to the current 32-bit entry of the idx_neg_idx_chroma_3_9 row
//   x10 = tile counter (decremented by 4 per tile)
//   x11 = column byte counter (starts at 2 * nt, decremented by 8 per tile)
//   x12 = base of the idx_neg_idx_chroma_3_9 row for this mode
//   x14 = read pointer into col_for_intra_chroma
//   x20 = scratch for the add/sub + csel pairs (conditional updates)
//   v30 = intra_pred_ang broadcast, v28 = #32, v6 = fract, v7 = 32 - fract
//
// The main loop is software pipelined: rows 4-7 of a tile are finished and
// stored (through x5) during the next iteration, or in epil_8_16_32.
//
// Saved/restored here: d13, d14, d15 (v13-v15 are written), d8 (padding
// partner for d15) and x19/x20.
//------------------------------------------------------------------------------
104ihevc_intra_pred_chroma_mode_3_to_9_av8:
105
106    // stmfd sp!, {x4-x12, x14}        //commented-out stmfd (an A32 mnemonic), presumably left over from a 32-bit version
107
108    stp         d13,d14,[sp,#-16]!          //save d13/d14; v13 and v14 are written below
109    stp         d8,d15,[sp,#-16]!           // Storing d15 alone using { sub sp,sp,#8; str d15,[sp] } gave a bus error (that leaves sp only 8-byte aligned).
110                                            // d8 is a dummy partner so d15 can be pushed with a 16-byte stp. d8 is not used in the function.
111    stp         x19, x20,[sp,#-16]!         //x20 is the csel scratch register; x19 is not otherwise referenced in this function
112
113    adrp        x7,  :got:gai4_ihevc_ang_table
114    ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table] //x7 = &gai4_ihevc_ang_table (via GOT)
115
116    adrp        x8,  :got:gai4_ihevc_inv_ang_table
117    ldr         x8,  [x8, #:got_lo12:gai4_ihevc_inv_ang_table] //NOTE(review): x8 is overwritten (mov x8, x12 below) before this value is read
118
119    add         x7, x7, x5, lsl #2          //&gai4_ihevc_ang_table[mode] (4-byte entries)
120    ldr         w7,  [x7]                   //intra_pred_ang
121    sxtw        x7,w7                       //sign-extend the 32-bit table entry
122    dup         v30.8b,w7                   //broadcast the low byte of intra_pred_ang to 8 lanes
123
124    adrp        x14,  :got:col_for_intra_chroma
125    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma] //x14 = &col_for_intra_chroma (via GOT)
126
// First tile: sets up the counters and constants, then produces and stores all
// 8 rows of the first 8-byte-wide tile (only 4 rows when nt == 4).
127prologue_8_16_32:
128    lsr         x10, x4, #3                 //nt / 8
129    ld1         {v31.8b},[x14],#8           //first 8 bytes of col_for_intra_chroma, post-increment x14
130    mul         x10, x4, x10                //tile counter = nt * (nt / 8); decremented by 4 per tile
131
132    lsl         x11, x4, #1                 //column byte counter = 2 * nt, decremented by 8 per tile
133    smull       v22.8h, v30.8b, v31.8b      //table value * intra_pred_ang, signed 16-bit product per lane
134
135    sub         x7, x5, #3                  //mode - 3
136    adrp        x12,  :got:idx_neg_idx_chroma_3_9 //load most idx table
137    ldr         x12, [x12,  #:got_lo12:idx_neg_idx_chroma_3_9]
138
139    add         x12, x12, x7, lsl #4        //x12 = table base + (mode - 3) * 16 bytes
140    mov         x8, x12                     //x8 = current entry pointer, starts at the row base
141
142    mov         x7, #8
143    sub         x7, x7, x3, lsl #3          //x7 = 8-8x3 (8 - 8 * dst_strd)
144
145    ldr         w9,  [x8]                   //first index value of the row
146    sxtw        x9,w9
147    lsl         x9, x9, #1                  //x9 = 2 * index value
148    add         x1, x0, x4, lsl #2          //pu1_ref + 4*nt
149
150    xtn         v6.8b,  v22.8h              //low byte of each product (masked to fract below)
151    dup         v26.8b,w9                   //most idx added to final idx values
152    sub         x1, x1, #26                 //x1 = pu1_ref + 4*nt - 26 (original note: ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row)
153
154    sub         x6, x1, x9                  //start of the 32-byte reference window
155
156    ld1         {v0.16b, v1.16b}, [x6]      //load the 32 reference bytes used as the tbl table
157    sshr        v22.8h, v22.8h,#5           //product >> 5 (arithmetic)
158
159    movi        v29.8b, #31                 //contains #31 for vand operation
160
161    movi        v28.8b, #32                 //constant 32 for (32 - fract)
162
163    sqxtn       v2.8b,  v22.8h              //saturating narrow of (product >> 5) to bytes
164    shl         v2.8b, v2.8b,#1             // 2 * idx
165
166    and         v6.8b,  v6.8b ,  v29.8b     //fract = low byte of product & 31
167    movi        v29.8b, #2                  //v29 now holds #2: byte distance between the two taps
168
169    mov         x0,#0x302                   //0x0302 per halfword: byte lanes alternate 2, 3 (idx value for v is +1 of u)
170    dup         v27.4h,w0
171    mov         x0,#0                       //x0 is reused from here on as an offset that grows by 8 when the column counter wraps
172
173    movi        v3.8b, #22                  //constant 22 added to the lookup indices below
174
175    sub         v2.8b,  v2.8b ,  v27.8b     //2*idx - {2,3,...}
176    sub         v2.8b,  v26.8b ,  v2.8b     //(2 * index value) - previous result: first-tap lookup index (row 0)
177    add         v2.8b,  v2.8b ,  v3.8b      //+22 (original note: to compensate the pu1_src idx incremented by 8)
178    sub         v3.8b,  v2.8b ,  v29.8b     //second-tap lookup index = first - 2 (row 0)
179    tbl         v25.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 0)
180    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
181
182    tbl         v13.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 0)
183    sub         v4.8b,  v2.8b ,  v29.8b     //first-tap index (row 1) = row 0 index - 2
184    sub         v5.8b,  v3.8b ,  v29.8b     //second-tap index (row 1)
185
186    movi        v29.8b, #4                  //v29 now holds #4: v2/v3 carry even rows and v4/v5 odd rows, each stepping by 4
187
188    tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 1)
189    umull       v24.8h, v25.8b, v7.8b       //first * (32 - fract) (row 0)
190    umlal       v24.8h, v13.8b, v6.8b       //+ second * fract (row 0)
191
192    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 1)
193    sub         v2.8b,  v2.8b ,  v29.8b     //first-tap index (row 2)
194    sub         v3.8b,  v3.8b ,  v29.8b     //second-tap index (row 2)
195
196    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
197
198    tbl         v14.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 2)
199    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
200    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
201
202    tbl         v15.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 2)
203    sub         v4.8b,  v4.8b ,  v29.8b     //first-tap index (row 3)
204    sub         v5.8b,  v5.8b ,  v29.8b     //second-tap index (row 3)
205
206    st1         {v24.8b},[x2], x3           //st (row 0), advance dst by dst_strd
207    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
208
209    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 3)
210    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
211    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
212
213    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 3)
214    sub         v2.8b,  v2.8b ,  v29.8b     //first-tap index (row 4)
215    sub         v3.8b,  v3.8b ,  v29.8b     //second-tap index (row 4)
216
217    st1         {v22.8b},[x2], x3           //st (row 1)
218    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
219
220    tbl         v25.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 4)
221    umull       v18.8h, v19.8b, v7.8b       //mul (row 3)
222    umlal       v18.8h, v23.8b, v6.8b       //mul (row 3)
223
224    tbl         v13.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 4)
225    sub         v4.8b,  v4.8b ,  v29.8b     //first-tap index (row 5)
226    sub         v5.8b,  v5.8b ,  v29.8b     //second-tap index (row 5)
227
228    st1         {v20.8b},[x2], x3           //st (row 2)
229    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
230
231    tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 5)
232    umull       v24.8h, v25.8b, v7.8b       //mul (row 4)
233    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
234
235    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 5)
236    sub         v2.8b,  v2.8b ,  v29.8b     //first-tap index (row 6)
237    sub         v3.8b,  v3.8b ,  v29.8b     //second-tap index (row 6)
238
239    st1         {v18.8b},[x2], x3           //st (row 3)
240    cmp         x4,#4                       //nt == 4: only the four rows stored so far are written
241    beq         end_func
242    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
243
244    tbl         v14.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 6)
245    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
246    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
247
248    tbl         v15.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 6)
249    sub         v4.8b,  v4.8b ,  v29.8b     //first-tap index (row 7)
250    sub         v5.8b,  v5.8b ,  v29.8b     //second-tap index (row 7)
251
252    st1         {v24.8b},[x2], x3           //st (row 4)
253    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)
254
255    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 7)
256    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
257    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
258
259    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 7)
260    umull       v18.8h, v19.8b, v7.8b       //mul (row 7)
261    umlal       v18.8h, v23.8b, v6.8b       //mul (row 7)
262
263    st1         {v22.8b},[x2], x3           //st (row 5)
264    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
265    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)
266
267    st1         {v20.8b},[x2], x3           //st (row 6)
268
269    subs        x10, x10, #4                //tile counter -= 4; zero means this was the only tile (tested by the beq below)
270
271    st1         {v18.8b},[x2], x3           //st (row 7)
272
273    beq         end_func
274
// Set-up for the first kernel iteration. The flags from the subs below stay
// live through the csel run and the bgt: none of the instructions in between
// writes the condition flags.
275    subs        x11, x11, #8                //decrement the processed col
276    add         x20, x8, #4
277    csel        x8, x20, x8,gt              //gt: step to the next 4-byte index entry
278    add         x20, x2, x7
279    csel        x2, x20, x2,gt              //gt: dst moves up 8 rows and right 8 bytes
280    csel        x8, x12, x8,le              //le: index pointer back to the row base
281    sub         x20, x2, x4
282    csel        x2, x20, x2,le              //le: dst -= nt
283    add         x20, x2, #8
284    csel        x2, x20, x2,le              //le: dst += 8
285    lsl         x20, x4,  #1
286    csel        x11,x20,x11,le              //le: reload the column byte counter with 2 * nt
287    bgt         lbl284                      //gt: keep walking col_for_intra_chroma
288    adrp        x14,  :got:col_for_intra_chroma
289    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma] //le: restart at the beginning of col_for_intra_chroma
290lbl284:
291    add         x20, x0, #8
292    csel        x0, x20, x0,le              //le: x0 += 8 (flags still from the subs on x11)
293
294    ld1         {v31.8b},[x14],#8           //next 8 table bytes
295    smull       v25.8h, v30.8b, v31.8b      //table value * intra_pred_ang
296    xtn         v19.8b,  v25.8h             //low bytes of the product, turned into fract inside the kernel
297    sshr        v25.8h, v25.8h,#5
298    sqxtn       v23.8b,  v25.8h
299    shl         v23.8b, v23.8b,#1           //2 * idx
300    mov         x5, #0x302                  //idx value for v is +1 of u (x5/mode is no longer needed and is reused)
301    dup         v27.4h,w5                   //byte lanes alternate 2, 3
302    ldr         w9,  [x8]                   //loads index value
303    sxtw        x9,w9
304    lsl         x9, x9, #1                  //x9 = 2 * index value
305    mov         x5, #22
306    sub         x5, x5, x0, lsl #1          //22 - 2 * x0
307    dup         v16.8b,w5                   //per-lane constant added to the lookup indices in the kernel
308    dup         v26.8b,w9
309
310    mov         x5,x2                       //x5 = store pointer used for the pipelined "rows 4-7" stores
311    sub         v23.8b,  v23.8b ,  v27.8b   //2*idx - {2,3,...}
312
// Main loop, one tile per iteration. Each iteration first finishes and stores
// rows 4-7 left over from the previous tile (through x5), computes and stores
// rows 0-3 of the current tile (through x2), starts rows 4-6, and prepares the
// indices and fract source for the next tile.
313kernel_8_16_32:
314    movi        v29.8b, #2                  //byte distance between the two taps
315    sub         v2.8b,  v26.8b ,  v23.8b    //first-tap lookup index for this tile
316    mov         v26.8b, v19.8b              //keep the low product bytes; masked to fract below
317
318    subs        x11, x11, #8                //column byte counter; these flags drive every csel/bgt until the subs on x10 at the loop end
319    sub         x6, x1, x9                  //start of this tile's 32-byte reference window
320    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 7, previous tile, old window)
321    add         v2.8b,  v2.8b ,  v16.8b     //+ (22 - 2 * x0) (original note: to compensate the pu1_src idx incremented by 8)
322
323    umull       v20.8h, v14.8b, v7.8b       //mul (row 6, previous tile)
324    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 7, previous tile)
325    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6, previous tile)
326
327    add         x20, x0, #8
328    csel        x0, x20, x0,le              //le: x0 += 8
329    sub         v3.8b,  v2.8b ,  v29.8b     //second-tap lookup index = first - 2
330    add         x20, x8, #4
331    csel        x8, x20, x8,gt              //gt: next 4-byte index entry
332
333    ld1         {v0.16b, v1.16b}, [x6]      //load the 32 reference bytes for this tile
334    rshrn       v22.8b, v22.8h,#5           //round shft (row 5, previous tile)
335
336    bgt         lbl326
337    adrp        x14,  :got:col_for_intra_chroma
338    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma] //le: restart at the beginning of col_for_intra_chroma
339lbl326:
340    st1         {v24.8b},[x5], x3           //st (row 4, previous tile)
341    csel        x8, x12, x8,le              //le: index pointer back to the row base
342
343    mov         x9,#0x302
344    dup         v27.4h,w9                   //byte lanes alternate 2, 3
345    sub         v4.8b,  v2.8b ,  v29.8b     //first-tap index (row 1)
346
347    sub         v5.8b,  v3.8b ,  v29.8b     //second-tap index (row 1)
348    tbl         v25.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 0)
349    movi        v29.8b, #31                 //contains #31, the fract mask
350
351    umull       v18.8h, v19.8b, v7.8b       //mul (row 7, previous tile)
352    tbl         v13.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 0)
353    umlal       v18.8h, v23.8b, v6.8b       //mul (row 7, previous tile)
354
355    ld1         {v31.8b},[x14],#8           //table bytes for the next tile
356    and         v6.8b,  v29.8b ,  v26.8b    //fract for this tile = low product byte & 31
357
358    lsl         x20, x4,  #1
359    csel        x11,x20,x11,le              //le: reload the column byte counter with 2 * nt
360    movi        v29.8b, #4                  //contains #4: per-register row step of the lookup indices
361    ldr         w9,  [x8]                   //index value for the next tile
362    sxtw        x9,w9
363
364    st1         {v22.8b},[x5], x3           //(from previous loop)st (row 5)
365    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)
366
367    sub         v2.8b,  v2.8b ,  v29.8b     //first-tap index (row 2)
368    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 1)
369    sub         v3.8b,  v3.8b ,  v29.8b     //second-tap index (row 2)
370
371    lsl         x9, x9, #1                  //2 * index value
372    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract
373
374    umull       v24.8h, v25.8b, v7.8b       //mul (row 0)
375    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 1)
376    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)
377
378    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
379    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)
380
381    sub         v4.8b,  v4.8b ,  v29.8b     //first-tap index (row 3)
382    tbl         v14.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 2)
383    sub         v5.8b,  v5.8b ,  v29.8b     //second-tap index (row 3)
384
385    umull       v22.8h, v19.8b, v7.8b       //mul (row 1)
386    tbl         v15.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 2)
387    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)
388
389    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
390    st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)
391
392    sub         v2.8b,  v2.8b ,  v29.8b     //first-tap index (row 4)
393    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 3)
394    sub         v3.8b,  v3.8b ,  v29.8b     //second-tap index (row 4)
395
396    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
397    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 3)
398    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)
399
400    add         x5,x2,x3,lsl#2              //x5 = address of row 4 of this tile (stored by the next iteration or the epilogue)
401    smull       v14.8h, v30.8b, v31.8b      //table value * intra_pred_ang for the next tile
402    add         x9, x9, x0, lsl #1          //x9 = 2 * index value + 2 * x0
403
404    st1         {v24.8b},[x2], x3           //st (row 0)
405    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)
406
407    sub         v4.8b,  v4.8b ,  v29.8b     //first-tap index (row 5)
408    tbl         v25.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 4)
409    sub         v5.8b,  v5.8b ,  v29.8b     //second-tap index (row 5)
410
411    umull       v18.8h, v19.8b, v7.8b       //mul (row 3)
412    tbl         v13.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 4)
413    umlal       v18.8h, v23.8b, v6.8b       //mul (row 3)
414
415    st1         {v22.8b},[x2], x3           //st (row 1)
416    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)
417
418    xtn         v19.8b,  v14.8h             //low product bytes for the next tile (copied to v26 at the loop top)
419    sshr        v14.8h, v14.8h,#5           //product >> 5 for the next tile
420
421    sub         v2.8b,  v2.8b ,  v29.8b     //first-tap index (row 6)
422    tbl         v21.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 5)
423    sub         v3.8b,  v3.8b ,  v29.8b     //second-tap index (row 6)
424
425    umull       v24.8h, v25.8b, v7.8b       //mul (row 4)
426    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 5)
427    sqxtn       v23.8b,  v14.8h             //saturating narrow of the next tile's idx
428
429    st1         {v20.8b},[x2], x3           //st (row 2)
430    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)
431
432    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
433    dup         v26.8b,w9                   //broadcast x9 for the next tile's index computation
434
435    sub         v4.8b,  v4.8b ,  v29.8b     //first-tap index (row 7)
436    tbl         v14.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 6)
437    sub         v5.8b,  v5.8b ,  v29.8b     //second-tap index (row 7)
438
439    mov         x6, #22                     //to compensate the 2*row value
440    shl         v23.8b, v23.8b,#1           //2 * idx (next tile)
441    sub         x6, x6, x0, lsl #1          //22 - 2 * x0
442
443    umull       v22.8h, v21.8b, v7.8b       //mul (row 5)
444    tbl         v15.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 6)
445    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)
446
447    st1         {v18.8b},[x2], x3           //st (row 3)
448    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)
449
450    add         x2,x2,x3, lsl #2            //skip rows 4-7 (written through x5): x2 is now 8 rows below the tile top
451    dup         v16.8b,w6                   //constant added to the next tile's lookup indices
452    add         x20, x7, x2
453    csel        x2, x20, x2,gt              //gt: dst moves up 8 rows and right 8 bytes
454
455    sub         x20, x2, x4
456    csel        x2, x20, x2,le              //le: dst -= nt
457    sub         v23.8b,  v23.8b ,  v27.8b   //2*idx - {2,3,...} (next tile)
458    sub         x20,x2,#8
459    csel        x2, x20, x2,le              //le: dst -= 8 (NOTE(review): the set-up before the loop adds 8 on its le path instead - confirm intended)
460
461    subs        x10, x10, #4                //tile counter -= 4; loop until it reaches zero
462
463    bne         kernel_8_16_32
464
// Epilogue: finish and store rows 4-7 of the last tile through x5.
465epil_8_16_32:
466    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 7)
467
468    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
469    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 7)
470    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)
471
472    st1         {v24.8b},[x5], x3           //st (row 4)
473    rshrn       v24.8b, v22.8h,#5           //round shft (row 5); v24 is reused as the destination after row 4 was stored
474
475    umull       v18.8h, v19.8b, v7.8b       //mul (row 7)
476    umlal       v18.8h, v23.8b, v6.8b       //mul (row 7)
477
478    st1         {v24.8b},[x5], x3           //st (row 5)
479    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
480
481    st1         {v20.8b},[x5], x3           //st (row 6)
482    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)
483
484    st1         {v18.8b},[x5], x3           //st (row 7)
485
// Common exit: restore the registers pushed at entry, in reverse order.
486end_func:
487    // ldmfd sp!,{x4-x12,x15}               //commented-out ldmfd (an A32 mnemonic), counterpart of the stmfd note at entry
488    ldp         x19, x20,[sp],#16
489    ldp         d8,d15,[sp],#16             // Loading d15 alone using { ldr d15,[sp]; add sp,sp,#8 } gave a bus error.
490                                            // d8 is the dummy partner popped together with d15 using ldp. d8 is not used in the function.
491    ldp         d13,d14,[sp],#16
492    ret
493
494
495
496
497
498
499
500
501