1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
//* @file
//*  ihevc_sao_band_offset_luma.s
//*
//* @brief
//*  Contains the function definition for SAO (sample adaptive offset) band
//*  offset filtering of luma samples. The function is coded in AArch64 NEON
//*  assembly using GNU assembler syntax.
//*
//* @author
//*  Parthiban V
//*
//* @par List of Functions:
//*  - ihevc_sao_band_offset_luma_av8()
//*
//* @remarks
//*  None
36//*
37//*******************************************************************************
38//*/
39//void ihevc_sao_band_offset_luma(UWORD8 *pu1_src,
40//                           WORD32 src_strd,
41//                           UWORD8 *pu1_src_left,
42//                           UWORD8 *pu1_src_top,
43//                           UWORD8 *pu1_src_top_left,
44//                           WORD32 sao_band_pos,
45//                           WORD8 *pi1_sao_offset,
46//                           WORD32 wd,
47//                           WORD32 ht)
48//
//**************Variables Vs Registers*****************************************
//x0    =>    *pu1_src
//x1    =>    src_strd
//x2    =>    *pu1_src_left
//x3    =>    *pu1_src_top
//x4    =>    *pu1_src_top_left
//x5    =>    sao_band_pos
//x6    =>    *pi1_sao_offset
//x7    =>    wd
//x8    =>    ht (ninth argument; passed on the stack and loaded from [sp] on entry)
59
60
61.set WIDE_REFERENCE, 0
62.set ARCHITECTURE, 5
63.set DO1STROUNDING, 0
64
65.include "ihevc_neon_macros.s"
66
67.text
68.p2align 2
69
70.globl gu1_table_band_idx
71.globl ihevc_sao_band_offset_luma_av8
72
//-----------------------------------------------------------------------------
// void ihevc_sao_band_offset_luma_av8(UWORD8 *pu1_src,          // x0
//                                     WORD32 src_strd,          // w1
//                                     UWORD8 *pu1_src_left,     // x2
//                                     UWORD8 *pu1_src_top,      // x3
//                                     UWORD8 *pu1_src_top_left, // x4
//                                     WORD32 sao_band_pos,      // w5
//                                     WORD8 *pi1_sao_offset,    // x6
//                                     WORD32 wd,                // w7
//                                     WORD32 ht)                // [sp]
//
// Steps performed, in order:
//   1. pu1_src_left[row]   = pu1_src[row * src_strd + wd - 1]   for every row
//   2. pu1_src_top_left[0] = pu1_src_top[wd - 1]
//   3. pu1_src_top[col]    = pu1_src[(ht - 1) * src_strd + col] for every col
//   4. Build a 32-entry byte table:
//        table[i] = gu1_table_band_idx[i] + (sao_band_pos << 3)
//                                         + pi1_sao_offset[1 + i / 8]
//      (byte-wise, wrapping), followed by the fix-ups for band positions
//      0 and 28..31 below.
//   5. For every pixel p of the wd x ht block, with idx = (UWORD8)(p - (sao_band_pos << 3)):
//        p = table[idx] if idx < 32, otherwise p is left unchanged (TBX).
//
// Loop termination relies on wd being a positive multiple of 8 and ht a
// positive multiple of 4 (counters are decremented by 8 / 4 and tested with
// BNE).
//
// Clobbers: x0, x2-x12, x14, v0-v7, v13-v31 (d13-d15 are saved/restored), flags.
//-----------------------------------------------------------------------------
ihevc_sao_band_offset_luma_av8:

    LDR         w8,[sp]                     //x8 = ht: first stack argument, read before sp is moved

    // AAPCS64 leaves bits [63:32] of a register that carries a 32-bit argument
    // unspecified. These three are used as 64-bit values below (address
    // arithmetic, compares, loop counters), so widen them explicitly first.
    sxtw        x1,w1                       //src_strd
    sxtw        x5,w5                       //sao_band_pos
    sxtw        x7,w7                       //wd

    stp         d13,d14,[sp,#-16]!          //v13-v15 are used below; their low 64 bits are callee-saved
    stp         d8,d15,[sp,#-16]!           //d8 is not used in this function: it only pads the pair so that
                                            //d15 is saved with a 16-byte push and sp stays 16-byte aligned

    MOV         x9,x8                       //x9 = row counter (ht)
    ADD         x10,x0,x7

    SUB         x10,x10,#1                  //x10 = &pu1_src[wd - 1]
    ADRP        x14, :got:gu1_table_band_idx
    LDR         x14, [x14, #:got_lo12:gu1_table_band_idx] //x14 = &gu1_table_band_idx, fetched through the GOT

SRC_LEFT_LOOP:                              //save the last column of the unfiltered block
    LDRB        w11,[x10]                   //w11 = pu1_src[row * src_strd + wd - 1]
    add         x10, x10, x1                //advance to the next row
    SUBS        x9,x9,#1                    //Decrement the loop counter
    STRB        w11,[x2],#1                 //pu1_src_left[row] = w11
    BNE         SRC_LEFT_LOOP

    ADD         x9,x3,x7                    //x9 = &pu1_src_top[wd]
    LD1         {v1.8b},[x14],#8            //band_table.val[0]

    LSL         x11,x5,#3                   //x11 = sao_band_pos << 3
    LD1         {v2.8b},[x14],#8            //band_table.val[1]

    LDRB        w10,[x9,#-1]                //w10 = pu1_src_top[wd - 1]
    dup         v31.8b,w11                  //band_pos: low byte of (sao_band_pos << 3) in every lane
    SUB         x12,x8,#1                   //ht-1

    STRB        w10,[x4]                    //store to pu1_src_top_left[0]
    LD1         {v3.8b},[x14],#8            //band_table.val[2]
    mul         x12, x12, x1                //(ht - 1) * src_strd

    ADD         x4,x12,x0                   //x4 = &pu1_src[(ht - 1) * src_strd]; pu1_src_top_left is no longer needed
    LD1         {v4.8b},[x14],#8            //band_table.val[3]
    MOV         x9,x7                       //x9 = column counter (wd)

SRC_TOP_LOOP:                               //wd is always multiple of 8
    LD1         {v0.8b},[x4],#8             //Load pu1_src[(ht - 1) * src_strd + col]
    SUBS        x9,x9,#8                    //Decrement the loop counter by 8
    ST1         {v0.8b},[x3],#8             //Store to pu1_src_top[col]
    BNE         SRC_TOP_LOOP

    // NOTE(review): this reads 8 bytes from pi1_sao_offset although only
    // bytes 1..4 are used below - confirm the buffer is at least 8 bytes long.
    LD1         {v30.8b},[x6]               //pi1_sao_offset load
    ADD         v5.8b,  v1.8b ,  v31.8b     //band_table.val[0] = vadd_u8(band_table.val[0], band_pos)

    dup         v29.8b, v30.b[1]            //vdup_n_u8(pi1_sao_offset[1])
    ADD         v6.8b,  v2.8b ,  v31.8b     //band_table.val[1] = vadd_u8(band_table.val[1], band_pos)

    dup         v28.8b, v30.b[2]            //vdup_n_u8(pi1_sao_offset[2])
    ADD         v7.8b,  v3.8b ,  v31.8b     //band_table.val[2] = vadd_u8(band_table.val[2], band_pos)

    dup         v27.8b, v30.b[3]            //vdup_n_u8(pi1_sao_offset[3])
    ADD         v21.8b,  v4.8b ,  v31.8b    //band_table.val[3] = vadd_u8(band_table.val[3], band_pos)

    dup         v26.8b, v30.b[4]            //vdup_n_u8(pi1_sao_offset[4])
    ADD         v1.8b,  v5.8b ,  v29.8b     //band_table.val[0] = vadd_u8(band_table.val[0], vdup_n_u8(pi1_sao_offset[1]))

    movi        v29.8b, #16                 //vdup_n_u8(16); v29 is reused as the compare threshold
    ADD         v2.8b,  v6.8b ,  v28.8b     //band_table.val[1] = vadd_u8(band_table.val[1], vdup_n_u8(pi1_sao_offset[2]))

    CMP         x5,#28                      //flags are consumed by BLT below and by BNE in case 28
    ADD         v3.8b,  v7.8b ,  v27.8b     //band_table.val[2] = vadd_u8(band_table.val[2], vdup_n_u8(pi1_sao_offset[3]))

    ADD         v4.8b,  v21.8b ,  v26.8b    //band_table.val[3] = vadd_u8(band_table.val[3], vdup_n_u8(pi1_sao_offset[4]))
    BLT         SAO_BAND_POS_0

    // switch (sao_band_pos): fix-ups for band positions 28..31 and 0.
    // Each mask lane is all-ones where the table byte is <= 16 (unsigned).
    // ORR with the mask forces those lanes to 0xFF; AND with the mask clears
    // the lanes that are > 16 to 0. Presumably this clamps table entries whose
    // 8-bit addition wrapped around - TODO confirm against the C reference.
    // The masks for cases 28..30 are computed before the branch (vector
    // compares do not touch the flags) because the following case reuses them.

SAO_BAND_POS_28:                            //case 28

    cmhs        v25.8b,  v29.8b ,  v4.8b    //vcle_u8(band_table.val[3], vdup_n_u8(16))

    BNE         SAO_BAND_POS_29
    ORR         v4.8b,  v4.8b ,  v25.8b     //band_table.val[3] = vorr_u8(band_table.val[3], au1_cmp)
    B           SWITCH_BREAK

SAO_BAND_POS_29:                            //case 29
    CMP         x5,#29
    cmhs        v24.8b,  v29.8b ,  v3.8b    //vcle_u8(band_table.val[2], vdup_n_u8(16))

    BNE         SAO_BAND_POS_30
    ORR         v3.8b,  v3.8b ,  v24.8b     //band_table.val[2] = vorr_u8(band_table.val[2], au1_cmp)

    AND         v4.8b,  v4.8b ,  v25.8b     //band_table.val[3] = vand_u8(band_table.val[3], au1_cmp)
    B           SWITCH_BREAK

SAO_BAND_POS_30:                            //case 30
    CMP         x5,#30
    cmhs        v23.8b,  v29.8b ,  v2.8b    //vcle_u8(band_table.val[1], vdup_n_u8(16))

    BNE         SAO_BAND_POS_31
    ORR         v2.8b,  v2.8b ,  v23.8b     //band_table.val[1] = vorr_u8(band_table.val[1], au1_cmp)

    AND         v3.8b,  v3.8b ,  v24.8b     //band_table.val[2] = vand_u8(band_table.val[2], au1_cmp)
    B           SWITCH_BREAK

SAO_BAND_POS_31:                            //case 31
    CMP         x5,#31
    BNE         SWITCH_BREAK

    cmhs        v22.8b,  v29.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    ORR         v1.8b,  v1.8b ,  v22.8b     //band_table.val[0] = vorr_u8(band_table.val[0], au1_cmp)

    AND         v2.8b,  v2.8b ,  v23.8b     //band_table.val[1] = vand_u8(band_table.val[1], au1_cmp)
    B           SWITCH_BREAK                //explicit break instead of falling into the case 0 test

SAO_BAND_POS_0:
    CMP         x5,#0                       //case 0
    BNE         SWITCH_BREAK

    cmhs        v22.8b,  v29.8b ,  v1.8b    //vcle_u8(band_table.val[0], vdup_n_u8(16))
    AND         v1.8b,  v1.8b ,  v22.8b     //band_table.val[0] = vand_u8(band_table.val[0], au1_cmp)

SWITCH_BREAK:

    // Pack the four 8-byte tables into two 16-byte registers so that
    // {v1.16b - v2.16b} forms the 32-entry table used by TBX.
    // Order matters: v2.d[0] must be consumed before it is overwritten.
    mov         v1.d[1],v2.d[0]             //v1 = val[0] : val[1]
    mov         v2.d[0],v3.d[0]
    mov         v2.d[1],v4.d[0]             //v2 = val[2] : val[3]

SWITCH_BREAK_1:                             //outer loop: one 8-pixel-wide column strip per pass

    MOV         x4,x0                       //pu1_src_cpy (row 0 of the strip)
    MOV         x11,x8                      //move ht
    ADD         x5,x4,x1                    //row 1; x5/x6 are free now, band_pos and offsets live in vectors

HEIGHT_LOOP:                                //inner loop: four rows per pass
    ADD         x6,x5,x1                    //row 2
    LD1         {v13.8b},[x4]               //row 0: au1_cur_row = vld1_u8(pu1_src_cpy)

    ADD         x10,x6,x1                   //row 3
    LD1         {v15.8b},[x5]               //row 1

    LD1         {v17.8b},[x6]               //row 2

    LD1         {v19.8b},[x10]              //row 3
    SUB         v14.8b,  v13.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)

    // TBX leaves a destination lane unchanged when its index is outside the
    // 32-entry table, so pixels outside the four bands keep their value.
    TBX         v13.8b, {v1.16b- v2.16b},v14.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    SUB         v16.8b,  v15.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)

    TBX         v15.8b, {v1.16b- v2.16b},v16.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    SUB         v18.8b,  v17.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)

    TBX         v17.8b, {v1.16b- v2.16b},v18.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    SUB         v20.8b,  v19.8b ,  v31.8b   //vsub_u8(au1_cur_row, band_pos)

    TBX         v19.8b, {v1.16b- v2.16b},v20.8b //vtbx4_u8(au1_cur_row, band_table, vsub_u8(au1_cur_row, band_pos))
    ST1         {v13.8b},[x4],x1            //row 0: vst1_u8(pu1_src_cpy, au1_cur_row)

    ST1         {v15.8b},[x5]               //row 1
    SUBS        x11,x11,#4                  //Decrement the ht loop count by 4

    ST1         {v17.8b},[x6],x1            //row 2; x6 now points at row 3

    ADD         x4,x6,x1                    //x4 = row 4, i.e. row 0 of the next pass
    ST1         {v19.8b},[x10]              //row 3
    ADD         x5,x4,x1                    //x5 = row 1 of the next pass

    BNE         HEIGHT_LOOP

    SUBS        x7,x7,#8                    //Decrement the width loop by 8
    ADD         x0,x0,#8                    //move to the next 8-pixel column strip
    BNE         SWITCH_BREAK_1

    ldp         d8,d15,[sp],#16             //restore in reverse order of the prologue; d8 is the padding register
    ldp         d13,d14,[sp],#16
    ret
248
249
250
251