1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_intra_pred_filters_planar.s
22//*
23//* @brief
24//*  contains function definitions for planar intra prediction.
25//* functions are coded in neon assembly and can be compiled using
26
27//* rvct
28//*
29//* @author
30//*  akshaya mukund
31//*
32//* @par list of functions:
33//*
34//*
35//* @remarks
36//*  none
37//*
38//*******************************************************************************
39//*/
40///**
41//*******************************************************************************
42//*
43//* @brief
44//*    chroma intra prediction filter for planar mode
45//*
46//* @par description:
47//*
48//* @param[in] pu1_ref
49//*  uword8 pointer to the source
50//*
51//* @param[out] pu1_dst
52//*  uword8 pointer to the destination
53//*
54//* @param[in] src_strd
55//*  integer source stride
56//*
57//* @param[in] dst_strd
58//*  integer destination stride
59//*
60//* @param[in] pi1_coeff
61//*  word8 pointer to the planar coefficients
62//*
63//* @param[in] nt
64//*  size of transform block
65//*
66//* @param[in] mode
67//*  type of filtering
68//*
69//* @returns
70//*
71//* @remarks
72//*  none
73//*
74//*******************************************************************************
75//*/
76
77//void ihevc_intra_pred_chroma_planar_av8(uword8* pu1_ref,
78//                                  word32 src_strd,
79//                                  uword8* pu1_dst,
80//                                  word32 dst_strd,
81//                                  word32 nt,
82//                                  word32 mode,
83//                   word32 pi1_coeff)
84//**************variables vs registers*****************************************
85//x0 => *pu1_ref
86//x1 => src_strd
87//x2 => *pu1_dst
88//x3 => dst_strd
89
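90//the remaining arguments of the prototype also arrive in registers; nothing is read from the stack
91//x4 => nt
92//x5 => mode       (overwritten without being read)
93//x6 => pi1_coeff  (overwritten without being read)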
94
95.text
96.align 4
97.include "ihevc_neon_macros.s"
98
99
100.globl ihevc_intra_pred_chroma_planar_av8
101.extern gau1_ihevc_planar_factor
102
103
104.type ihevc_intra_pred_chroma_planar_av8, %function
105
//-----------------------------------------------------------------------------
// ihevc_intra_pred_chroma_planar_av8
//
// Purpose:
//   Writes an nt-row planar intra prediction block to pu1_dst. Each output
//   byte is computed as
//
//     ( (row+1)    * src[nt-1]
//     + (nt-1-row) * src[2nt+1+col]
//     + (col+1)    * src[3nt+1]
//     + (nt-1-col) * src[2nt-1-row]
//     + nt ) >> (32 - clz(nt))
//
//   All src[] indices are in units of 2 bytes (every index is scaled with
//   lsl #1) and each column weight is applied to both bytes of an element.
//   NOTE(review): presumably the elements are interleaved U/V chroma samples,
//   as the function name suggests - confirm against the caller.
//
// Inputs (as used by the code below):
//   x0 = pu1_ref   reference sample array
//   x2 = pu1_dst   destination, advanced by x3 bytes after every stored row
//   x3 = dst_strd  destination stride in bytes
//   w4 = nt        block size
//   x1, x5 and x6 are overwritten without their incoming values being read.
//
// Size handling:
//   nt == 4          -> tf_sz_4 / loop_sz_4: one 8-byte row per iteration.
//   any other nt     -> loop_sz_8_16: a 16-byte-wide strip, 4 rows per
//                       iteration, nt rows in total.
//   nt == 16 only    -> the strip loop is run a second time for the next
//                       16 bytes of every row.
//
// Registers written: x0, x1, x2, x4-x12, x14, x20, v0-v7, v10-v14, v16-v31.
//   d10-d14 and x19/x20 are saved on entry and restored on exit.
// Stack: 64 bytes, sp kept 16-byte aligned.
// External data: gau1_ihevc_planar_factor (byte table, read from offset 1;
//   the existing comments treat entry [1+col] as the weight col+1).
//-----------------------------------------------------------------------------
106ihevc_intra_pred_chroma_planar_av8:
107
108    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
109
    // Prologue: push the callee-saved registers this function writes.
    // Every push is a 16-byte stp so that sp stays 16-byte aligned.
110    stp         d10,d11,[sp,#-16]!
111    stp         d12,d13,[sp,#-16]!
112    stp         d8,d14,[sp,#-16]!           // Storing d14 using { sub sp,sp,#8; str d14,[sp] } is giving bus error.
113                                            // d8 is used as dummy register and stored along with d14 using stp. d8 is not used in the function.
114    stp         x19, x20,[sp,#-16]!         // x20 is used as scratch below; x19 is not written in this function
115
    // x11 = address of gau1_ihevc_planar_factor, fetched through the GOT
116    adrp        x11, :got:gau1_ihevc_planar_factor //loads table of coeffs
117    ldr         x11, [x11, #:got_lo12:gau1_ihevc_planar_factor]
118
    // v14 = -(32 - clz(nt)) in every halfword lane. sshl with a negative
    // count shifts right, so v14 is the final normalising right shift.
119    clz         w5,w4                       //w5 = number of leading zero bits in nt
120    sub         x20, x5, #32                //x20 = clz(nt) - 32
121    neg         x5, x20                     //x5 = 32 - clz(nt)
122    dup         v14.8h,w5
123    neg         v14.8h, v14.8h              //shr value (so vneg)
124    dup         v2.8b,w4                    //nt
125    dup         v16.8h,w4                   //nt
126
    // v0 = the 2-byte element src[nt-1] repeated in all four halfword lanes
127    sub         x6, x4, #1                  //nt-1
128    add         x6, x0,x6,lsl #1            //&src[nt-1] (byte offset 2*(nt-1))
129    ldr         w7,  [x6]                   //32-bit load; only the low 16 bits are used by the dup
130    sxtw        x7,w7                       //the dup below reads w7 only
131    dup         v0.4h,w7                    //src[nt-1]
132
    // v1 = the 2-byte element src[3nt+1] repeated in all four halfword lanes
133    add         x6, x4, x4,lsl #1           //3nt
134    add         x6, x6, #1                  //3nt + 1
135    lsl         x6,x6,#1                    //2*(3nt + 1)
136
137    add         x6, x6, x0                  //&src[3nt+1]
138    ldr         w7,  [x6]
139    sxtw        x7,w7
140    dup         v1.4h,w7                    //src[3nt+1]
141
142
    // x6  = &src[2nt-1]: walked downwards, one element per row
    // x14 = &src[2nt+1]: start of the elements indexed by col
143    add         x6, x4, x4                  //2nt
144    add         x14, x6, #1                 //2nt+1
145    lsl         x14,x14,#1                  //2*(2nt+1)
146    sub         x6, x6, #1                  //2nt-1
147    lsl         x6,x6,#1                    //2*(2nt-1)
148    add         x6, x6, x0                  //&src[2nt-1]
149    add         x14, x14, x0                //&src[2nt+1]
150
    // Row weights for row 0. w8 and w9 are kept so that the second strip of
    // the nt == 16 case can re-create them.
151    mov         x8, #1                      //row+1 (row is first 0)
152    sub         x9, x4, x8                  //nt-1-row (row is first 0)
153
154    dup         v5.8b,w8                    //row + 1
155    dup         v6.8b,w9                    //nt - 1 - row
156    mov         v7.8b, v5.8b                //mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
157
158    add         x12, x11, #1                //&gau1_ihevc_planar_factor[1] (column weights, post-incremented by the loads)
159    mov         x1, x4                      //nt (row counter; the incoming x1 value is discarded)
160    mov         x5, x2                      //copy of dst (x5 is not read again in this function)
161    mov         x10, #8                     //post-increment for the coeff load in tf_sz_4
162    mov         x0, x14                     //copy of &src[2nt+1] (x0 is not read again in this function)
163
164    cmp         x4, #4
165    beq         tf_sz_4                     //nt == 4 has its own one-row-per-iteration loop
166
167
168
169    mov         x10,x6                      //keep &src[2nt-1] so the second strip (nt == 16) can restart from it
    // Per-strip setup: 16 bytes of src[2nt+1+col] and 8 column weights.
    //   v10 / v11 = first / second 8 bytes of the src[2nt+1+col] run
    //   v17 / v25 = weights 0-3 / 4-7, each weight duplicated into two
    //               adjacent bytes (zip of the vector with a copy of itself)
    //   v30 / v31 = nt - v17 / nt - v25
170tf_sz_8_16:
171    ld1         {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col]
172    ld1         {v17.8b},[x12],#8           //load 8 coeffs [col+1]
173    mov         v25.8b, v17.8b
174    zip1        v29.8b, v17.8b, v25.8b      //weights 0-3, each one twice
175    zip2        v25.8b, v17.8b, v25.8b      //weights 4-7, each one twice
176    mov         v17.d[0], v29.d[0]
177    sub         v30.8b,  v2.8b ,  v17.8b    //[nt-1-col]
178    sub         v31.8b,  v2.8b ,  v25.8b
179
180
181
182
    // Main loop: four rows of 16 bytes per iteration, with the instructions
    // of neighbouring rows interleaved. Register roles inside one iteration:
    //   row 0: sums v12 (bytes 0-7) / v28 (bytes 8-15), weights v5 / v6,   left element v4
    //   row 1: sums v26 / v24,                          weights v18 / v19, left element v3
    //   row 2: sums v22 / v20,                          weights v5 / v6,   left element v4 (reloaded)
    //   row 3: sums v12 / v28 (reused),                 weights v18 / v19, left element v3 (reloaded)
    // "left element" is src[2nt-1-row]; x6 steps down by 2 bytes per load.
    // Each sum then gets + nt (v16), a right shift (sshl by v14) and xtn
    // back to bytes before being stored.
183loop_sz_8_16:
184
185    ldr         w7,  [x6], #-2              //src[2nt-1-row] (dec to take into account row)
186    sxtw        x7,w7
187    umull       v12.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]
188    ldr         w11,  [x6], #-2             //src[2nt-1-row] (dec to take into account row)
189    sxtw        x11,w11
190    umlal       v12.8h, v6.8b, v10.8b       //(nt-1-row)    *    src[2nt+1+col]
191    dup         v4.4h,w7                    //src[2nt-1-row]
192    umlal       v12.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
193    dup         v3.4h,w11                   //src[2nt-1-row]
194    umlal       v12.8h, v30.8b, v4.8b       //(nt-1-col)    *    src[2nt-1-row]
195
196
197
198    umull       v28.8h, v5.8b, v0.8b        //row 0, bytes 8-15
199    ldr         w7,  [x6], #-2              //src[2nt-1-row] (dec to take into account row)
200    sxtw        x7,w7
201    umlal       v28.8h, v6.8b, v11.8b
202    add         v18.8b,  v5.8b ,  v7.8b     //row++ [(row+1)++]c
203
204
205    umlal       v28.8h, v31.8b, v4.8b
206    sub         v19.8b,  v6.8b ,  v7.8b     //[nt-1-row]--
207    umlal       v28.8h, v25.8b, v1.8b
208    dup         v4.4h,w7                    //src[2nt-1-row]
209
210    umull       v26.8h, v18.8b, v0.8b       //(row+1)    *    src[nt-1]
211    add         v12.8h,  v12.8h ,  v16.8h   //add (nt)
212    umlal       v26.8h, v19.8b, v10.8b      //(nt-1-row)    *    src[2nt+1+col]
213    sshl        v12.8h, v12.8h, v14.8h      //shr
214    umlal       v26.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
215    add         v28.8h,  v28.8h ,  v16.8h
216    umlal       v26.8h, v30.8b, v3.8b       //(nt-1-col)    *    src[2nt-1-row]
217    sshl        v28.8h, v28.8h, v14.8h
218
219
220
221
222
223    umull       v24.8h, v18.8b, v0.8b       //row 1, bytes 8-15
224    add         v5.8b,  v18.8b ,  v7.8b     //row++ [(row+1)++]
225    umlal       v24.8h, v19.8b, v11.8b
226    sub         v6.8b,  v19.8b ,  v7.8b     //[nt-1-row]--
227    umlal       v24.8h, v25.8b, v1.8b
228    xtn         v12.8b,  v12.8h
229    umlal       v24.8h, v31.8b, v3.8b
230    xtn         v13.8b,  v28.8h
231
232
233
234
235    add         v26.8h,  v26.8h ,  v16.8h   //add (nt)
236    umull       v22.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]
237    sshl        v26.8h, v26.8h, v14.8h      //shr
238    umlal       v22.8h, v6.8b, v10.8b       //(nt-1-row)    *    src[2nt+1+col]
239    st1         {v12.2s, v13.2s}, [x2], x3  //store row 0 (16 bytes), dst += dst_strd
240    umlal       v22.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
241    add         v24.8h,  v24.8h ,  v16.8h
242    umlal       v22.8h, v30.8b, v4.8b       //(nt-1-col)    *    src[2nt-1-row]
243    sshl        v24.8h, v24.8h, v14.8h
244
245    umull       v20.8h, v5.8b, v0.8b        //row 2, bytes 8-15
246    add         v18.8b,  v5.8b ,  v7.8b     //row++ [(row+1)++]c
247    umlal       v20.8h, v6.8b, v11.8b
248    sub         v19.8b,  v6.8b ,  v7.8b     //[nt-1-row]--
249    umlal       v20.8h, v31.8b, v4.8b
250
251    ldr         w11,  [x6], #-2             //src[2nt-1-row] (dec to take into account row)
252    sxtw        x11,w11
253    umlal       v20.8h, v25.8b, v1.8b
254    dup         v3.4h,w11                   //src[2nt-1-row]
255    add         v22.8h,  v22.8h ,  v16.8h   //add (nt)
256
257    umull       v12.8h, v18.8b, v0.8b       //(row+1)    *    src[nt-1]
258    xtn         v26.8b,  v26.8h
259    umlal       v12.8h, v19.8b, v10.8b      //(nt-1-row)    *    src[2nt+1+col]
260    xtn         v27.8b,  v24.8h
261
262    umlal       v12.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
263    sshl        v22.8h, v22.8h, v14.8h      //shr
264
265    umlal       v12.8h, v30.8b, v3.8b       //(nt-1-col)    *    src[2nt-1-row]
266    add         v20.8h,  v20.8h ,  v16.8h
267
268    umull       v28.8h, v18.8b, v0.8b       //row 3, bytes 8-15
269    st1         {v26.2s, v27.2s}, [x2], x3  //store row 1
270
271    umlal       v28.8h, v19.8b, v11.8b
272    add         v5.8b,  v18.8b ,  v7.8b     //row++ [(row+1)++]
273
274    sub         v6.8b,  v19.8b ,  v7.8b     //[nt-1-row]--
275    umlal       v28.8h, v25.8b, v1.8b
276
277    umlal       v28.8h, v31.8b, v3.8b
278    sshl        v20.8h, v20.8h, v14.8h
279
280
281    add         v12.8h,  v12.8h ,  v16.8h   //add (nt)
282    xtn         v22.8b,  v22.8h
283
284
285    add         v28.8h,  v28.8h ,  v16.8h
286    xtn         v23.8b,  v20.8h
287
288
289    sshl        v12.8h, v12.8h, v14.8h      //shr
290    st1         {v22.2s, v23.2s}, [x2], x3  //store row 2
291    sshl        v28.8h, v28.8h, v14.8h
292
293
294
295
296
297    xtn         v20.8b,  v12.8h
298    xtn         v21.8b,  v28.8h
299
300    st1         {v20.2s, v21.2s}, [x2], x3  //store row 3
301
302
303    subs        x1, x1, #4                  //four rows were written in this iteration
304
305    bne         loop_sz_8_16
306
307
308
309
    // One 16-byte-wide strip is complete. Only nt == 16 has a second strip.
310    cmp         x4,#16
311
312    bne         end_loop
313
314
    // Set up the second strip. None of the instructions from here to the beq
    // at the end of this section modify the condition flags, so the flags
    // still hold the "equal" result of the cmp x4,#16 above.
315    sub         x4, x4,#16                  //x4 = 0, so the cmp above fails after the second strip
316    dup         v5.8b,w8                    //row + 1
317    dup         v6.8b,w9                    //nt - 1 - row
318    mov         v7.8b, v5.8b                //mov #1 to d7 to used for inc for row+1 and dec for nt-1-row
319
320    mov         x6,x10                      //back to &src[2nt-1]
321    mov         x1,#16                      //row counter for the second strip
322    sub         x2,x2,x3,lsl #4             //dst back up by 16 rows
323    add         x2,x2,#16                   //dst forward by the 16 bytes already written per row
324
    // x14 and x12 were post-incremented by the first strip's loads, so these
    // fetch the next 16 source bytes and the next 8 column weights.
325    ld1         {v10.8b, v11.8b}, [x14],#16 //load src[2nt+1+col]
326    ld1         {v17.8b},[x12],#8
327    mov         v25.8b, v17.8b
328    zip1        v29.8b, v17.8b, v25.8b
329    zip2        v25.8b, v17.8b, v25.8b
330    mov         v17.d[0], v29.d[0]
331    sub         v30.8b,  v2.8b ,  v17.8b    //[nt-1-col]
332    sub         v31.8b,  v2.8b ,  v25.8b
333
334    beq         loop_sz_8_16                //taken on the flags of the cmp x4,#16 above (see note)
335
336
337
    // nt == 4: rows are 8 bytes wide. x10 is still #8 on this path because
    // the branch to here skips the later mov x10,x6.
338tf_sz_4:
339    ld1         {v10.8b},[x14]              //load src[2nt+1+col]
340    ld1         {v17.8b},[x12], x10         //load 8 coeffs [col+1]
341    mov         v25.8b, v17.8b
342    zip1        v29.8b, v17.8b, v25.8b      //weights 0-3, each one twice
343    zip2        v25.8b, v17.8b, v25.8b      //v25 is overwritten inside the loop before it is read
344    mov         v17.d[0], v29.d[0]
    // One row per iteration; x1 counts the remaining rows.
345loop_sz_4:
346    //mov        x10, #4                @reduce inc to #4 for 4x4
347    ldr         w7,  [x6], #-2              //src[2nt-1-row] (dec to take into account row)
348    sxtw        x7,w7
349    dup         v4.4h,w7                    //src[2nt-1-row]
350
351    sub         v25.8b,  v2.8b ,  v17.8b    //[nt-1-col] (same value every iteration)
352
353    umull       v12.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]
354    umlal       v12.8h, v6.8b, v10.8b       //(nt-1-row)    *    src[2nt+1+col]
355    umlal       v12.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
356    umlal       v12.8h, v25.8b, v4.8b       //(nt-1-col)    *    src[2nt-1-row]
357//    vadd.i16    q6, q6, q8            @add (nt)
358//    vshl.s16     q6, q6, q7            @shr
359//    vmovn.i16     d12, q6
360    rshrn       v12.8b, v12.8h,#3           //rounding shift right by 3 and narrow: (sum + 4) >> 3, replacing the three commented-out steps above
361
362    st1         {v12.2s},[x2], x3           //store one 8-byte row, dst += dst_strd
363
364    add         v5.8b,  v5.8b ,  v7.8b      //row++ [(row+1)++]
365    sub         v6.8b,  v6.8b ,  v7.8b      //[nt-1-row]--
366    subs        x1, x1, #1
367
368    bne         loop_sz_4
369
    // Epilogue: pop in the reverse order of the pushes at function entry.
370end_loop:
371    // ldmfd sp!,{x4-x12,x15}                   //reload the registers from sp
372    ldp         x19, x20,[sp],#16
373    ldp         d8,d14,[sp],#16             // Loading d14 using { ldr d14,[sp]; add sp,sp,#8 } is giving bus error.
374                                            // d8 is used as dummy register and loaded along with d14 using ldp. d8 is not used in the function.
375    ldp         d12,d13,[sp],#16
376    ldp         d10,d11,[sp],#16
377    ret
378
379
380
381
382
383
384
385