1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19//*******************************************************************************
20//* @file
21//*  ihevc_intra_pred_filters_planar.s
22//*
23//* @brief
//*  contains function definitions for planar luma intra prediction.
//* functions are coded using neon assembly and can be compiled using
26
27//* rvct
28//*
29//* @author
30//*  akshaya mukund
31//*
32//* @par list of functions:
33//*
34//*
35//* @remarks
36//*  none
37//*
38//*******************************************************************************
39//*/
40///**
41//*******************************************************************************
42//*
43//* @brief
44//*    luma intraprediction filter for planar input
45//*
46//* @par description:
47//*
48//* @param[in] pu1_ref
49//*  uword8 pointer to the source
50//*
51//* @param[out] pu1_dst
52//*  uword8 pointer to the destination
53//*
54//* @param[in] src_strd
55//*  integer source stride
56//*
57//* @param[in] dst_strd
58//*  integer destination stride
59//*
60//* @param[in] pi1_coeff
61//*  word8 pointer to the planar coefficients
62//*
63//* @param[in] nt
//*  size of transform block
65//*
66//* @param[in] mode
67//*  type of filtering
68//*
69//* @returns
70//*
71//* @remarks
72//*  none
73//*
74//*******************************************************************************
75//*/
76
77//void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
78//                                  word32 src_strd,
79//                                  uword8* pu1_dst,
80//                                  word32 dst_strd,
81//                                  word32 nt,
82//                                  word32 mode,
83//                   word32 pi1_coeff)
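//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt

//no arguments are read from the stack in this version:
//    nt is taken from x4
//    mode and pi1_coeff (the next two arguments in the prototype above)
//    are never read by the code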
94
95.text
96.align 4
97.include "ihevc_neon_macros.s"
98
99
100
101.globl ihevc_intra_pred_luma_planar_av8
102.extern gau1_ihevc_planar_factor
103.extern gau1_ihevc_planar_factor_1
104
105.type ihevc_intra_pred_luma_planar_av8, %function
106
ihevc_intra_pred_luma_planar_av8:

    // Planar luma intra prediction for an nt x nt block (AArch64 / NEON).
    //
    // Register arguments as used by this routine:
    //   x0 = pu1_ref   reference sample array
    //   x2 = pu1_dst   destination block
    //   w3 = dst_strd  destination stride in bytes
    //   w4 = nt        block size
    //   x1 (src_strd) and x5 (mode) are never read; both are reused as scratch.
    //
    // Every output pixel is
    //   ((nt-1-col) * src[2nt-1-row] + (col+1) * src[3nt+1] +
    //    (nt-1-row) * src[2nt+1+col] + (row+1) * src[nt-1] + nt) >> (log2(nt)+1)
    //
    // nt == 4 takes the row-at-a-time path at tf_sz_4; every other size falls
    // through to the 8x8-tile pipeline at tf_sz_8_16_32.
    //
    // Values left for the code that follows:
    //   v0 = src[nt-1] (all lanes)     v1 = src[3nt+1] (all lanes)
    //   v2 = nt (bytes)                v29 = -(log2(nt)+1) (halfwords)
    //   v5 = row+1, v6 = nt-1-row, v7 = 1
    //   x6 = &src[2nt-1]   x14 = x0 = &src[2nt+1]
    //   x11 = gau1_ihevc_planar_factor, x12 = x11 + 1
    //   x1 = nt (loop counter), x10 = 8

    stp         x19, x20,[sp,#-16]!         //x20 is scratch throughout; x19 is not otherwise touched here

    sxtw        x3, w3                      //dst_strd and nt are 32-bit arguments: the upper halves of
    sxtw        x4, w4                      //x3/x4 are unspecified on entry, so extend before 64-bit use

    adrp        x11, :got:gau1_ihevc_planar_factor //loads table of coeffs
    ldr         x11, [x11, #:got_lo12:gau1_ihevc_planar_factor]

    clz         w5,w4
    sub         x20, x5, #32
    neg         x5, x20                     //32 - clz(nt) = log2(nt) + 1
    dup         v29.8h,w5
    neg         v29.8h, v29.8h              //negated count: sshl by it performs the right shift
    dup         v2.8b,w4                    //nt

    sub         x6, x4, #1                  //nt-1
    add         x6, x6, x0
    ld1r        {v0.8b},[x6]                //src[nt-1] replicated to every lane

    add         x6, x4, x4,lsl #1           //3nt
    add         x6, x6, #1                  //3nt + 1
    add         x6, x6, x0
    ld1r        {v1.8b},[x6]                //src[3nt+1] replicated to every lane

    add         x6, x4, x4                  //2nt
    add         x14, x6, #1                 //2nt+1
    sub         x6, x6, #1                  //2nt-1
    add         x6, x6, x0                  //&src[2nt-1]
    add         x14, x14, x0                //&src[2nt+1]

    mov         x8, #1                      //row+1 (row is first 0)
    sub         x9, x4, x8                  //nt-1-row (row is first 0)

    dup         v5.8b,w8                    //row + 1
    dup         v6.8b,w9                    //nt - 1 - row
    mov         v7.8b, v5.8b                //constant 1: step for (row+1)++ and (nt-1-row)--

    add         x12, x11, #1                //coeffs, skipping the first table entry so lanes hold col+1
    mov         x1, x4                      //nt (loop counter)
    mov         x5, x2                      //dst
    mov         x10, #8                     //increment for the coeffs
    mov         x0, x14                     //&src[2nt+1] (kept so x14 can be rewound later)

    cmp         x4, #4
    beq         tf_sz_4
158
159//@ ========== ***************** =====================
prolog:
tf_sz_8_16_32:

    // Setup for the 8x8-tile pipeline (reached for every nt other than 4).
    //   x7  = nt * nt / 8 : number of 8-byte row stores still to be produced
    //   x5  = gau1_ihevc_planar_factor_1 (source of the per-band row+1 vector)
    //   x6  = &src[2nt-8] : an 8-byte load puts src[2nt-1] in lane 7
    //   x9  = 8 - 8*dst_strd : dst step to the next tile in the same row band
    //   x10 = nt - 8 : dst step back to column 0 for the next row band

    lsr         x9, x4, #3                  //nt / 8
    mul         x7, x4, x9                  //nt * (nt / 8)
    adrp        x5, :got:gau1_ihevc_planar_factor_1 //loads table of coeffs
    ldr         x5, [x5, #:got_lo12:gau1_ihevc_planar_factor_1]
    sub         x6, x6, #7                  //&src[2nt-1] - 7
    mov         x8, x2                      //copy of dst (not read again in this routine)
    lsl         x9, x3, #3                  //8*dst_strd
    sub         x20, x9, #8                 //8*dst_strd - 8
    neg         x9, x20                     //8 - 8*dst_strd
    sub         x10, x4, #8                 //nt - 8
175
col_loop_8_16_32:

    // First 8x8 tile (rows 0-7, columns 0-7), software pipelined.
    // The (n) tags name the row of the tile an instruction works on.
    // For each row the accumulator is seeded with nt (rounding term), the
    // four products are added with umlal, then the sum is shifted right by
    // log2(nt)+1 (sshl with the negative count in v29), narrowed and stored.
    // Rows (7) and (8) are left unfinished on exit: v26 is shifted but not
    // stored, v24 is accumulated but not shifted. kernel_plnr or epilog
    // completes them.

    ld1         {v17.8b},[x12]              //(1-8)load 8 coeffs [col+1]
    dup         v27.8h,w4                   //(1)
    ld1         {v4.8b},[x6]                //(1-8)src[2nt-1-row]
    sub         v19.8b,  v2.8b ,  v17.8b    //(1-8)[nt-1-col]


    umlal       v27.8h, v5.8b, v0.8b        //(1)(row+1)    *    src[nt-1]

    ld1         {v3.8b},[x14]               //(1-8)load 8 src[2nt+1+col]
    umlal       v27.8h, v17.8b, v1.8b       //(1)(col+1)    *    src[3nt+1]

    dup         v20.8b, v4.b[7]             //(1)
    umlal       v27.8h, v6.8b, v3.8b        //(1)(nt-1-row)    *    src[2nt+1+col]

    dup         v21.8b, v4.b[6]             //(2)
    umlal       v27.8h, v19.8b, v20.8b      //(1)(nt-1-col)    *    src[2nt-1-row]

    dup         v30.8h,w4                   //(2)
    add         v5.8b,  v5.8b ,  v7.8b      //(1)

    sub         v6.8b,  v6.8b ,  v7.8b      //(1)

    dup         v22.8b, v4.b[5]             //(3)
    umlal       v30.8h, v5.8b, v0.8b        //(2)

    dup         v28.8h,w4                   //(3)
    umlal       v30.8h, v17.8b, v1.8b       //(2)

    umlal       v30.8h, v6.8b, v3.8b        //(2)
    umlal       v30.8h, v19.8b, v21.8b      //(2)

    sshl        v27.8h, v27.8h, v29.8h      //(1)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(2)
    sub         v6.8b,  v6.8b ,  v7.8b      //(2)

    xtn         v27.8b,  v27.8h             //(1)
    umlal       v28.8h, v5.8b, v0.8b        //(3)

    dup         v23.8b, v4.b[4]             //(4)
    umlal       v28.8h, v17.8b, v1.8b       //(3)

    dup         v25.8h,w4                   //(4)
    umlal       v28.8h, v6.8b, v3.8b        //(3)

    st1         {v27.8b},[x2], x3           //(1)str 8 values
    umlal       v28.8h, v19.8b, v22.8b      //(3)

    sshl        v30.8h, v30.8h, v29.8h      //(2)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(3)
    sub         v6.8b,  v6.8b ,  v7.8b      //(3)

    xtn         v30.8b,  v30.8h             //(2)
    umlal       v25.8h, v5.8b, v0.8b        //(4)

    dup         v20.8b, v4.b[3]             //(5)
    umlal       v25.8h, v17.8b, v1.8b       //(4)

    dup         v16.8h,w4                   //(5)
    umlal       v25.8h, v6.8b, v3.8b        //(4)

    st1         {v30.8b},[x2], x3           //(2)str 8 values
    umlal       v25.8h, v19.8b, v23.8b      //(4)

    sshl        v28.8h, v28.8h, v29.8h      //(3)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(4)
    sub         v6.8b,  v6.8b ,  v7.8b      //(4)

    xtn         v28.8b,  v28.8h             //(3)
    umlal       v16.8h, v5.8b, v0.8b        //(5)

    dup         v21.8b, v4.b[2]             //(6)
    umlal       v16.8h, v17.8b, v1.8b       //(5)

    dup         v18.8h,w4                   //(6)
    umlal       v16.8h, v6.8b, v3.8b        //(5)

    st1         {v28.8b},[x2], x3           //(3)str 8 values
    umlal       v16.8h, v19.8b, v20.8b      //(5)

    sshl        v25.8h, v25.8h, v29.8h      //(4)shr
    add         v5.8b,  v5.8b ,  v7.8b      //(5)
    sub         v6.8b,  v6.8b ,  v7.8b      //(5)

    xtn         v25.8b,  v25.8h             //(4)
    umlal       v18.8h, v5.8b, v0.8b        //(6)

    dup         v22.8b, v4.b[1]             //(7)
    umlal       v18.8h, v17.8b, v1.8b       //(6)

    dup         v26.8h,w4                   //(7)
    umlal       v18.8h, v6.8b, v3.8b        //(6)

    st1         {v25.8b},[x2], x3           //(4)str 8 values
    umlal       v18.8h, v19.8b, v21.8b      //(6)

    sshl        v16.8h, v16.8h, v29.8h      //(5)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(6)
    sub         v6.8b,  v6.8b ,  v7.8b      //(6)

    xtn         v16.8b,  v16.8h             //(5)
    umlal       v26.8h, v5.8b, v0.8b        //(7)

    dup         v23.8b, v4.b[0]             //(8)
    umlal       v26.8h, v17.8b, v1.8b       //(7)

    dup         v24.8h,w4                   //(8)
    umlal       v26.8h, v6.8b, v3.8b        //(7)

    st1         {v16.8b},[x2], x3           //(5)str 8 values
    umlal       v26.8h, v19.8b, v22.8b      //(7)

    sshl        v18.8h, v18.8h, v29.8h      //(6)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(7)
    sub         v6.8b,  v6.8b ,  v7.8b      //(7)

    xtn         v18.8b,  v18.8h             //(6)
    umlal       v24.8h, v5.8b, v0.8b        //(8)


    umlal       v24.8h, v17.8b, v1.8b       //(8)

    umlal       v24.8h, v6.8b, v3.8b        //(8)

    st1         {v18.8b},[x2], x3           //(6)str 8 values
    umlal       v24.8h, v19.8b, v23.8b      //(8)

    sshl        v26.8h, v26.8h, v29.8h      //(7)shr

    subs        x7, x7, #8                  //8 row stores of this tile accounted for

    beq         epilog                      //nothing left (nt == 8): just finish rows (7) and (8)

    // Pick the inputs of the next tile. x1 counts the columns still to do in
    // the current band of 8 rows:
    //   gt -> same band, next 8 columns (advance x12 and x14)
    //   le -> band finished: rewind x12/x14, move x6 back 8 bytes for the next
    //         8 rows and step x5 to the next row+1 vector
    subs        x1, x1, #8                  //columns left in this row band
    add         x20, x12, #8                //col inc
    csel        x12, x20, x12,gt
    add         x20, x14, #8                //also for col inc
    csel        x14, x20, x14,gt
    csel        x1, x4, x1,le               //nt reloaded (refresh the value)
    add         x20, x11, #1                //x12 reset
    csel        x12, x20, x12,le

    csel        x14, x0, x14,le             //x14 reset
    ld1         {v17.8b},[x12]              //(1n)(1-8)load 8 coeffs [col+1]

    sub         x20, x6, #8                 //for next set of rows
    csel        x6, x20, x6,le
    ld1         {v3.8b},[x14]               //(1n)(1-8)load 8 src[2nt+1+col]

    add         x20, x5, #8
    csel        x5, x20, x5,le
    dup         v27.8h,w4                   //(1n)(1)

    ld1         {v5.8b},[x5]                //(row+1) vector for the next tile

    ld1         {v4.8b},[x6]                //(1n)(1-8)src[2nt-1-row]
    sub         v19.8b,  v2.8b ,  v17.8b    //(1n)(1-8)[nt-1-col]

    dup         v20.8b, v4.b[7]             //(1n)(1)
    sub         v6.8b,  v2.8b ,  v5.8b      //nt - (row+1) = nt-1-row

    // NOTE(review): the flags tested here are still those of "subs x1" above.
    // This point is only reached when more than one tile exists, in which case
    // x1 - 8 is nonzero, so this branch looks never taken - confirm before removing.
    beq         epilog
344
kernel_plnr:

    // Steady-state loop: one 8x8 tile per iteration.
    // The head of each iteration finishes rows (7) and (8) of the previous
    // tile (narrow + store), then moves dst to the current tile. The tail
    // loads the inputs of the next tile, selected by the flags of "subs x1":
    //   gt -> next 8 columns of the same row band
    //   le -> first 8 columns of the next band of 8 rows
    // Only "cmp x1", "subs x1" and "subs x7" write the flags in this loop, so
    // every csel between two of them sees the flags of the earlier one.

    cmp         x1, #0                      // (cond loop) gt: previous tile was not the last of its row band
    sshl        v24.8h, v24.8h, v29.8h      //(8)shr

    xtn         v26.8b,  v26.8h             //(7)
    umlal       v27.8h, v5.8b, v0.8b        //(1)(row+1)    *    src[nt-1]

    xtn         v24.8b,  v24.8h             //(8)
    umlal       v27.8h, v17.8b, v1.8b       //(1)(col+1)    *    src[3nt+1]

    dup         v21.8b, v4.b[6]             //(2)
    umlal       v27.8h, v6.8b, v3.8b        //(1)(nt-1-row)    *    src[2nt+1+col]

    dup         v30.8h,w4                   //(2)
    umlal       v27.8h, v19.8b, v20.8b      //(1)(nt-1-col)    *    src[2nt-1-row]

    st1         {v26.8b},[x2], x3           //(7)str 8 values
    add         v5.8b,  v5.8b ,  v7.8b      //(1)

    st1         {v24.8b},[x2], x3           //(8)str 8 values
    sub         v6.8b,  v6.8b ,  v7.8b      //(1)

    add         x20, x2, x9                 //more cols to fill: dst + 8 - 8*dst_strd (up 8 rows, right 8 cols) (cond loop)
    csel        x2, x20, x2,gt
    umlal       v30.8h, v5.8b, v0.8b        //(2)

    sub         x20, x2, x10                //else go to next set of rows, dst - (nt-8) (cond loop)
    csel        x2, x20, x2,le
    umlal       v30.8h, v17.8b, v1.8b       //(2)

    dup         v22.8b, v4.b[5]             //(3)
    umlal       v30.8h, v6.8b, v3.8b        //(2)

    dup         v28.8h,w4                   //(3)
    umlal       v30.8h, v19.8b, v21.8b      //(2)

    sshl        v27.8h, v27.8h, v29.8h      //(1)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(2)
    csel        x1, x4, x1,le               //nt reloaded (refresh the value)    (cond loop)

    sub         v6.8b,  v6.8b ,  v7.8b      //(2)
    subs        x1, x1, #8                  //columns left in this row band; these flags drive the csel's below (loop)

    xtn         v27.8b,  v27.8h             //(1)
    umlal       v28.8h, v5.8b, v0.8b        //(3)

    dup         v23.8b, v4.b[4]             //(4)
    umlal       v28.8h, v17.8b, v1.8b       //(3)

    dup         v25.8h,w4                   //(4)
    umlal       v28.8h, v6.8b, v3.8b        //(3)

    st1         {v27.8b},[x2], x3           //(1)str 8 values
    umlal       v28.8h, v19.8b, v22.8b      //(3)

    sshl        v30.8h, v30.8h, v29.8h      //(2)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(3)

    sub         v6.8b,  v6.8b ,  v7.8b      //(3)

    xtn         v30.8b,  v30.8h             //(2)
    umlal       v25.8h, v5.8b, v0.8b        //(4)

    dup         v20.8b, v4.b[3]             //(5)
    umlal       v25.8h, v17.8b, v1.8b       //(4)

    dup         v16.8h,w4                   //(5)
    umlal       v25.8h, v6.8b, v3.8b        //(4)

    st1         {v30.8b},[x2], x3           //(2)str 8 values
    umlal       v25.8h, v19.8b, v23.8b      //(4)

    sshl        v28.8h, v28.8h, v29.8h      //(3)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(4)

    sub         v6.8b,  v6.8b ,  v7.8b      //(4)

    xtn         v28.8b,  v28.8h             //(3)
    umlal       v16.8h, v5.8b, v0.8b        //(5)

    dup         v21.8b, v4.b[2]             //(6)
    umlal       v16.8h, v17.8b, v1.8b       //(5)

    dup         v18.8h,w4                   //(6)
    umlal       v16.8h, v6.8b, v3.8b        //(5)

    st1         {v28.8b},[x2], x3           //(3)str 8 values
    umlal       v16.8h, v19.8b, v20.8b      //(5)

    add         x20, x11, #1                //x12 reset (cond loop)
    csel        x12, x20, x12,le
    sshl        v25.8h, v25.8h, v29.8h      //(4)shr

    add         x20, x12, #8                //col inc (cond loop)
    csel        x12, x20, x12,gt
    add         v5.8b,  v5.8b ,  v7.8b      //(5)

    add         x20, x14, #8                //also for col inc (cond loop)
    csel        x14, x20, x14,gt
    sub         v6.8b,  v6.8b ,  v7.8b      //(5)

    xtn         v25.8b,  v25.8h             //(4)
    umlal       v18.8h, v5.8b, v0.8b        //(6)

    dup         v22.8b, v4.b[1]             //(7)
    umlal       v18.8h, v17.8b, v1.8b       //(6)

    dup         v26.8h,w4                   //(7)
    umlal       v18.8h, v6.8b, v3.8b        //(6)

    st1         {v25.8b},[x2], x3           //(4)str 8 values
    umlal       v18.8h, v19.8b, v21.8b      //(6)

    csel        x14, x0, x14,le             //x14 reset (cond loop)
    sshl        v16.8h, v16.8h, v29.8h      //(5)shr

    sub         x20, x6, #8                 //for next set of rows (cond loop)
    csel        x6, x20, x6,le
    add         v5.8b,  v5.8b ,  v7.8b      //(6)

    add         x20, x5, #8                 //next (row+1) vector for next set of rows (cond loop)
    csel        x5, x20, x5,le
    sub         v6.8b,  v6.8b ,  v7.8b      //(6)

    xtn         v16.8b,  v16.8h             //(5)
    umlal       v26.8h, v5.8b, v0.8b        //(7)

    dup         v23.8b, v4.b[0]             //(8)
    umlal       v26.8h, v17.8b, v1.8b       //(7)

    dup         v24.8h,w4                   //(8)
    umlal       v26.8h, v6.8b, v3.8b        //(7)

    st1         {v16.8b},[x2], x3           //(5)str 8 values
    umlal       v26.8h, v19.8b, v22.8b      //(7)

    ld1         {v4.8b},[x6]                //(1n)(1-8)src[2nt-1-row] (old v4 is fully consumed: v23 was its last use)
    sshl        v18.8h, v18.8h, v29.8h      //(6)shr

    add         v5.8b,  v5.8b ,  v7.8b      //(7)

    sub         v6.8b,  v6.8b ,  v7.8b      //(7)

    xtn         v18.8b,  v18.8h             //(6)
    umlal       v24.8h, v5.8b, v0.8b        //(8)

    ld1         {v5.8b},[x5]                //(row+1 value)
    umlal       v24.8h, v17.8b, v1.8b       //(8)

    dup         v20.8b, v4.b[7]             //(1n)(1)
    umlal       v24.8h, v6.8b, v3.8b        //(8)

    st1         {v18.8b},[x2], x3           //(6)str 8 values
    umlal       v24.8h, v19.8b, v23.8b      //(8)

    ld1         {v17.8b},[x12]              //(1n)(1-8)load 8 coeffs [col+1]
    sub         v6.8b,  v2.8b ,  v5.8b      //(nt-1-row) value

    subs        x7, x7, #8                  //row stores remaining; zero ends the loop

    ld1         {v3.8b},[x14]               //(1n)(1-8)load 8 src[2nt+1+col]
    sshl        v26.8h, v26.8h, v29.8h      //(7)shr

    dup         v27.8h,w4                   //(1n)(1)
    sub         v19.8b,  v2.8b ,  v17.8b    //(1n)(1-8)[nt-1-col]

    bne         kernel_plnr
516
epilog:

    // Drain the pipeline: rows (7) and (8) of the last 8x8 tile are still in
    // registers. v26 has already been shifted; v24 still needs its shift.

    xtn         v26.8b,  v26.8h             //(7)
    st1         {v26.8b},[x2], x3           //(7)str 8 values

    sshl        v24.8h, v24.8h, v29.8h      //(8)shr
    xtn         v24.8b,  v24.8h             //(8)
    st1         {v24.8b},[x2], x3           //(8)str 8 values

//@ ========== ***************** =====================

    // Unconditional on purpose: the 4x4 code that follows must never be
    // entered from here, whatever the state of the flags.
    b           end_loop
529
tf_sz_4:
    // 4x4 path: one row per iteration, x1 (= nt on entry) counts the rows.
    // All vectors are 8 lanes wide but only the low 4 bytes of each result
    // are stored.
    ld1         {v25.8b},[x14]              //load src[2nt+1+col]
    ld1         {v17.8b},[x12]              //load 8 coeffs [col+1]
    sub         v19.8b,  v2.8b ,  v17.8b    //[nt-1-col] (same for every row, so computed once)
loop_sz_4:
    ldrb        w7,  [x6], #-1              //src[2nt-1-row] (dec to take into account row)
    dup         v4.8b,w7                    //src[2nt-1-row]

    umull       v27.8h, v5.8b, v0.8b        //(row+1)    *    src[nt-1]
    umlal       v27.8h, v6.8b, v25.8b       //(nt-1-row)    *    src[2nt+1+col]
    umlal       v27.8h, v17.8b, v1.8b       //(col+1)    *    src[3nt+1]
    umlal       v27.8h, v19.8b, v4.8b       //(nt-1-col)    *    src[2nt-1-row]
    rshrn       v27.8b, v27.8h,#3           //rounding narrow: (sum + 4) >> 3, i.e. (sum + nt) >> (log2(nt)+1) for nt = 4
    st1         {v27.s}[0],[x2], x3         //store the 4 pixels of this row, dst += dst_strd

    add         v5.8b,  v5.8b ,  v7.8b      //row++ [(row+1)++]
    sub         v6.8b,  v6.8b ,  v7.8b      //[nt-1-row]--
    subs        x1, x1, #1

    bne         loop_sz_4

end_loop:
    ldp         x19, x20,[sp],#16           //restore the pair saved on entry

    ret
562
563
564
565
566
567
568
569
570