1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21//******************************************************************************
22//* @file
23//*  ih264_intra_pred_luma_16x16_av8.s
24//*
25//* @brief
26//*  Contains function definitions for intra 16x16 Luma prediction .
27//*
28//* @author
29//*  Ittiam
30//*
31//* @par List of Functions:
32//*
33//*  - ih264_intra_pred_luma_16x16_mode_vert_av8()
34//*  - ih264_intra_pred_luma_16x16_mode_horz_av8()
35//*  - ih264_intra_pred_luma_16x16_mode_dc_av8()
36//*  - ih264_intra_pred_luma_16x16_mode_plane_av8()
37//*
38//* @remarks
39//*  None
40//*
41//*******************************************************************************
42//*/
43
44///* All the functions here are replicated from ih264_intra_pred_filters.c
45//
46
47///**
48///**
49///**
50//
51
52
53.text
54.p2align 2
55.include "ih264_neon_macros.s"
56.extern ih264_gai1_intrapred_luma_plane_coeffs
57
58
59
60///**
61//*******************************************************************************
62//*
63//*ih264_intra_pred_luma_16x16_mode_vert
64//*
65//* @brief
66//*   Perform Intra prediction for  luma_16x16 mode:vertical
67//*
68//* @par Description:
69//* Perform Intra prediction for  luma_16x16 mode:Vertical ,described in sec 8.3.3.1
70//*
71//* @param[in] pu1_src
72//*  UWORD8 pointer to the source
73//*
74//* @param[out] pu1_dst
75//*  UWORD8 pointer to the destination
76//*
77//* @param[in] src_strd
78//*  integer source stride
79//*
80//* @param[in] dst_strd
81//*  integer destination stride
82//*
83//* @param[in] ui_neighboravailability
84//* availability of neighbouring pixels(Not used in this function)
85//*
86//* @returns
87//*
88//* @remarks
89//*  None
90//*
91//*******************************************************************************
92//void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src,
93//                                        UWORD8 *pu1_dst,
94//                                        WORD32 src_strd,
95//                                        WORD32 dst_strd,
96//                                        WORD32 ui_neighboravailability)
97
98//**************Variables Vs Registers*****************************************
99//    x0 => *pu1_src
100//    x1 => *pu1_dst
101//    w2 =>  src_strd
102//    w3 =>  dst_strd
103//    w4 =>  ui_neighboravailability
104
105
106    .global ih264_intra_pred_luma_16x16_mode_vert_av8
107
108ih264_intra_pred_luma_16x16_mode_vert_av8:
109
110    push_v_regs
111    sxtw      x3, w3
112
113
114    add       x0, x0, #17
115    ld1       {v0.8b, v1.8b}, [x0]
116
117    st1       {v0.8b, v1.8b}, [x1], x3
118    st1       {v0.8b, v1.8b}, [x1], x3
119    st1       {v0.8b, v1.8b}, [x1], x3
120    st1       {v0.8b, v1.8b}, [x1], x3
121    st1       {v0.8b, v1.8b}, [x1], x3
122    st1       {v0.8b, v1.8b}, [x1], x3
123    st1       {v0.8b, v1.8b}, [x1], x3
124    st1       {v0.8b, v1.8b}, [x1], x3
125    st1       {v0.8b, v1.8b}, [x1], x3
126    st1       {v0.8b, v1.8b}, [x1], x3
127    st1       {v0.8b, v1.8b}, [x1], x3
128    st1       {v0.8b, v1.8b}, [x1], x3
129    st1       {v0.8b, v1.8b}, [x1], x3
130    st1       {v0.8b, v1.8b}, [x1], x3
131    st1       {v0.8b, v1.8b}, [x1], x3
132    st1       {v0.8b, v1.8b}, [x1], x3
133
134    pop_v_regs
135    ret
136
137
138
139
140
141///******************************************************************************
142
143
144///**
145//*******************************************************************************
146//*
147//*ih264_intra_pred_luma_16x16_mode_horz
148//*
149//* @brief
150//*  Perform Intra prediction for  luma_16x16 mode:horizontal
151//*
152//* @par Description:
153//*  Perform Intra prediction for  luma_16x16 mode:horizontal ,described in sec 8.3.3.2
154//*
155//* @param[in] pu1_src
156//*  UWORD8 pointer to the source
157//*
158//* @param[out] pu1_dst
159//*  UWORD8 pointer to the destination
160//*
161//* @param[in] src_strd
162//*  integer source stride
163//*
164//* @param[in] dst_strd
165//*  integer destination stride
166//*
167//* @param[in] ui_neighboravailability
168//* availability of neighbouring pixels(Not used in this function)
169//*
170//* @returns
171//*
172//* @remarks
173//*  None
174//*
175//*******************************************************************************
176//*/
177//void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
178//                                         UWORD8 *pu1_dst,
179//                                         WORD32 src_strd,
180//                                         WORD32 dst_strd,
181//                                         WORD32 ui_neighboravailability)
182//**************Variables Vs Registers*****************************************
183//    x0 => *pu1_src
184//    x1 => *pu1_dst
185//    w2 =>  src_strd
186//    w3 =>  dst_strd
187//    w4 =>  ui_neighboravailability
188
189    .global ih264_intra_pred_luma_16x16_mode_horz_av8
190
191ih264_intra_pred_luma_16x16_mode_horz_av8:
192
193
194
195    push_v_regs
196    sxtw      x3, w3
197
198    ld1       {v0.16b}, [x0]
199
200
201
202    dup       v10.16b, v0.b[15]
203    dup       v11.16b, v0.b[14]
204    dup       v12.16b, v0.b[13]
205    dup       v13.16b, v0.b[12]
206    st1       {v10.16b}, [x1], x3
207    dup       v14.16b, v0.b[11]
208    st1       {v11.16b}, [x1], x3
209    dup       v15.16b, v0.b[10]
210    st1       {v12.16b}, [x1], x3
211    dup       v16.16b, v0.b[9]
212    st1       {v13.16b}, [x1], x3
213    dup       v17.16b, v0.b[8]
214    st1       {v14.16b}, [x1], x3
215    dup       v18.16b, v0.b[7]
216    st1       {v15.16b}, [x1], x3
217    dup       v19.16b, v0.b[6]
218    st1       {v16.16b}, [x1], x3
219    dup       v20.16b, v0.b[5]
220    st1       {v17.16b}, [x1], x3
221    dup       v21.16b, v0.b[4]
222    st1       {v18.16b}, [x1], x3
223    dup       v22.16b, v0.b[3]
224    st1       {v19.16b}, [x1], x3
225    dup       v23.16b, v0.b[2]
226    st1       {v20.16b}, [x1], x3
227    dup       v24.16b, v0.b[1]
228    st1       {v21.16b}, [x1], x3
229    dup       v25.16b, v0.b[0]
230    st1       {v22.16b}, [x1], x3
231    st1       {v23.16b}, [x1], x3
232    st1       {v24.16b}, [x1], x3
233    st1       {v25.16b}, [x1], x3
234
235    pop_v_regs
236    ret
237
238
239
240
241
242
243
244///******************************************************************************
245
246
247///**
248//*******************************************************************************
249//*
250//*ih264_intra_pred_luma_16x16_mode_dc
251//*
252//* @brief
253//*  Perform Intra prediction for  luma_16x16 mode:DC
254//*
255//* @par Description:
256//*  Perform Intra prediction for  luma_16x16 mode:DC ,described in sec 8.3.3.3
257//*
258//* @param[in] pu1_src
259//*  UWORD8 pointer to the source
260//*
261//* @param[out] pu1_dst
262//*  UWORD8 pointer to the destination
263//*
264//* @param[in] src_strd
265//*  integer source stride
266//*
267//* @param[in] dst_strd
268//*  integer destination stride
269//*
270//* @param[in] ui_neighboravailability
271//*  availability of neighbouring pixels
272//*
273//* @returns
274//*
275//* @remarks
276//*  None
277//*
278//*******************************************************************************/
279//void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
280//                                       UWORD8 *pu1_dst,
281//                                       WORD32 src_strd,
282//                                       WORD32 dst_strd,
283//                                       WORD32 ui_neighboravailability)
284
285//**************Variables Vs Registers*****************************************
286//    x0 => *pu1_src
287//    x1 => *pu1_dst
288//    w2 =>  src_strd
289//    w3 =>  dst_strd
290//    w4 =>  ui_neighboravailability
291
292    .global ih264_intra_pred_luma_16x16_mode_dc_av8
293
294ih264_intra_pred_luma_16x16_mode_dc_av8:
295
296
297
298    push_v_regs
299    stp       x19, x20, [sp, #-16]!
300    sxtw      x3, w3
301
302    sub       v0.16b, v0.16b, v0.16b
303    sub       v1.16b, v1.16b, v1.16b
304    mov       w10, #0
305    mov       w11 , #3
306    ands      w6, w4, #0x01
307    beq       top_available             //LEFT NOT AVAILABLE
308    ld1       {v0.16b}, [x0]
309    add       w10, w10, #8
310    add       w11, w11, #1
311top_available:
312    ands      w6, w4, #0x04
313    beq       none_available
314    add       x6, x0, #17
315    ld1       {v1.16b}, [x6]
316    add       w10, w10, #8
317    add       w11, w11, #1
318    b         summation
319none_available:
320    cmp       w4, #0
321    bne       summation
322    mov       w15, #128
323    dup       v20.16b, w15
324    b         store
325summation:
326    uaddl     v2.8h, v0.8b, v1.8b
327    uaddl2    v3.8h, v0.16b, v1.16b
328    dup       v10.8h, w10
329    neg       w11, w11
330    dup       v20.8h, w11
331    add       v0.8h, v2.8h, v3.8h
332    mov       v1.d[0], v0.d[1]
333    add       v0.4h, v0.4h, v1.4h
334    addp      v0.4h, v0.4h , v0.4h
335    addp      v0.4h, v0.4h , v0.4h
336    add       v0.4h, v0.4h, v10.4h
337    uqshl     v0.8h, v0.8h, v20.8h
338    sqxtun    v0.8b, v0.8h
339    dup       v20.16b, v0.b[0]
340
341store:
342
343    st1       { v20.16b}, [x1], x3
344    st1       { v20.16b}, [x1], x3
345    st1       { v20.16b}, [x1], x3
346    st1       { v20.16b}, [x1], x3
347    st1       { v20.16b}, [x1], x3
348    st1       { v20.16b}, [x1], x3
349    st1       { v20.16b}, [x1], x3
350    st1       { v20.16b}, [x1], x3
351    st1       { v20.16b}, [x1], x3
352    st1       { v20.16b}, [x1], x3
353    st1       { v20.16b}, [x1], x3
354    st1       { v20.16b}, [x1], x3
355    st1       { v20.16b}, [x1], x3
356    st1       { v20.16b}, [x1], x3
357    st1       { v20.16b}, [x1], x3
358    st1       { v20.16b}, [x1], x3
359
360
361
362end_func:
363
364    ldp       x19, x20, [sp], #16
365    pop_v_regs
366    ret
367
368
369
370
371
372///******************************************************************************
373
374
375///**
376//*******************************************************************************
377//*
378//*ih264_intra_pred_luma_16x16_mode_plane
379//*
380//* @brief
381//*  Perform Intra prediction for  luma_16x16 mode:PLANE
382//*
383//* @par Description:
384//*  Perform Intra prediction for  luma_16x16 mode:PLANE ,described in sec 8.3.3.4
385//*
386//* @param[in] pu1_src
387//*  UWORD8 pointer to the source
388//*
389//* @param[out] pu1_dst
390//*  UWORD8 pointer to the destination
391//*
392//* @param[in] src_strd
393//*  integer source stride
394//*
395//* @param[in] dst_strd
396//*  integer destination stride
397//*
398//* @param[in] ui_neighboravailability
399//*  availability of neighbouring pixels
400//*
401//* @returns
402//*
403//* @remarks
404//*  None
405//*
406//*******************************************************************************/
407//void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
408//                                        UWORD8 *pu1_dst,
409//                                        WORD32 src_strd,
410//                                        WORD32 dst_strd,
411//                                        WORD32 ui_neighboravailability)
412
413//**************Variables Vs Registers*****************************************
414//    x0 => *pu1_src
415//    x1 => *pu1_dst
416//    w2 =>  src_strd
417//    w3 =>  dst_strd
418//    w4 =>  ui_neighboravailability
419
420    .global ih264_intra_pred_luma_16x16_mode_plane_av8
421ih264_intra_pred_luma_16x16_mode_plane_av8:
422
423    push_v_regs
424    stp       x19, x20, [sp, #-16]!
425    sxtw      x3, w3
426    mov       x2, x1
427    add       x1, x0, #17
428    add       x0, x0, #15
429    mov       x8, #9
430    sub       x1, x1, #1
431    mov       x10, x1                   //top_left
432    mov       x4, #-1
433    ld1       {v2.2s}, [x1], x8
434
435    adrp      x7, :got:ih264_gai1_intrapred_luma_plane_coeffs
436    ldr       x7, [x7, #:got_lo12:ih264_gai1_intrapred_luma_plane_coeffs]
437
438    ld1       {v0.2s}, [x1]
439    rev64     v2.8b, v2.8b
440    ld1       {v6.2s, v7.2s}, [x7]
441    usubl     v0.8h, v0.8b, v2.8b
442    uxtl      v16.8h, v6.8b
443    mul       v0.8h, v0.8h , v16.8h
444    uxtl      v18.8h, v7.8b
445    add       x7, x0, x4, lsl #3
446    sub       x0, x7, x4, lsl #1
447    neg       x14, x4
448    addp      v0.8h, v0.8h, v1.8h
449    ldrb      w8, [x7], #-1
450    ldrb      w9, [x0], #1
451    saddlp    v0.2s, v0.4h
452    sub       w12, w8, w9
453    ldrb      w8, [x7], #-1
454    saddlp    v0.1d, v0.2s
455    ldrb      w9, [x0], #1
456    sub       w8, w8, w9
457    shl       v2.2s, v0.2s, #2
458    add       w12, w12, w8, lsl #1
459    add       v0.2s, v0.2s , v2.2s
460    ldrb      w8, [x7], #-1
461    ldrb      w9, [x0], #1
462    srshr     v0.2s, v0.2s, #6          // i_b = D0[0]
463    sub       w8, w8, w9
464    ldrb      w5, [x7], #-1
465    add       w8, w8, w8, lsl #1
466    dup       v4.8h, v0.h[0]
467    add       w12, w12, w8
468    ldrb      w9, [x0], #1
469    mul       v0.8h, v4.8h , v16.8h
470    sub       w5, w5, w9
471    mul       v2.8h, v4.8h , v18.8h
472    add       w12, w12, w5, lsl #2
473    ldrb      w8, [x7], #-1
474    ldrb      w9, [x0], #1
475    sub       w8, w8, w9
476    ldrb      w5, [x7], #-1
477    add       w8, w8, w8, lsl #2
478    ldrb      w6, [x0], #1
479    add       w12, w12, w8
480    ldrb      w8, [x7], #-1
481    ldrb      w9, [x0], #1
482    sub       w5, w5, w6
483    sub       w8, w8, w9
484    add       w5, w5, w5, lsl #1
485    sub       w20, w8, w8, lsl #3
486    neg       w8, w20
487    add       w12, w12, w5, lsl #1
488    ldrb      w5, [x7], #-1
489    ldrb      w6, [x10]                 //top_left
490    add       w12, w12, w8
491    sub       w9, w5, w6
492    ldrb      w6, [x1, #7]
493    add       w12, w12, w9, lsl #3      // i_c = w12
494    add       w8, w5, w6
495    add       w12, w12, w12, lsl #2
496    lsl       w8, w8, #4                // i_a = w8
497    add       w12, w12, #0x20
498    lsr       w12, w12, #6
499    shl       v28.8h, v4.8h, #3
500    dup       v6.8h, w12
501    dup       v30.8h, w8
502    shl       v26.8h, v6.8h, #3
503    sub       v30.8h, v30.8h , v28.8h
504    sub       v30.8h, v30.8h , v26.8h
505    add       v28.8h, v30.8h , v6.8h
506    add       v26.8h, v28.8h , v0.8h
507    add       v28.8h, v28.8h , v2.8h
508    sqrshrun  v20.8b, v26.8h, #5
509    sqrshrun  v21.8b, v28.8h, #5
510    add       v26.8h, v26.8h , v6.8h
511    add       v28.8h, v28.8h , v6.8h
512    sqrshrun  v22.8b, v26.8h, #5
513    st1       {v20.2s, v21.2s}, [x2], x3
514    sqrshrun  v23.8b, v28.8h, #5
515    add       v26.8h, v26.8h , v6.8h
516    add       v28.8h, v28.8h , v6.8h
517    sqrshrun  v20.8b, v26.8h, #5
518    st1       {v22.2s, v23.2s}, [x2], x3
519    sqrshrun  v21.8b, v28.8h, #5
520    add       v26.8h, v26.8h , v6.8h
521    add       v28.8h, v28.8h , v6.8h
522    sqrshrun  v22.8b, v26.8h, #5
523    st1       {v20.2s, v21.2s}, [x2], x3
524    sqrshrun  v23.8b, v28.8h, #5
525    add       v26.8h, v26.8h , v6.8h
526    add       v28.8h, v28.8h , v6.8h
527    sqrshrun  v20.8b, v26.8h, #5
528    st1       {v22.2s, v23.2s}, [x2], x3
529    sqrshrun  v21.8b, v28.8h, #5
530    add       v26.8h, v26.8h , v6.8h
531    add       v28.8h, v28.8h , v6.8h
532    sqrshrun  v22.8b, v26.8h, #5
533    st1       {v20.2s, v21.2s}, [x2], x3
534    sqrshrun  v23.8b, v28.8h, #5
535    add       v26.8h, v26.8h , v6.8h
536    add       v28.8h, v28.8h , v6.8h
537    sqrshrun  v20.8b, v26.8h, #5
538    st1       {v22.2s, v23.2s}, [x2], x3
539    sqrshrun  v21.8b, v28.8h, #5
540    add       v26.8h, v26.8h , v6.8h
541    add       v28.8h, v28.8h , v6.8h
542    sqrshrun  v22.8b, v26.8h, #5
543    st1       {v20.2s, v21.2s}, [x2], x3
544    sqrshrun  v23.8b, v28.8h, #5
545    add       v26.8h, v26.8h , v6.8h
546    add       v28.8h, v28.8h , v6.8h
547    sqrshrun  v20.8b, v26.8h, #5
548    st1       {v22.2s, v23.2s}, [x2], x3
549    sqrshrun  v21.8b, v28.8h, #5
550    add       v26.8h, v26.8h , v6.8h
551    add       v28.8h, v28.8h , v6.8h
552    sqrshrun  v22.8b, v26.8h, #5
553    st1       {v20.2s, v21.2s}, [x2], x3
554    sqrshrun  v23.8b, v28.8h, #5
555    add       v26.8h, v26.8h , v6.8h
556    add       v28.8h, v28.8h , v6.8h
557    sqrshrun  v20.8b, v26.8h, #5
558    st1       {v22.2s, v23.2s}, [x2], x3
559    sqrshrun  v21.8b, v28.8h, #5
560    add       v26.8h, v26.8h , v6.8h
561    add       v28.8h, v28.8h , v6.8h
562    sqrshrun  v22.8b, v26.8h, #5
563    st1       {v20.2s, v21.2s}, [x2], x3
564    sqrshrun  v23.8b, v28.8h, #5
565    add       v26.8h, v26.8h , v6.8h
566    add       v28.8h, v28.8h , v6.8h
567    sqrshrun  v20.8b, v26.8h, #5
568    st1       {v22.2s, v23.2s}, [x2], x3
569    sqrshrun  v21.8b, v28.8h, #5
570    add       v26.8h, v26.8h , v6.8h
571    add       v28.8h, v28.8h , v6.8h
572    sqrshrun  v22.8b, v26.8h, #5
573    st1       {v20.2s, v21.2s}, [x2], x3
574    sqrshrun  v23.8b, v28.8h, #5
575    add       v26.8h, v26.8h , v6.8h
576    add       v28.8h, v28.8h , v6.8h
577    sqrshrun  v20.8b, v26.8h, #5
578    st1       {v22.2s, v23.2s}, [x2], x3
579    sqrshrun  v21.8b, v28.8h, #5
580    add       v26.8h, v26.8h , v6.8h
581    add       v28.8h, v28.8h , v6.8h
582    sqrshrun  v22.8b, v26.8h, #5
583    st1       {v20.2s, v21.2s}, [x2], x3
584    sqrshrun  v23.8b, v28.8h, #5
585    st1       {v22.2s, v23.2s}, [x2], x3
586
587end_func_plane:
588
589    ldp       x19, x20, [sp], #16
590    pop_v_regs
591    ret
592
593