1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
@**
@******************************************************************************
@* @file
@*  ih264_intra_pred_luma_16x16_a9q.s
@*
@* @brief
@*  Contains function definitions for intra 16x16 luma prediction.
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*
@*  - ih264_intra_pred_luma_16x16_mode_vert_a9q()
@*  - ih264_intra_pred_luma_16x16_mode_horz_a9q()
@*  - ih264_intra_pred_luma_16x16_mode_dc_a9q()
@*  - ih264_intra_pred_luma_16x16_mode_plane_a9q()
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*

@* All the functions here are replicated from ih264_intra_pred_filters.c
@

@**
@**
@**
@
51
@ Everything in this file is emitted into the code section, word aligned.
.text
.p2align 2


@ Coefficient table used by the plane predictor. It is defined outside this
@ file and referenced with hidden visibility.
    .extern ih264_gai1_intrapred_luma_plane_coeffs
.hidden ih264_gai1_intrapred_luma_plane_coeffs

@ Position-independent reference to the coefficient table: this word holds
@ the distance from (scrlbl1 + 8) to the table. The plane predictor loads it
@ and executes "add r7, r7, pc" at scrlbl1; in ARM state pc reads as the
@ address of that instruction plus 8, hence the "- 8" correction here.
scratch_intrapred_addr1:
    .long ih264_gai1_intrapred_luma_plane_coeffs - scrlbl1 - 8
@**
@*******************************************************************************
@*
@* ih264_intra_pred_luma_16x16_mode_vert_a9q
@*
@* @brief
@*  Perform intra prediction for luma_16x16 mode: vertical
@*
@* @par Description:
@*  Perform intra prediction for luma_16x16 mode: vertical, described in
@*  sec 8.3.3.1. The 16 bytes at pu1_src + 17 are loaded once and written
@*  unchanged to each of the 16 destination rows.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source; bytes 17..32 are read
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination; 16 rows of 16 bytes are written
@*
@* @param[in] src_strd
@*  integer source stride (not used in this function)
@*
@* @param[in] dst_strd
@*  integer destination stride, in bytes
@*
@* @param[in] ui_neighboravailability
@*  availability of neighbouring pixels (not used in this function)
@*
@* @returns
@*  None
@*
@* @remarks
@*  Leaf routine. Only r0, r1 and q0 are modified, so no registers are saved
@*  and the stack is not touched.
@*
@*******************************************************************************
@void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src,
@                                        UWORD8 *pu1_dst,
@                                        WORD32 src_strd,
@                                        WORD32 dst_strd,
@                                        WORD32 ui_neighboravailability)

@**************Variables Vs Registers*****************************************
@   r0   => *pu1_src
@   r1   => *pu1_dst
@   r2   =>  src_strd                 (unused)
@   r3   =>  dst_strd
@   [sp] =>  ui_neighboravailability  (unused, never loaded)


    .global ih264_intra_pred_luma_16x16_mode_vert_a9q
    .type   ih264_intra_pred_luma_16x16_mode_vert_a9q, %function

ih264_intra_pred_luma_16x16_mode_vert_a9q:

    add           r0, r0, #17           @ r0 = pu1_src + 17
    vld1.8        {q0}, [r0]            @ q0 = the 16 predictor bytes

    @ Replicate the same 16 bytes into all 16 rows; r1 advances by dst_strd
    @ after every store.
    vst1.8        {q0}, [r1], r3        @ row 0
    vst1.8        {q0}, [r1], r3        @ row 1
    vst1.8        {q0}, [r1], r3        @ row 2
    vst1.8        {q0}, [r1], r3        @ row 3
    vst1.8        {q0}, [r1], r3        @ row 4
    vst1.8        {q0}, [r1], r3        @ row 5
    vst1.8        {q0}, [r1], r3        @ row 6
    vst1.8        {q0}, [r1], r3        @ row 7
    vst1.8        {q0}, [r1], r3        @ row 8
    vst1.8        {q0}, [r1], r3        @ row 9
    vst1.8        {q0}, [r1], r3        @ row 10
    vst1.8        {q0}, [r1], r3        @ row 11
    vst1.8        {q0}, [r1], r3        @ row 12
    vst1.8        {q0}, [r1], r3        @ row 13
    vst1.8        {q0}, [r1], r3        @ row 14
    vst1.8        {q0}, [r1], r3        @ row 15

    bx            lr                    @ nothing was pushed, plain return


@******************************************************************************

@**
@*******************************************************************************
@*
@* ih264_intra_pred_luma_16x16_mode_horz_a9q
@*
@* @brief
@*  Perform intra prediction for luma_16x16 mode: horizontal
@*
@* @par Description:
@*  Perform intra prediction for luma_16x16 mode: horizontal, described in
@*  sec 8.3.3.2. The 16 bytes at pu1_src are consumed from index 15 down to
@*  index 0: destination row y is filled with 16 copies of pu1_src[15 - y].
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source; bytes 0..15 are read
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination; 16 rows of 16 bytes are written
@*
@* @param[in] src_strd
@*  integer source stride (not used; its register is recycled as a counter)
@*
@* @param[in] dst_strd
@*  integer destination stride, in bytes
@*
@* @param[in] ui_neighboravailability
@*  availability of neighbouring pixels (not used in this function)
@*
@* @returns
@*  None
@*
@* @remarks
@*  Leaf routine. Only r1, r2, q0-q2 and the flags are modified, so no
@*  registers are saved and the stack is not touched.
@*
@*******************************************************************************
@*
@void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
@                                         UWORD8 *pu1_dst,
@                                         WORD32 src_strd,
@                                         WORD32 dst_strd,
@                                         WORD32 ui_neighboravailability)
@**************Variables Vs Registers*****************************************
@   r0   => *pu1_src
@   r1   => *pu1_dst
@   r2   =>  src_strd                 (unused, overwritten by loop counter)
@   r3   =>  dst_strd
@   [sp] =>  ui_neighboravailability  (unused, never loaded)

    .global ih264_intra_pred_luma_16x16_mode_horz_a9q
    .type   ih264_intra_pred_luma_16x16_mode_horz_a9q, %function

ih264_intra_pred_luma_16x16_mode_horz_a9q:

    vld1.u8       {q0}, [r0]            @ q0 = pu1_src[0..15]
    mov           r2, #14               @ 7 loop passes, 2 rows per pass

    vdup.u8       q1, d1[7]             @ q1 = 16 x pu1_src[15]  (row 0)
    vdup.u8       q2, d1[6]             @ q2 = 16 x pu1_src[14]  (row 1)
    vst1.8        {q1}, [r1], r3        @ store row 0

    @ Software-pipelined: each pass stores the odd row prepared earlier,
    @ prepares the next two rows and stores the even one. On entry q2 always
    @ holds the next row to be written.
.Lhorz_row_pair:
    vext.8        q0, q0, q0, #14       @ rotate so the next two source bytes
                                        @ land in d1[7] and d1[6]
    vst1.8        {q2}, [r1], r3        @ store pending odd row
    vdup.u8       q1, d1[7]             @ next even row
    subs          r2, r2, #2
    vdup.u8       q2, d1[6]             @ next odd row
    vst1.8        {q1}, [r1], r3        @ store even row
    bne           .Lhorz_row_pair

    vst1.8        {q2}, [r1], r3        @ row 15 = 16 x pu1_src[0]

    bx            lr                    @ nothing was pushed, plain return


@******************************************************************************

@**
@*******************************************************************************
@*
@* ih264_intra_pred_luma_16x16_mode_dc_a9q
@*
@* @brief
@*  Perform intra prediction for luma_16x16 mode: DC
@*
@* @par Description:
@*  Perform intra prediction for luma_16x16 mode: DC, described in
@*  sec 8.3.3.3. Every destination byte receives the same value:
@*    left and top available : (sum(left) + sum(top) + 16) >> 5
@*    exactly one available  : (sum(that edge) + 8) >> 4
@*    neither available      : 128
@*  where "left" is pu1_src[0..15] and "top" is pu1_src[17..32].
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (neighbouring pixels)
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination; 16 rows of 16 bytes are written
@*
@* @param[in] src_strd
@*  integer source stride (not used in this function)
@*
@* @param[in] dst_strd
@*  integer destination stride, in bytes
@*
@* @param[in] ui_neighboravailability
@*  availability of neighbouring pixels: bit 0 (0x01) = left available,
@*  bit 2 (0x04) = top available; other bits are ignored here
@*
@* @returns
@*  None
@*
@* @remarks
@*  Leaf routine. Only r0, r1, r12, q0, q1 and the flags are modified, so no
@*  registers are saved and the stack is only read.
@*
@*******************************************************************************
@void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
@                                       UWORD8 *pu1_dst,
@                                       WORD32 src_strd,
@                                       WORD32 dst_strd,
@                                       WORD32 ui_neighboravailability)

@**************Variables Vs Registers*****************************************
@   r0   => *pu1_src
@   r1   => *pu1_dst
@   r2   =>  src_strd                 (unused)
@   r3   =>  dst_strd
@   [sp] =>  ui_neighboravailability  (loaded into r12)

    .global ih264_intra_pred_luma_16x16_mode_dc_a9q
    .type   ih264_intra_pred_luma_16x16_mode_dc_a9q, %function

ih264_intra_pred_luma_16x16_mode_dc_a9q:

    ldr           r12, [sp]             @ r12 = ui_neighboravailability
                                        @ (5th argument, first stack word)

    tst           r12, #0x01            @ left available?
    beq           .Ldc_left_missing
    tst           r12, #0x04            @ left is available; top as well?
    beq           .Ldc_single_edge      @ no: average the left edge at r0

    @ Both edges available: average all 32 neighbours.
    vld1.u8       {q0}, [r0]            @ q0 = left edge
    add           r0, r0, #17
    vpaddl.u8     q0, q0                @ 8 x u16 pair sums of the left edge
    vld1.u8       {q1}, [r0]            @ q1 = top edge
    vpaddl.u8     q1, q1                @ 8 x u16 pair sums of the top edge
    vadd.u16      q0, q0, q1
    vadd.u16      d0, d0, d1            @ fold 8 lanes to 4
    vpaddl.u16    d0, d0                @ fold 4 lanes to 2
    vpaddl.u32    d0, d0                @ total sum in the low lane of d0
    vqrshrun.s16  d0, q0, #5            @ d0[0] = (sum + 16) >> 5
    vdup.u8       q0, d0[0]
    b             .Ldc_store

.Ldc_left_missing:
    tst           r12, #0x04            @ left missing; is top available?
    beq           .Ldc_none
    add           r0, r0, #17           @ point r0 at the top edge and fall
                                        @ through to the shared 16-byte path

.Ldc_single_edge:                       @ r0 -> the one available edge
    vld1.u8       {q0}, [r0]
    vpaddl.u8     q0, q0                @ 8 x u16 pair sums
    vadd.u16      d0, d0, d1            @ fold 8 lanes to 4
    vpaddl.u16    d0, d0                @ fold 4 lanes to 2
    vpaddl.u32    d0, d0                @ total sum in the low lane of d0
    vqrshrun.s16  d0, q0, #4            @ d0[0] = (sum + 8) >> 4
    vdup.u8       q0, d0[0]
    b             .Ldc_store

.Ldc_none:                              @ no neighbours: fixed mid-grey
    vmov.u8       q0, #128

.Ldc_store:                             @ q0 = DC value in every byte lane
    vst1.8        {q0}, [r1], r3        @ row 0
    vst1.8        {q0}, [r1], r3        @ row 1
    vst1.8        {q0}, [r1], r3        @ row 2
    vst1.8        {q0}, [r1], r3        @ row 3
    vst1.8        {q0}, [r1], r3        @ row 4
    vst1.8        {q0}, [r1], r3        @ row 5
    vst1.8        {q0}, [r1], r3        @ row 6
    vst1.8        {q0}, [r1], r3        @ row 7
    vst1.8        {q0}, [r1], r3        @ row 8
    vst1.8        {q0}, [r1], r3        @ row 9
    vst1.8        {q0}, [r1], r3        @ row 10
    vst1.8        {q0}, [r1], r3        @ row 11
    vst1.8        {q0}, [r1], r3        @ row 12
    vst1.8        {q0}, [r1], r3        @ row 13
    vst1.8        {q0}, [r1], r3        @ row 14
    vst1.8        {q0}, [r1], r3        @ row 15

    bx            lr                    @ nothing was pushed, plain return


@******************************************************************************

@**
@*******************************************************************************
@*
@* ih264_intra_pred_luma_16x16_mode_plane_a9q
@*
@* @brief
@*  Perform intra prediction for luma_16x16 mode: PLANE
@*
@* @par Description:
@*  Perform intra prediction for luma_16x16 mode: PLANE, described in
@*  sec 8.3.3.4. With s[] = pu1_src[] and k[] = the 16 byte entries of
@*  ih264_gai1_intrapred_luma_plane_coeffs, the code computes
@*
@*    H   = sum_{i=0..7} k[i]    * (s[25 + i] - s[23 - i])
@*    V   = sum_{i=0..7} (i + 1) * (s[7 - i]  - s[9 + i])
@*    i_a = 16 * (s[0] + s[32])
@*    i_b = (5 * H + 32) >> 6
@*    i_c = (5 * V + 32) >> 6
@*
@*    dst[y][x] = sat_u8((i_a - 8*i_b - 7*i_c + i_b*k[x] + y*i_c + 16) >> 5)
@*
@*  for x, y in 0..15. The table contents are defined outside this file;
@*  presumably k[x] = x + 1, which makes the last line the standard
@*  (i_a + i_b*(x-7) + i_c*(y-7) + 16) >> 5 -- confirm against the table.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source; bytes 0..32 are read
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination; 16 rows of 16 bytes are written
@*
@* @param[in] src_strd
@*  integer source stride (not used; its register holds the dst pointer)
@*
@* @param[in] dst_strd
@*  integer destination stride, in bytes
@*
@* @param[in] ui_neighboravailability
@*  availability of neighbouring pixels (never read by this function)
@*
@* @returns
@*  None
@*
@* @remarks
@*  The scalar (V) and NEON (H) computations are deliberately interleaved;
@*  keep the instruction order when editing. r4-r10 and lr are saved and
@*  restored. r12 is used as a scratch accumulator and is not preserved.
@*  NEON registers used: q0-q3, q8-q11, q13-q15.
@*
@*******************************************************************************
@void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
@                                        UWORD8 *pu1_dst,
@                                        WORD32 src_strd,
@                                        WORD32 dst_strd,
@                                        WORD32 ui_neighboravailability)

@**************Variables Vs Registers*****************************************
@   r0   => *pu1_src
@   r1   => *pu1_dst
@   r2   =>  src_strd                 (unused, overwritten with pu1_dst)
@   r3   =>  dst_strd
@   [sp] =>  ui_neighboravailability  (unused, never loaded)

    .global ih264_intra_pred_luma_16x16_mode_plane_a9q
    .type   ih264_intra_pred_luma_16x16_mode_plane_a9q, %function
ih264_intra_pred_luma_16x16_mode_plane_a9q:

    stmfd         sp!, {r4-r10, lr}     @ callee-saved scratch + return address

    mov           r2, r1                @ r2 = pu1_dst (r1 becomes a src pointer)
    add           r1, r0, #17           @ r1 = pu1_src + 17
    add           r0, r0, #15           @ r0 = pu1_src + 15

    mov           r8, #9                @ post-increment for the first vld1
    sub           r1, r1, #1            @ r1 = pu1_src + 16
    mov           r10, r1               @top_left: r10 = pu1_src + 16
    mov           r4, #-1               @ step for the downward byte walk
    vld1.32       d2, [r1], r8          @ d2 = s[16..23]; r1 = pu1_src + 25
    ldr           r7, scratch_intrapred_addr1
scrlbl1:
    add           r7, r7, pc            @ r7 = &ih264_gai1_intrapred_luma_plane_coeffs
                                        @ (pc reads as scrlbl1 + 8 in ARM state)

    vld1.32       d0, [r1]              @ d0 = s[25..32]
    vrev64.8      d2, d2                @ d2 lane i = s[23 - i]
    vld1.32       {q3}, [r7]            @ q3 = 16 coefficient bytes k[0..15]
    vsubl.u8      q0, d0, d2            @ q0 lane i = s[25 + i] - s[23 - i]
    vmovl.u8      q8, d6                @ q8 = k[0..7]  widened to 16 bit
    vmul.s16      q0, q0, q8            @ weighted horizontal differences
    vmovl.u8      q9, d7                @ q9 = k[8..15] widened to 16 bit

    add           r7, r0, r4, lsl #3    @ r7 = pu1_src + 7, walks down (step r4)
    sub           r0, r7, r4, lsl #1    @ r0 = pu1_src + 9, walks up   (step lr)
    neg           lr, r4                @ lr = +1

    vpadd.s16     d0, d0, d1            @ H: 8 products -> 4 partial sums

    ldrb          r8, [r7], r4          @ s[7]
    ldrb          r9, [r0], lr          @ s[9]

    vpaddl.s16    d0, d0                @ H: 4 -> 2 (32 bit)
    sub           r12, r8, r9           @ V  = 1 * (s[7] - s[9])

    ldrb          r8, [r7], r4          @ s[6]

    vpaddl.s32    d0, d0                @ H: 2 -> 1, low word of d0 = H
    ldrb          r9, [r0], lr          @ s[10]
    sub           r8, r8, r9
    vshl.s32      d2, d0, #2            @ 4 * H
    add           r12, r12, r8, lsl #1  @ V += 2 * (s[6] - s[10])

    vadd.s32      d0, d0, d2            @ 5 * H
    ldrb          r8, [r7], r4          @ s[5]
    ldrb          r9, [r0], lr          @ s[11]
    vrshr.s32     d0, d0, #6            @ i_b = D0[0] = (5 * H + 32) >> 6
    sub           r8, r8, r9
    ldrb          r5, [r7], r4          @ s[4]
    add           r8, r8, r8, lsl #1    @ 3 * (s[5] - s[11])

    vdup.16       q2, d0[0]             @ q2 = i_b in every 16-bit lane
    add           r12, r12, r8          @ V += 3 * (s[5] - s[11])
    ldrb          r9, [r0], lr          @ s[12]
    vmul.s16      q0, q2, q8            @ q0 = i_b * k[0..7]
    sub           r5, r5, r9
    vmul.s16      q1, q2, q9            @ q1 = i_b * k[8..15]
    add           r12, r12, r5, lsl #2  @ V += 4 * (s[4] - s[12])

    ldrb          r8, [r7], r4          @ s[3]
    ldrb          r9, [r0], lr          @ s[13]
    sub           r8, r8, r9
    ldrb          r5, [r7], r4          @ s[2]
    add           r8, r8, r8, lsl #2    @ 5 * (s[3] - s[13])
    ldrb          r6, [r0], lr          @ s[14]
    add           r12, r12, r8          @ V += 5 * (s[3] - s[13])
    ldrb          r8, [r7], r4          @ s[1]
    ldrb          r9, [r0], lr          @ s[15]

    sub           r5, r5, r6            @ s[2] - s[14]
    sub           r8, r8, r9            @ s[1] - s[15]
    add           r5, r5, r5, lsl #1    @ 3 * (s[2] - s[14])
    rsb           r8, r8, r8, lsl #3    @ 7 * (s[1] - s[15])
    add           r12, r12, r5, lsl #1  @ V += 6 * (s[2] - s[14])
    ldrb          r5, [r7], r4          @ s[0]
    ldrb          r6, [r10]             @top_left: s[16]
    add           r12, r12, r8          @ V += 7 * (s[1] - s[15])
    sub           r9, r5, r6            @ s[0] - s[16]
    ldrb          r6, [r1, #7]          @ s[32] (r1 = pu1_src + 25)
    add           r12, r12, r9, lsl #3  @ i_c = r12 (V complete: += 8 * (s[0] - s[16]))
    add           r8, r5, r6            @ s[0] + s[32]

    add           r12, r12, r12, lsl #2 @ 5 * V
    lsl           r8, r8, #4            @ i_a = r8 = 16 * (s[0] + s[32])

    add           r12, r12, #0x20       @ 5 * V + 32
    lsr           r12, r12, #6          @ logical shift, but only the low 16 bits
                                        @ are consumed by vdup.16 below and those
                                        @ equal the arithmetic-shift result

    vshl.s16      q14, q2, #3           @ 8 * i_b
    vdup.16       q3, r12               @ q3 = i_c in every 16-bit lane

    vdup.16       q15, r8               @ q15 = i_a
    vshl.s16      q13, q3, #3           @ 8 * i_c
    vsub.s16      q15, q15, q14         @ i_a - 8*i_b
    vsub.s16      q15, q15, q13         @ i_a - 8*i_b - 8*i_c
    vadd.s16      q14, q15, q3          @ i_a - 8*i_b - 7*i_c  (row 0 base)

    mov           r0, #14               @ 7 loop passes, 2 rows per pass
    vadd.s16      q13, q14, q0          @ row 0 sums, columns 0..7
    vadd.s16      q14, q14, q1          @ row 0 sums, columns 8..15
    vqrshrun.s16  d20, q13, #5          @ q10 = row 0: (sum + 16) >> 5, saturated to u8
    vqrshrun.s16  d21, q14, #5

    @ Each pass adds i_c to step one row down. q10 and q11 alternate so that a
    @ finished row is stored while the next one is being narrowed.
loop_16x16_plane:

    vadd.s16      q13, q13, q3
    vadd.s16      q14, q14, q3
    vqrshrun.s16  d22, q13, #5          @ q11 = next odd row
    vst1.32       {q10}, [r2], r3       @ store even row
    vqrshrun.s16  d23, q14, #5

    vadd.s16      q13, q13, q3
    subs          r0, #2
    vadd.s16      q14, q14, q3
    vqrshrun.s16  d20, q13, #5          @ q10 = next even row
    vst1.32       {q11}, [r2], r3       @ store odd row
    vqrshrun.s16  d21, q14, #5
    bne           loop_16x16_plane

    vadd.s16      q13, q13, q3
    vadd.s16      q14, q14, q3
    vqrshrun.s16  d22, q13, #5          @ q11 = row 15
    vst1.32       {q10}, [r2], r3       @ store row 14
    vqrshrun.s16  d23, q14, #5
    vst1.32       {q11}, [r2], r3       @ store row 15

    ldmfd         sp!, {r4-r10, pc}     @ restore registers and return
