1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21// *******************************************************************************
22// * @file
23// *  ih264_padding_neon.s
24// *
25// * @brief
// *  Contains function definitions for padding
27// *
28// * @author
29// *     Ittiam
30// *
31// * @par List of Functions:
32// *  - ih264_pad_top_av8()
33// *  - ih264_pad_left_luma_av8()
34// *  - ih264_pad_left_chroma_av8()
35// *  - ih264_pad_right_luma_av8()
36// *  - ih264_pad_right_chroma_av8()
37// *
38// * @remarks
39// *  None
40// *
41// *******************************************************************************
42//*/
43
44.text
45.p2align 2
46.include "ih264_neon_macros.s"
47///**
48//*******************************************************************************
49//*
50//* @brief pad at the top of a 2d array
51//*
52//* @par Description:
53//*  The top row of a 2d array is replicated for pad_size times at the top
54//*
55//* @param[in] pu1_src
56//*  UWORD8 pointer to the source
57//*
58//* @param[in] src_strd
59//*  integer source stride
60//*
61//* @param[in] wd
62//*  integer width of the array
63//*
64//* @param[in] pad_size
65//*  integer -padding size of the array
66//*
67//* @returns none
68//*
69//* @remarks none
70//*
71//*******************************************************************************
72//*/
73//void ih264_pad_top(UWORD8 *pu1_src,
74//                   WORD32 src_strd,
75//                   WORD32 wd,
76//                   WORD32 pad_size)
77//**************Variables Vs Registers*************************
78//    x0 => *pu1_src
79//    w1 => src_strd
80//    w2 => wd
81//    w3 => pad_size
82
83    .global ih264_pad_top_av8
84
85ih264_pad_top_av8:
86
87    // STMFD sp!, {x4-x11,x14}                //stack stores the values of the arguments
88    push_v_regs
89    sxtw      x1, w1
90    stp       x19, x20, [sp, #-16]!
91
92    sub       x5, x0, x1
93    neg       x6, x1
94
95loop_neon_memcpy_mul_16:
96    // Load 16 bytes
97    ld1       {v0.8b, v1.8b}, [x0], #16
98    mov       x4, x5
99    mov       w7, w3
100    add       x5, x5, #16
101
102loop_neon_pad_top:
103    st1       {v0.8b, v1.8b}, [x4], x6
104    subs      w7, w7, #1
105    bne       loop_neon_pad_top
106
107    subs      w2, w2, #16
108    bne       loop_neon_memcpy_mul_16
109
110    // LDMFD sp!,{x4-x11,pc}                //Reload the registers from SP
111    ldp       x19, x20, [sp], #16
112    pop_v_regs
113    ret
114
115
116
117
118///**
119//*******************************************************************************
120//*
121//* @brief
122//*   Padding (luma block) at the left of a 2d array
123//*
124//* @par Description:
125//*   The left column of a 2d array is replicated for pad_size times at the left
126//*
127//*
128//* @param[in] pu1_src
129//*  UWORD8 pointer to the source
130//*
131//* @param[in] src_strd
132//*  integer source stride
133//*
134//* @param[in] ht
135//*  integer height of the array
136//*
137//* @param[in] wd
138//*  integer width of the array
139//*
140//* @param[in] pad_size
141//*  integer -padding size of the array
142//*
143//* @param[in] ht
144//*  integer height of the array
145//*
146//* @param[in] wd
147//*  integer width of the array
148//*
149//* @returns
150//*
151//* @remarks
152//*  None
153//*
154//*******************************************************************************
155//*/
156//#if PAD_LEFT_LUMA == C
157//void ih264_pad_left_luma(UWORD8 *pu1_src,
158//                        WORD32 src_strd,
159//                        WORD32 ht,
160//                        WORD32 pad_size)
161//**************Variables Vs Registers*************************
162//    x0 => *pu1_src
163//    w1 => src_strd
164//    w2 => ht
165//    w3 => pad_size
166
167
168
169    .global ih264_pad_left_luma_av8
170
171ih264_pad_left_luma_av8:
172
173    // STMFD sp!, {x4-x11,x14}                //stack stores the values of the arguments
174    push_v_regs
175    sxtw      x1, w1
176    sxtw      x3, w3
177    stp       x19, x20, [sp, #-16]!
178
179
180    sub       x4, x0, x3
181    sub       x6, x1, #16
182    subs      x5, x3, #16
183    bne       loop_32
184loop_16:                                //  /*hard coded for width=16  ,height =8,16*/
185    ldrb      w8, [x0]
186    add       x0, x0, x1
187    ldrb      w9, [x0]
188    add       x0, x0, x1
189    dup       v0.16b, w8
190    ldrb      w10, [x0]
191    add       x0, x0, x1
192    st1       {v0.16b}, [x4], x1        // 16 bytes store
193    dup       v2.16b, w9
194    st1       {v2.16b}, [x4], x1        // 16 bytes store
195    ldrb      w11, [x0]
196    add       x0, x0, x1
197    dup       v4.16b, w10
198    dup       v6.16b, w11
199    st1       {v4.16b}, [x4], x1        // 16 bytes store
200    ldrb      w8, [x0]
201    add       x0, x0, x1
202    st1       {v6.16b}, [x4], x1        // 16 bytes store
203    ldrb      w9, [x0]
204    add       x0, x0, x1
205    dup       v0.16b, w8
206    ldrb      w10, [x0]
207    add       x0, x0, x1
208    st1       {v0.16b}, [x4], x1        // 16 bytes store
209    dup       v2.16b, w9
210    ldrb      w11, [x0]
211    add       x0, x0, x1
212    st1       {v2.16b}, [x4], x1        // 16 bytes store
213    dup       v4.16b, w10
214    dup       v6.16b, w11
215    subs      w2, w2, #8
216    st1       {v4.16b}, [x4], x1        // 16 bytes store
217    st1       {v6.16b}, [x4], x1        // 16 bytes store
218    bne       loop_16
219    b         end_func
220
221loop_32:                                //  /*hard coded for width=32 ,height =8,16*/
222    ldrb      w8, [x0]
223    add       x0, x0, x1
224    ldrb      w9, [x0]
225    add       x0, x0, x1
226    dup       v0.16b, w8
227    ldrb      w10, [x0]
228    add       x0, x0, x1
229    st1       {v0.16b}, [x4], #16       // 16 bytes store
230    dup       v2.16b, w9
231    st1       {v0.16b}, [x4], x6
232    st1       {v2.16b}, [x4], #16       // 16 bytes store
233    dup       v4.16b, w10
234    st1       {v2.16b}, [x4], x6        // 16 bytes store
235    ldrb      w11, [x0]
236    add       x0, x0, x1
237    st1       {v4.16b}, [x4], #16       // 16 bytes store
238    dup       v6.16b, w11
239    st1       {v4.16b}, [x4], x6        // 16 bytes store
240    ldrb      w8, [x0]
241    add       x0, x0, x1
242    st1       {v6.16b}, [x4], #16       // 16 bytes store
243    dup       v0.16b, w8
244    ldrb      w9, [x0]
245    add       x0, x0, x1
246    st1       {v6.16b}, [x4], x6        // 16 bytes store
247    ldrb      w10, [x0]
248    add       x0, x0, x1
249    st1       {v0.16b}, [x4], #16       // 16 bytes store
250    dup       v2.16b, w9
251    st1       {v0.16b}, [x4], x6        // 16 bytes store
252    ldrb      w11, [x0]
253    add       x0, x0, x1
254    st1       {v2.16b}, [x4], #16       // 16 bytes store
255    dup       v4.16b, w10
256    st1       {v2.16b}, [x4], x6        // 16 bytes store
257    st1       {v4.16b}, [x4], #16       // 16 bytes store
258    dup       v6.16b, w11
259    st1       {v4.16b}, [x4], x6        // 16 bytes store
260    subs      w2, w2, #8
261    st1       {v6.16b}, [x4], #16       // 16 bytes store
262    st1       {v6.16b}, [x4], x6        // 16 bytes store
263    bne       loop_32
264
265
266
267end_func:
268    // LDMFD sp!,{x4-x11,pc}                //Reload the registers from SP
269    ldp       x19, x20, [sp], #16
270    pop_v_regs
271    ret
272
273
274
275
276
277///**
278//*******************************************************************************
279//*
280//* @brief
281//*   Padding (chroma block) at the left of a 2d array
282//*
283//* @par Description:
284//*   The left column of a 2d array is replicated for pad_size times at the left
285//*
286//*
287//* @param[in] pu1_src
288//*  UWORD8 pointer to the source
289//*
290//* @param[in] src_strd
291//*  integer source stride
292//*
293//* @param[in] ht
294//*  integer height of the array
295//*
296//* @param[in] wd
297//*  integer width of the array (each colour component)
298//*
299//* @param[in] pad_size
300//*  integer -padding size of the array
301//*
302//* @param[in] ht
303//*  integer height of the array
304//*
305//* @param[in] wd
306//*  integer width of the array
307//*
308//* @returns
309//*
310//* @remarks
311//*  None
312//*
313//*******************************************************************************
314//*/
315//#if PAD_LEFT_CHROMA == C
316//void ih264_pad_left_chroma(UWORD8 *pu1_src,
317//                            WORD32 src_strd,
318//                            WORD32 ht,
319//                            WORD32 pad_size)
320//{
321//    x0 => *pu1_src
322//    w1 => src_strd
323//    w2 => ht
324//    w3 => pad_size
325
326
327
328    .global ih264_pad_left_chroma_av8
329
330ih264_pad_left_chroma_av8:
331
332    // STMFD sp!, {x4-x11, x14}                //stack stores the values of the arguments
333    push_v_regs
334    sxtw      x1, w1
335    sxtw      x3, w3
336    stp       x19, x20, [sp, #-16]!
337
338    sub       x4, x0, x3
339    sub       x6, x1, #16
340
341
342loop_32_l_c:                            //  /*hard coded for width=32  ,height =4,8,12*/
343    ldrh      w8, [x0]
344    add       x0, x0, x1
345    ldrh      w9, [x0]
346    add       x0, x0, x1
347    dup       v0.8h, w8
348    ldrh      w10, [x0]
349    add       x0, x0, x1
350    st1       {v0.16b}, [x4], #16       // 16 bytes store
351    dup       v2.8h, w9
352    st1       {v0.16b}, [x4], x6        // 16 bytes store
353    ldrh      w11, [x0]
354    add       x0, x0, x1
355    st1       {v2.16b}, [x4], #16       // 16 bytes store
356    dup       v4.8h, w10
357    st1       {v2.16b}, [x4], x6        // 16 bytes store
358    dup       v6.8h, w11
359    st1       {v4.16b}, [x4], #16       // 16 bytes store
360    st1       {v4.16b}, [x4], x6        // 16 bytes store
361    subs      w2, w2, #4
362    st1       {v6.16b}, [x4], #16       // 16 bytes store
363    st1       {v6.16b}, [x4], x6        // 16 bytes store
364
365
366    beq       end_func_l_c              ///* Branching when ht=4*/
367
368    ldrh      w8, [x0]
369    add       x0, x0, x1
370    ldrh      w9, [x0]
371    add       x0, x0, x1
372    dup       v0.8h, w8
373    ldrh      w10, [x0]
374    add       x0, x0, x1
375    st1       {v0.16b}, [x4], #16       // 16 bytes store
376    dup       v2.8h, w9
377    st1       {v0.16b}, [x4], x6
378    ldrh      w11, [x0]
379    add       x0, x0, x1
380    st1       {v2.16b}, [x4], #16       // 16 bytes store
381    dup       v4.8h, w10
382    st1       {v2.16b}, [x4], x6        // 16 bytes store
383    dup       v6.8h, w11
384    st1       {v4.16b}, [x4], #16       // 16 bytes store
385    st1       {v4.16b}, [x4], x6        // 16 bytes store
386    subs      w2, w2, #4
387    st1       {v6.16b}, [x4], #16       // 16 bytes store
388    st1       {v6.16b}, [x4], x6        // 16 bytes store
389
390    beq       end_func_l_c              ///* Branching when ht=8*/
391    bne       loop_32_l_c
392
393    ldrh      w8, [x0]
394    add       x0, x0, x1
395    ldrh      w9, [x0]
396    add       x0, x0, x1
397    dup       v0.8h, w8
398    ldrh      w10, [x0]
399    add       x0, x0, x1
400    st1       {v0.16b}, [x4], #16       // 16 bytes store
401    dup       v2.8h, w9
402    st1       {v0.16b}, [x4], x6
403    ldrh      w11, [x0]
404    add       x0, x0, x1
405    st1       {v2.16b}, [x4], #16       // 16 bytes store
406    dup       v4.8h, w10
407    st1       {v2.16b}, [x4], x6        // 16 bytes store
408    dup       v6.8h, w11
409    st1       {v4.16b}, [x4], #16       // 16 bytes store
410    st1       {v4.16b}, [x4], x6        // 16 bytes store
411    st1       {v6.16b}, [x4], #16       // 16 bytes store
412    st1       {v6.16b}, [x4], x6        // 16 bytes store
413
414end_func_l_c:
415    // LDMFD sp!,{x4-x11,pc}                //Reload the registers from SP
416    ldp       x19, x20, [sp], #16
417    pop_v_regs
418    ret
419
420
421
422
423
424///**
425//*******************************************************************************
426//*
427//* @brief
428//* Padding (luma block) at the right of a 2d array
429//*
430//* @par Description:
431//* The right column of a 2d array is replicated for pad_size times at the right
432//*
433//*
434//* @param[in] pu1_src
435//*  UWORD8 pointer to the source
436//*
437//* @param[in] src_strd
438//*  integer source stride
439//*
440//* @param[in] ht
441//*  integer height of the array
442//*
443//* @param[in] wd
444//*  integer width of the array
445//*
446//* @param[in] pad_size
447//*  integer -padding size of the array
448//*
449//* @param[in] ht
450//*  integer height of the array
451//*
452//* @param[in] wd
453//*  integer width of the array
454//*
455//* @returns
456//*
457//* @remarks
458//*  None
459//*
460//*******************************************************************************
461//*/
462//#if PAD_RIGHT_LUMA == C
463//void ih264_pad_right_luma(UWORD8 *pu1_src,
464//                        WORD32 src_strd,
465//                        WORD32 ht,
466//                        WORD32 pad_size)
467//{
468//    WORD32 row;
469//
470//    for(row = 0; row < ht; row++)
471//    {
472//        memset(pu1_src, *(pu1_src -1), pad_size);
473//
474//        pu1_src += src_strd;
475//    }
476//}
477//
478//    x0 => *pu1_src
479//    w1 => src_strd
480//    w2 => ht
481//    w3 => pad_size
482
483
484
485    .global ih264_pad_right_luma_av8
486
487ih264_pad_right_luma_av8:
488
489    // STMFD sp!, {x4-x11, x14}                //stack stores the values of the arguments
490    push_v_regs
491    sxtw      x1, w1
492    sxtw      x3, w3
493    stp       x19, x20, [sp, #-16]!
494
495    mov       x4, x0
496    sub       x6, x1, #16
497    sub       x0, x0, #1
498    subs      x5, x3, #16
499    bne       loop_32
500loop_16_r: //  /*hard coded for width=16  ,height =8,16*/
501    ldrb      w8, [x0]
502    add       x0, x0, x1
503    ldrb      w9, [x0]
504    add       x0, x0, x1
505    dup       v0.16b, w8
506    ldrb      w10, [x0]
507    add       x0, x0, x1
508    st1       {v0.16b}, [x4], x1        // 16 bytes store
509    dup       v2.16b, w9
510    st1       {v2.16b}, [x4], x1        // 16 bytes store
511    ldrb      w11, [x0]
512    add       x0, x0, x1
513    dup       v4.16b, w10
514    dup       v6.16b, w11
515    st1       {v4.16b}, [x4], x1        // 16 bytes store
516    ldrb      w8, [x0]
517    add       x0, x0, x1
518    st1       {v6.16b}, [x4], x1        // 16 bytes store
519    ldrb      w9, [x0]
520    add       x0, x0, x1
521    dup       v0.16b, w8
522    ldrb      w10, [x0]
523    add       x0, x0, x1
524    st1       {v0.16b}, [x4], x1        // 16 bytes store
525    dup       v2.16b, w9
526    ldrb      w11, [x0]
527    add       x0, x0, x1
528    st1       {v2.16b}, [x4], x1        // 16 bytes store
529    dup       v4.16b, w10
530    dup       v6.16b, w11
531    subs      w2, w2, #8
532    st1       {v4.16b}, [x4], x1        // 16 bytes store
533    st1       {v6.16b}, [x4], x1        // 16 bytes store
534    bne       loop_16_r
535    b         end_func_r
536
537loop_32_r:                              //  /*hard coded for width=32  ,height =8,16*/
538    ldrb      w8, [x0]
539    add       x0, x0, x1
540    ldrb      w9, [x0]
541    add       x0, x0, x1
542    dup       v0.16b, w8
543    ldrb      w10, [x0]
544    add       x0, x0, x1
545    st1       {v0.16b}, [x4], #16       // 16 bytes store
546    dup       v2.16b, w9
547    st1       {v0.16b}, [x4], x6
548    st1       {v2.16b}, [x4], #16       // 16 bytes store
549    dup       v4.16b, w10
550    st1       {v2.16b}, [x4], x6        // 16 bytes store
551    ldrb      w11, [x0]
552    add       x0, x0, x1
553    st1       {v4.16b}, [x4], #16       // 16 bytes store
554    dup       v6.16b, w11
555    st1       {v4.16b}, [x4], x6        // 16 bytes store
556    ldrb      w8, [x0]
557    add       x0, x0, x1
558    st1       {v6.16b}, [x4], #16       // 16 bytes store
559    ldrb      w9, [x0]
560    add       x0, x0, x1
561    dup       v0.16b, w8
562    st1       {v6.16b}, [x4], x6        // 16 bytes store
563    ldrb      w10, [x0]
564    add       x0, x0, x1
565    st1       {v0.16b}, [x4], #16       // 16 bytes store
566    dup       v2.16b, w9
567    st1       {v0.16b}, [x4], x6        // 16 bytes store
568    ldrb      w11, [x0]
569    add       x0, x0, x1
570    st1       {v2.16b}, [x4], #16       // 16 bytes store
571    dup       v4.16b, w10
572    st1       {v2.16b}, [x4], x6        // 16 bytes store
573    st1       {v4.16b}, [x4], #16       // 16 bytes store
574    dup       v6.16b, w11
575    st1       {v4.16b}, [x4], x6        // 16 bytes store
576    subs      w2, w2, #8
577    st1       {v6.16b}, [x4], #16       // 16 bytes store
578    st1       {v6.16b}, [x4], x6        // 16 bytes store
579    bne       loop_32_r
580
581
582
583end_func_r:
584    // LDMFD sp!,{x4-x11,pc}                //Reload the registers from SP
585    ldp       x19, x20, [sp], #16
586    pop_v_regs
587    ret
588
589
590
591
592
593///**
594//*******************************************************************************
595//*
596//* @brief
//* Padding (chroma block) at the right of a 2d array
598//*
599//* @par Description:
600//* The right column of a 2d array is replicated for pad_size times at the right
601//*
602//*
603//* @param[in] pu1_src
//*  UWORD8 pointer to the source
605//*
606//* @param[in] src_strd
607//*  integer source stride
608//*
609//* @param[in] ht
//*  integer height of the array
611//*
612//* @param[in] wd
613//*  integer width of the array (each colour component)
614//*
615//* @param[in] pad_size
616//*  integer -padding size of the array
617//*
618//* @param[in] ht
//*  integer height of the array
620//*
621//* @param[in] wd
622//*  integer width of the array
623//*
624//* @returns
625//*
626//* @remarks
627//*  None
628//*
629//*******************************************************************************
630//*/
631//#if PAD_RIGHT_CHROMA == C
632//void ih264_pad_right_chroma(UWORD8 *pu1_src,
633//                        WORD32 src_strd,
634//                        WORD32 ht,
635//                        WORD32 pad_size)
636//    x0 => *pu1_src
637//    w1 => src_strd
638//    w2 => ht
639//    w3 => pad_size
640
641
642
643    .global ih264_pad_right_chroma_av8
644
645ih264_pad_right_chroma_av8:
646
647    // STMFD sp!, {x4-x11, x14}                //stack stores the values of the arguments
648    push_v_regs
649    sxtw      x1, w1
650    sxtw      x3, w3
651    stp       x19, x20, [sp, #-16]!
652
653    mov       x4, x0
654    sub       x6, x1, #16
655    sub       x0, x0, #2
656loop_32_r_c: //  /*hard coded for width=32 ,height =8,4*/
657    ldrh      w8, [x0]
658    add       x0, x0, x1
659    ldrh      w9, [x0]
660    add       x0, x0, x1
661    dup       v0.8h, w8
662    ldrh      w10, [x0]
663    add       x0, x0, x1
664    st1       {v0.16b}, [x4], #16       // 16 bytes store
665    dup       v2.8h, w9
666    st1       {v0.16b}, [x4], x6
667    st1       {v2.16b}, [x4], #16       // 16 bytes store
668    dup       v4.8h, w10
669    st1       {v2.16b}, [x4], x6        // 16 bytes store
670    subs      w2, w2, #4
671    ldrh      w11, [x0]
672    add       x0, x0, x1
673    st1       {v4.16b}, [x4], #16       // 16 bytes store
674    dup       v6.8h, w11
675    st1       {v4.16b}, [x4], x6        // 16 bytes store
676    st1       {v6.16b}, [x4], #16       // 16 bytes store
677    st1       {v6.16b}, [x4], x6        // 16 bytes store
678
679    beq       end_func_r_c              ///* Branching when ht=4*/
680
681    ldrh      w8, [x0]
682    add       x0, x0, x1
683    dup       v0.8h, w8
684    ldrh      w9, [x0]
685    add       x0, x0, x1
686    ldrh      w10, [x0]
687    add       x0, x0, x1
688    st1       {v0.16b}, [x4], #16       // 16 bytes store
689    dup       v2.8h, w9
690    st1       {v0.16b}, [x4], x6        // 16 bytes store
691    ldrh      w11, [x0]
692    add       x0, x0, x1
693    st1       {v2.16b}, [x4], #16       // 16 bytes store
694    dup       v4.8h, w10
695    st1       {v2.16b}, [x4], x6        // 16 bytes store
696    st1       {v4.16b}, [x4], #16       // 16 bytes store
697    dup       v6.8h, w11
698    st1       {v4.16b}, [x4], x6        // 16 bytes store
699    subs      w2, w2, #4
700    st1       {v6.16b}, [x4], #16       // 16 bytes store
701    st1       {v6.16b}, [x4], x6        // 16 bytes store
702
703    beq       end_func_r_c              ///* Branching when ht=8*/
704    bne       loop_32_r_c
705    ldrh      w8, [x0]
706    add       x0, x0, x1
707    dup       v0.8h, w8
708    ldrh      w9, [x0]
709    add       x0, x0, x1
710    ldrh      w10, [x0]
711    add       x0, x0, x1
712    st1       {v0.16b}, [x4], #16       // 16 bytes store
713    dup       v2.8h, w9
714    st1       {v0.16b}, [x4], x6        // 16 bytes store
715    ldrh      w11, [x0]
716    add       x0, x0, x1
717    st1       {v2.16b}, [x4], #16       // 16 bytes store
718    dup       v4.8h, w10
719    st1       {v2.16b}, [x4], x6        // 16 bytes store
720    st1       {v4.16b}, [x4], #16       // 16 bytes store
721    dup       v6.8h, w11
722    st1       {v4.16b}, [x4], x6        // 16 bytes store
723    st1       {v6.16b}, [x4], #16       // 16 bytes store
724    st1       {v6.16b}, [x4], x6        // 16 bytes store
725
726end_func_r_c:
727    // LDMFD sp!,{x4-x11,pc}                //Reload the registers from SP
728    ldp       x19, x20, [sp], #16
729    pop_v_regs
730    ret
731
732
733
734
735
736
737