1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19// *******************************************************************************
20// * //file
21// *  ihevc_padding_neon.s
22// *
23// * //brief
24// *  contains function definitions padding
25// *
26// * //author
27// *     naveen sr
28// *
// * //par list of functions:
// *  - ihevc_pad_left_luma()
// *  - ihevc_pad_left_chroma()
// *  - ihevc_pad_right_luma()
// *  - ihevc_pad_right_chroma()
32// *
33// * //remarks
34// *  none
35// *
36// *******************************************************************************
37//*/
38
39///**
40//*******************************************************************************
41//*
42//* //brief
43//*   padding (luma block) at the left of a 2d array
44//*
45//* //par description:
46//*   the left column of a 2d array is replicated for pad_size times at the left
47//*
48//*
49//* //param[in] pu1_src
50//*  uword8 pointer to the source
51//*
52//* //param[in] src_strd
53//*  integer source stride
54//*
55//* //param[in] ht
56//*  integer height of the array
57//*
//* //param[in] pad_size
//*  integer padding size of the array; the stores begin pad_size bytes to
//*  the left of pu1_src (the assembly below always writes 80 bytes per row)
//*
//* (the width wd is not an argument of this function; ht is described above)
69//*
70//* //returns
71//*
72//* //remarks
73//*  none
74//*
75//*******************************************************************************
76//*/
77//.if pad_left_luma == c
78//void ihevc_pad_left_luma(uword8 *pu1_src,
79//                        word32 src_strd,
80//                        word32 ht,
81//                        word32 pad_size)
82//**************variables vs registers*************************
83//    x0 => *pu1_src
84//    x1 => src_strd
85//    x2 => ht
86//    x3 => pad_size
87
88.text
89.align 4
90
91.globl ihevc_pad_left_luma_av8
92
93.type ihevc_pad_left_luma_av8, %function
94
//-----------------------------------------------------------------------------
// void ihevc_pad_left_luma_av8(UWORD8 *pu1_src, WORD32 src_strd,
//                              WORD32 ht, WORD32 pad_size)
//
// For each of ht rows, replicates the byte at pu1_src[0] into the 80 bytes
// that start at (pu1_src - pad_size), then steps to the next row.
//
// In:    x0 = pu1_src   (left-most pixel of the first row)
//        w1 = src_strd  (byte distance between rows)
//        w2 = ht        (number of rows; ht <= 0 stores nothing)
//        w3 = pad_size  (only locates the store address; the store length
//                        is fixed at 5 * 16 = 80 bytes)
// Clobb: x0-x11, v0, v2, v4, v6, flags (no stack use)
//-----------------------------------------------------------------------------
ihevc_pad_left_luma_av8:
    // src_strd, ht and pad_size are 32-bit C ints. AAPCS64 leaves bits [63:32]
    // of their registers unspecified, so widen the two values that take part
    // in 64-bit address arithmetic and count rows in w2 only.
    sxtw        x1,w1                       // x1 = (int64)src_strd
    sxtw        x3,w3                       // x3 = (int64)pad_size

    cmp         w2,#4
    b.lt        .Lpad_left_luma_tail        // fewer than 4 rows left

loop_start_luma_left:
    // invariant: w2 >= 4 rows remain, x0 = source pixel of the next row
    sub         x4,x0,x3                    // x4 = row 0 destination
    add         x5,x4,x1                    // x5 = row 1 destination
    add         x6,x5,x1                    // x6 = row 2 destination
    add         x7,x6,x1                    // x7 = row 3 destination

    ldrb        w8,[x0]                     // fetch the edge pixel of 4 rows
    add         x0,x0,x1
    ldrb        w9,[x0]
    add         x0,x0,x1
    ldrb        w10,[x0]
    add         x0,x0,x1
    ldrb        w11,[x0]
    add         x0,x0,x1

    dup         v0.16b,w8                   // broadcast each pixel to 16 lanes
    dup         v2.16b,w9
    dup         v4.16b,w10
    dup         v6.16b,w11

    st1         {v0.16b},[x4],#16           // row 0: 5 * 16 = 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    st1         {v2.16b},[x5],#16           // row 1: 80 bytes
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5]

    st1         {v4.16b},[x6],#16           // row 2: 80 bytes
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6]

    sub         w2,w2,#4                    // 4 rows consumed

    st1         {v6.16b},[x7],#16           // row 3: 80 bytes
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7]

    cmp         w2,#4
    b.ge        loop_start_luma_left

.Lpad_left_luma_tail:
    // 0..3 rows remain when ht is not a multiple of 4 (or ht <= 0 on entry)
    cmp         w2,#0
    b.le        .Lpad_left_luma_done

.Lpad_left_luma_row:
    sub         x4,x0,x3                    // destination of this row
    ldrb        w8,[x0]
    add         x0,x0,x1
    dup         v0.16b,w8

    st1         {v0.16b},[x4],#16           // 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    subs        w2,w2,#1
    b.gt        .Lpad_left_luma_row

.Lpad_left_luma_done:
    ret
152
153
154
155
156
157///**
158//*******************************************************************************
159//*
160//* //brief
161//*   padding (chroma block) at the left of a 2d array
162//*
163//* //par description:
164//*   the left column of a 2d array is replicated for pad_size times at the left
165//*
166//*
167//* //param[in] pu1_src
168//*  uword8 pointer to the source
169//*
170//* //param[in] src_strd
171//*  integer source stride
172//*
173//* //param[in] ht
174//*  integer height of the array
175//*
//* //param[in] pad_size
//*  integer padding size of the array; the stores begin pad_size bytes to
//*  the left of pu1_src (the assembly below always writes 80 bytes per row)
//*
//* (the width wd is not an argument of this function; ht is described above)
187//*
188//* //returns
189//*
190//* //remarks
191//*  none
192//*
193//*******************************************************************************
194//*/
195//.if pad_left_chroma == c
196//void ihevc_pad_left_chroma(uword8 *pu1_src,
197//                            word32 src_strd,
198//                            word32 ht,
199//                            word32 pad_size)
200//{
201//    x0 => *pu1_src
202//    x1 => src_strd
203//    x2 => ht
204//    x3 => pad_size
205
206
207
208.globl ihevc_pad_left_chroma_av8
209
210.type ihevc_pad_left_chroma_av8, %function
211
//-----------------------------------------------------------------------------
// void ihevc_pad_left_chroma_av8(UWORD8 *pu1_src, WORD32 src_strd,
//                                WORD32 ht, WORD32 pad_size)
//
// For each of ht rows, replicates the 2-byte pair at pu1_src[0..1] across
// the 80 bytes that start at (pu1_src - pad_size), then steps to the next row.
//
// In:    x0 = pu1_src   (left-most 2-byte pair of the first row)
//        w1 = src_strd  (byte distance between rows)
//        w2 = ht        (number of rows; ht <= 0 stores nothing)
//        w3 = pad_size  (only locates the store address; the store length
//                        is fixed at 5 * 16 = 80 bytes)
// Clobb: x0-x11, v0, v2, v4, v6, flags (no stack use)
//-----------------------------------------------------------------------------
ihevc_pad_left_chroma_av8:
    // src_strd, ht and pad_size are 32-bit C ints. AAPCS64 leaves bits [63:32]
    // of their registers unspecified, so widen the two values that take part
    // in 64-bit address arithmetic and count rows in w2 only.
    sxtw        x1,w1                       // x1 = (int64)src_strd
    sxtw        x3,w3                       // x3 = (int64)pad_size

    cmp         w2,#4
    b.lt        .Lpad_left_chroma_tail      // fewer than 4 rows left

loop_start_chroma_left:
    // invariant: w2 >= 4 rows remain, x0 = source pair of the next row
    sub         x4,x0,x3                    // x4 = row 0 destination
    add         x5,x4,x1                    // x5 = row 1 destination
    add         x6,x5,x1                    // x6 = row 2 destination
    add         x7,x6,x1                    // x7 = row 3 destination

    ldrh        w8,[x0]                     // fetch the edge pair of 4 rows
    add         x0,x0,x1
    ldrh        w9,[x0]
    add         x0,x0,x1
    ldrh        w10,[x0]
    add         x0,x0,x1
    ldrh        w11,[x0]
    add         x0,x0,x1

    dup         v0.8h,w8                    // broadcast each pair to 8 lanes
    dup         v2.8h,w9
    dup         v4.8h,w10
    dup         v6.8h,w11

    st1         {v0.16b},[x4],#16           // row 0: 5 * 16 = 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    st1         {v2.16b},[x5],#16           // row 1: 80 bytes
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5]

    st1         {v4.16b},[x6],#16           // row 2: 80 bytes
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6]

    sub         w2,w2,#4                    // 4 rows consumed

    st1         {v6.16b},[x7],#16           // row 3: 80 bytes
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7]

    cmp         w2,#4
    b.ge        loop_start_chroma_left

.Lpad_left_chroma_tail:
    // 0..3 rows remain when ht is not a multiple of 4 (or ht <= 0 on entry)
    cmp         w2,#0
    b.le        .Lpad_left_chroma_done

.Lpad_left_chroma_row:
    sub         x4,x0,x3                    // destination of this row
    ldrh        w8,[x0]
    add         x0,x0,x1
    dup         v0.8h,w8

    st1         {v0.16b},[x4],#16           // 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    subs        w2,w2,#1
    b.gt        .Lpad_left_chroma_row

.Lpad_left_chroma_done:
    ret
270
271
272
273
274
275///**
276//*******************************************************************************
277//*
278//* //brief
279//* padding (luma block) at the right of a 2d array
280//*
281//* //par description:
282//* the right column of a 2d array is replicated for pad_size times at the right
283//*
284//*
285//* //param[in] pu1_src
286//*  uword8 pointer to the source
287//*
288//* //param[in] src_strd
289//*  integer source stride
290//*
291//* //param[in] ht
292//*  integer height of the array
293//*
//* //param[in] pad_size
//*  integer padding size of the array (the assembly below does not read it
//*  and always writes 80 bytes per row starting at pu1_src)
//*
//* (the width wd is not an argument of this function; ht is described above)
305//*
306//* //returns
307//*
308//* //remarks
309//*  none
310//*
311//*******************************************************************************
312//*/
313//.if pad_right_luma == c
314//void ihevc_pad_right_luma(uword8 *pu1_src,
315//                        word32 src_strd,
316//                        word32 ht,
317//                        word32 pad_size)
318//{
319//    word32 row//
320//
321//    for(row = 0// row < ht// row++)
322//    {
323//        memset(pu1_src, *(pu1_src -1), pad_size)//
324//
325//        pu1_src += src_strd//
326//    }
327//}
328//
329//    x0 => *pu1_src
330//    x1 => src_strd
331//    x2 => ht
332//    x3 => pad_size
333
334
335
336.globl ihevc_pad_right_luma_av8
337
338.type ihevc_pad_right_luma_av8, %function
339
//-----------------------------------------------------------------------------
// void ihevc_pad_right_luma_av8(UWORD8 *pu1_src, WORD32 src_strd,
//                               WORD32 ht, WORD32 pad_size)
//
// For each of ht rows, replicates the byte at pu1_src[-1] into the 80 bytes
// that start at pu1_src, then steps to the next row.
//
// In:    x0 = pu1_src   (first byte to the right of the first row)
//        w1 = src_strd  (byte distance between rows)
//        w2 = ht        (number of rows; ht <= 0 stores nothing)
//        w3 = pad_size  (not read; the store length is fixed at
//                        5 * 16 = 80 bytes)
// Clobb: x0-x2, x4-x11, v0, v2, v4, v6, flags (no stack use)
//-----------------------------------------------------------------------------
ihevc_pad_right_luma_av8:
    // src_strd and ht are 32-bit C ints. AAPCS64 leaves bits [63:32] of their
    // registers unspecified, so widen the stride before it is added to a
    // pointer and count rows in w2 only.
    sxtw        x1,w1                       // x1 = (int64)src_strd

    cmp         w2,#4
    b.lt        .Lpad_right_luma_tail       // fewer than 4 rows left

loop_start_luma_right:
    // invariant: w2 >= 4 rows remain, x0 = pad start of the next row
    mov         x4,x0                       // x4 = row 0 destination
    add         x5,x4,x1                    // x5 = row 1 destination
    add         x6,x5,x1                    // x6 = row 2 destination
    add         x7,x6,x1                    // x7 = row 3 destination

    ldrb        w8,[x0, #-1]                // last pixel of each of 4 rows
    add         x0,x0,x1
    ldrb        w9,[x0, #-1]
    add         x0,x0,x1
    ldrb        w10,[x0, #-1]
    add         x0,x0,x1
    ldrb        w11,[x0, #-1]
    add         x0,x0,x1

    dup         v0.16b,w8                   // broadcast each pixel to 16 lanes
    dup         v2.16b,w9
    dup         v4.16b,w10
    dup         v6.16b,w11

    st1         {v0.16b},[x4],#16           // row 0: 5 * 16 = 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    st1         {v2.16b},[x5],#16           // row 1: 80 bytes
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5]

    sub         w2,w2,#4                    // 4 rows consumed

    st1         {v4.16b},[x6],#16           // row 2: 80 bytes
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6]

    st1         {v6.16b},[x7],#16           // row 3: 80 bytes
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7]

    cmp         w2,#4
    b.ge        loop_start_luma_right

.Lpad_right_luma_tail:
    // 0..3 rows remain when ht is not a multiple of 4 (or ht <= 0 on entry)
    cmp         w2,#0
    b.le        .Lpad_right_luma_done

.Lpad_right_luma_row:
    mov         x4,x0                       // destination of this row
    ldrb        w8,[x0, #-1]
    add         x0,x0,x1
    dup         v0.16b,w8

    st1         {v0.16b},[x4],#16           // 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    subs        w2,w2,#1
    b.gt        .Lpad_right_luma_row

.Lpad_right_luma_done:
    ret
399
400
401
402
403
404///**
405//*******************************************************************************
406//*
407//* //brief
408////* padding (chroma block) at the right of a 2d array
409//*
410//* //par description:
411//* the right column of a 2d array is replicated for pad_size times at the right
412//*
413//*
414//* //param[in] pu1_src
415////*  uword8 pointer to the source
416//*
417//* //param[in] src_strd
418//*  integer source stride
419//*
420//* //param[in] ht
421////*  integer height of the array
422//*
//* //param[in] pad_size
//*  integer padding size of the array (the assembly below does not read it
//*  and always writes 80 bytes per row starting at pu1_src)
//*
//* (the width wd is not an argument of this function; ht is described above)
434//*
435//* //returns
436//*
437//* //remarks
438//*  none
439//*
440//*******************************************************************************
441//*/
442//.if pad_right_chroma == c
443//void ihevc_pad_right_chroma(uword8 *pu1_src,
444//                        word32 src_strd,
445//                        word32 ht,
446//                        word32 pad_size)
447//    x0 => *pu1_src
448//    x1 => src_strd
449//    x2 => ht
450//    x3 => pad_size
451
452
453
454.globl ihevc_pad_right_chroma_av8
455
456.type ihevc_pad_right_chroma_av8, %function
457
//-----------------------------------------------------------------------------
// void ihevc_pad_right_chroma_av8(UWORD8 *pu1_src, WORD32 src_strd,
//                                 WORD32 ht, WORD32 pad_size)
//
// For each of ht rows, replicates the 2-byte pair at pu1_src[-2..-1] across
// the 80 bytes that start at pu1_src, then steps to the next row.
//
// In:    x0 = pu1_src   (first byte to the right of the first row)
//        w1 = src_strd  (byte distance between rows)
//        w2 = ht        (number of rows; ht <= 0 stores nothing)
//        w3 = pad_size  (not read; the store length is fixed at
//                        5 * 16 = 80 bytes)
// Clobb: x0-x2, x4-x11, v0, v2, v4, v6, flags (no stack use)
//-----------------------------------------------------------------------------
ihevc_pad_right_chroma_av8:
    // src_strd and ht are 32-bit C ints. AAPCS64 leaves bits [63:32] of their
    // registers unspecified, so widen the stride before it is added to a
    // pointer and count rows in w2 only.
    sxtw        x1,w1                       // x1 = (int64)src_strd

    cmp         w2,#4
    b.lt        .Lpad_right_chroma_tail     // fewer than 4 rows left

loop_start_chroma_right:
    // invariant: w2 >= 4 rows remain, x0 = pad start of the next row
    mov         x4,x0                       // x4 = row 0 destination
    add         x5,x4,x1                    // x5 = row 1 destination
    add         x6,x5,x1                    // x6 = row 2 destination
    add         x7,x6,x1                    // x7 = row 3 destination

    ldrh        w8,[x0, #-2]                // last pair of each of 4 rows
    add         x0,x0,x1
    ldrh        w9,[x0, #-2]
    add         x0,x0,x1
    ldrh        w10,[x0, #-2]
    add         x0,x0,x1
    ldrh        w11,[x0, #-2]
    add         x0,x0,x1

    dup         v0.8h,w8                    // broadcast each pair to 8 lanes
    dup         v2.8h,w9
    dup         v4.8h,w10
    dup         v6.8h,w11

    st1         {v0.16b},[x4],#16           // row 0: 5 * 16 = 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    st1         {v2.16b},[x5],#16           // row 1: 80 bytes
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5]

    st1         {v4.16b},[x6],#16           // row 2: 80 bytes
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6]

    sub         w2,w2,#4                    // 4 rows consumed

    st1         {v6.16b},[x7],#16           // row 3: 80 bytes
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7]

    cmp         w2,#4
    b.ge        loop_start_chroma_right

.Lpad_right_chroma_tail:
    // 0..3 rows remain when ht is not a multiple of 4 (or ht <= 0 on entry)
    cmp         w2,#0
    b.le        .Lpad_right_chroma_done

.Lpad_right_chroma_row:
    mov         x4,x0                       // destination of this row
    ldrh        w8,[x0, #-2]
    add         x0,x0,x1
    dup         v0.8h,w8

    st1         {v0.16b},[x4],#16           // 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    subs        w2,w2,#1
    b.gt        .Lpad_right_chroma_row

.Lpad_right_chroma_done:
    ret
516
517
518
519
520
521
522
523
524