///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_filters_dc.s
//*
//* @brief
//*  contains the function definition for intra prediction dc filtering.
//*  the function is hand-written aarch64 (armv8) neon assembly in gnu
//*  assembler syntax.
//*
//* @author
//*  akshaya mukund
//*
//* @par list of functions:
//*  - ihevc_intra_pred_luma_dc_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*    luma intra prediction filter for dc input
//*
//* @par description:
//*    fills the nt x nt destination block with the dc value computed from the
//*    reference samples. for nt smaller than 32 the first row and the first
//*    column of the block are additionally smoothed against the neighbouring
//*    reference samples.
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] pi1_coeff
//*  word8 pointer to the planar coefficients
//*  (note: not present in the function signature given below)
//*
//* @param[in] nt
//*  size of transform block
//*
//* @param[in] mode
//*  type of filtering
//*
//* @returns
//*  none
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
76
//void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
//                              word32 src_strd,
//                              uword8 *pu1_dst,
//                              word32 dst_strd,
//                              word32 nt,
//                              word32 mode)
//
//**************variables vs registers*****************************************
//x0 => *pu1_ref
//x1 => src_strd
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode
//
//all six arguments arrive in registers; nothing is read from the stack
94
.text
.align 4
.include "ihevc_neon_macros.s"


.globl ihevc_intra_pred_luma_dc_av8

.type ihevc_intra_pred_luma_dc_av8, %function

//------------------------------------------------------------------------------
// ihevc_intra_pred_luma_dc_av8
//
// Target : AArch64 (ARMv8-A) with Advanced SIMD, GNU assembler syntax.
//
// In     : x0 = pu1_ref   reference sample array (bytes), indexed as src[] below
//          w1 = src_strd  not read
//          x2 = pu1_dst   destination block
//          w3 = dst_strd  destination stride in bytes (sign-extended on entry)
//          w4 = nt        block size; code paths exist for 4, 8, 16 and 32
//          w5 = mode      not read (w5 is reused as scratch)
// Out    : nothing is returned; nt x nt bytes are written at pu1_dst.
// Clobb  : x0, x3-x12, x14, v0-v7, v16-v30, flags. x2 is advanced.
//          No callee-saved register and no stack is used.
//
// Computation:
//   dc        = (sum(src[nt .. 2nt-1]) + sum(src[2nt+1 .. 3nt]) + nt)
//                                                        >> (log2(nt) + 1)
//   nt == 32 : every output byte is dc.
//   nt  < 32 : dst[0][0] = (src[2nt-1] + 2*dc + src[2nt+1] + 2) >> 2
//              dst[0][c] = (src[2nt+1+c] + 3*dc + 2) >> 2        for c >= 1
//              dst[r][0] = (src[2nt-1-r] + 3*dc + 2) >> 2        for r >= 1
//              every other byte is dc.
//
// The 8-byte vector loads read a few bytes past the samples that are actually
// consumed (marked "extra" below), so src[] must be readable beyond src[3nt].
//------------------------------------------------------------------------------
ihevc_intra_pred_luma_dc_av8:

    // dst_strd and nt are 32-bit arguments: AAPCS64 leaves bits 63:32 of
    // x3/x4 unspecified, but both are used in 64-bit address arithmetic below.
    sxtw        x3, w3
    sxtw        x4, w4

    mov         x11, #2                     //rounding constant 2
    mov         x9, #0
    mov         v17.s[0], w11
    mov         v17.s[1], w9                //d17 = 2 (added to 3*dc later)

    clz         w5, w4                      //leading zero count of nt

    add         x6, x0, x4                  //&src[nt]
    sub         w5, w5, #32                 //clz - 32 = -(log2nt + 1)
    add         x7, x0, x4, lsl #1          //&src[2nt]

    add         x8, x7, #1                  //&src[2nt+1]
    dup         v7.2s, w5

    ldrb        w14, [x8]                   //src[2nt+1] (zero-extended)
    shl         d7, d7, #32

    sub         x9, x7, #1                  //&src[2nt-1]
    sshr        d7, d7, #32                 //d7 = -(log2nt + 1) as a signed 64-bit shift count

    mov         x7, x8                      //x7 keeps &src[2nt+1]

    ldrb        w12, [x9]                   //src[2nt-1] (zero-extended)
    add         x14, x14, x12               //src[2nt+1] + src[2nt-1]
    add         x14, x14, x11               //src[2nt+1] + src[2nt-1] + 2

    cmp         x4, #4
    beq         dc_4

    mov         x10, x4                     //bytes still to be summed

    //--------------------------------------------------------------------------
    // Summation for nt = 8, 16, 32: 8 bytes from each neighbour run per step.
    //--------------------------------------------------------------------------
add_loop:
    ld1         {v0.8b},[x6],#8             //load from src[nt]
    mov         x5, #0                      //zero for the upper word of d6 / d28
    ld1         {v1.8b},[x8],#8             //load from src[2nt+1]

    uaddlp      v2.4h,  v0.8b

    mov         v6.s[0], w4
    mov         v6.s[1], w5                 //d6 = nt (rounding term of the average)
    uaddlp      v3.4h,  v1.8b

    ld1         {v0.8b},[x6],#8             //next 8 from src[nt] (extra when nt == 8)

    ld1         {v1.8b},[x8],#8             //next 8 from src[2nt+1] (extra when nt == 8)
    add         v4.4h,  v2.4h ,  v3.4h

    uaddlp      v5.2s,  v4.4h

    uadalp      v6.1d,  v5.2s               //accumulate into d6 (complete for nt == 8)

    subs        x10, x10,#8
    beq         epil_add_loop

core_loop_add:
    uaddlp      v2.4h,  v0.8b
    subs        x10, x10,#8                 //flags consumed by the bne at the loop end
    uaddlp      v3.4h,  v1.8b

    add         v4.4h,  v2.4h ,  v3.4h
    ld1         {v0.8b},[x6],#8             //next 8 from src[nt] (extra on the last pass)

    uaddlp      v5.2s,  v4.4h
    ld1         {v1.8b},[x8],#8             //next 8 from src[2nt+1] (extra on the last pass)

    uadalp      v6.1d,  v5.2s               //accumulate into d6
    bne         core_loop_add

epil_add_loop:

    sshl        d18, d6, d7                 //dc_val = d6 >> (log2nt + 1)
    cmp         x4, #32                     //flags consumed by the beq below

    mov         v28.s[0], w14
    mov         v28.s[1], w5                //d28 = src[2nt+1] + src[2nt-1] + 2

    dup         v16.8b, v18.b[0]            //dc_val in every byte lane
    shl         d25, d18,#1                 //2*dc

    beq         prologue_cpy_32             //nt == 32: plain fill, no edge smoothing

    add         d27,  d25 ,  d28            //src[2nt+1] + src[2nt-1] + 2 + 2*dc

    ushr        v29.4h, v27.4h,#2           //dst[0][0] value in v29.b[0]
    mov         x10, x4                     //column counter = nt

    add         d23,  d25 ,  d18            //3*dc
    sub         x12, x3, x3, lsl #3         //-7*strd

    add         d23,  d23 ,  d17            //3*dc + 2
    add         x12, x12, #8                //from row 7 back to row 0 of the next 8 columns

    dup         v24.8h, v23.h[0]            //3*dc + 2 in every halfword lane
    sub         x0, x3, x4                  //strd - nt

    //--------------------------------------------------------------------------
    // nt = 8 / 16: columns 0..7 of rows 0..7, with smoothed first row/column.
    // x8 walks src[2nt+1+col], x9 walks src[2nt-1-row].
    //--------------------------------------------------------------------------
prologue_col:

    mov         x8, x7                      //&src[2nt+1]

    add         x0, x0, #8                  //strd - nt + 8
    ld1         {v0.8b},[x8],#8             //src[2nt+1 .. 2nt+8]  (columns 0..7)
    sub         x9, x9, #7                  //&src[2nt-8]

    ld1         {v1.8b},[x9]                //src[2nt-8 .. 2nt-1]  (rows 7..0)
    sub         x9, x9, #8                  //&src[2nt-16]

    uxtl        v20.8h, v0.8b

    ld1         {v6.8b},[x8]                //src[2nt+9 .. 2nt+16] (columns 8..15, extra when nt == 8)
    add         v20.8h,  v20.8h ,  v24.8h   //columns 0..7 + 3dc + 2

    uxtl        v22.8h, v1.8b
    sqshrun     v2.8b, v20.8h,#2            //>> 2 and narrow: smoothed first row

    uxtl        v26.8h, v6.8b
    add         v22.8h,  v22.8h ,  v24.8h   //rows 7..0 + 3dc + 2

    movi        d19, #0x00000000000000ff    //select mask: byte 0 only
    sqshrun     v3.8b, v22.8h,#2            //>> 2 and narrow: smoothed first column

    bsl         v19.8b,  v29.8b ,  v2.8b    //row 0: byte 0 = dst[0][0], bytes 1..7 from v2
    add         v26.8h,  v26.8h ,  v24.8h   //columns 8..15 + 3dc + 2

    rev64       v3.8b,  v3.8b               //byte j now holds the value for row j

    st1         {v19.8b},[x2], x3           //store row 0
    sshr        d3, d3,#8                   //drop row 0 (already covered by dst[0][0])

    movi        d20, #0x00000000000000ff    //byte mask for the next row

    // Each pass emits seven rows: byte 0 comes from v3, bytes 1..7 are dc.
    // First pass: rows 1..7. Second pass (nt == 16 only): rows 9..15.
loop_again_col_row:

    bsl         v20.8b,  v3.8b ,  v16.8b    //1st row of the pass

    movi        d21, #0x00000000000000ff    //byte mask for the 2nd row
    sshr        d3, d3,#8                   //advance v3 to the next row

    st1         {v20.8b},[x2], x3
    sqshrun     v4.8b, v26.8h,#2            //smoothed first row, columns 8..15


    bsl         v21.8b,  v3.8b ,  v16.8b    //2nd row of the pass

    movi        d20, #0x00000000000000ff    //byte mask for the 3rd row
    sshr        d3, d3,#8

    st1         {v21.8b},[x2], x3


    bsl         v20.8b,  v3.8b ,  v16.8b    //3rd row of the pass

    movi        d21, #0x00000000000000ff    //byte mask for the 4th row
    sshr        d3, d3,#8

    st1         {v20.8b},[x2], x3


    bsl         v21.8b,  v3.8b ,  v16.8b    //4th row of the pass

    movi        d20, #0x00000000000000ff    //byte mask for the 5th row
    sshr        d3, d3,#8

    st1         {v21.8b},[x2], x3


    bsl         v20.8b,  v3.8b ,  v16.8b    //5th row of the pass

    movi        d21, #0x00000000000000ff    //byte mask for the 6th row
    sshr        d3, d3,#8

    st1         {v20.8b},[x2], x3

    ld1         {v1.8b},[x9]                //src[2nt-16 .. 2nt-9] (rows 15..8, extra when nt == 8)

    bsl         v21.8b,  v3.8b ,  v16.8b    //6th row of the pass

    uxtl        v22.8h, v1.8b

    movi        d20, #0x00000000000000ff    //byte mask for the 7th row
    sshr        d3, d3,#8

    st1         {v21.8b},[x2], x3

    bsl         v20.8b,  v3.8b ,  v16.8b    //7th row of the pass
    add         v22.8h,  v22.8h ,  v24.8h   //rows 15..8 + 3dc + 2

    sshr        d3, d3,#8
    st1         {v20.8b},[x2], x12          //store, then jump 7 rows up and 8 columns right

    subs        x10, x10, #8                //column counter

    beq         end_func                    //nt == 8: block complete
    blt         copy_16                     //nt == 16, second pass done: fill last quadrant


    // nt == 16, first pass done: x2 points at row 0, column 8.
    movi        d20, #0x00000000000000ff    //byte mask for row 8
    sqshrun     v3.8b, v22.8h,#2            //smoothed first column, rows 15..8

    rev64       v3.8b,  v3.8b               //byte j now holds the value for row 8 + j

    st1         {v4.8b},[x2], x3            //row 0, columns 8..15 (smoothed)

    st1         {v16.8b},[x2], x3           //rows 1..7, columns 8..15: plain dc
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x0           //then move to row 8, column 0


    bsl         v20.8b,  v3.8b ,  v16.8b    //row 8
    subs        x10, x10, #8                //counter reaches 0; the loop's subs then makes it negative

    st1         {v20.8b},[x2], x3           //store row 8
    sshr        d3, d3,#8                   //advance v3 to row 9

    movi        d20, #0x00000000000000ff    //byte mask for row 9

    b           loop_again_col_row


    // nt == 16: rows 8..15, columns 8..15 are plain dc.
copy_16:
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2], x3
    st1         {v16.8b},[x2]

    b           end_func

    //--------------------------------------------------------------------------
    // nt == 32: fill the block with dc, four rows (x2, x5, x8, x10) at a time.
    // x9 counts down from 128 in steps of 32, one step per eight rows.
    //--------------------------------------------------------------------------
prologue_cpy_32:
    mov         x9, #128
    add         x5, x2, x3                  //row 1
    add         x8, x5, x3                  //row 2
    add         x10, x8, x3                 //row 3
    dup         v20.16b, v16.b[0]           //dc in all 16 byte lanes
    lsl         x6, x3, #2
    sub         x6, x6, #16                 //4*strd - 16: second half of a row -> start of row + 4

    st1         {v20.16b}, [x2],#16
    st1         {v20.16b}, [x5],#16
    st1         {v20.16b}, [x8],#16
    st1         {v20.16b}, [x10],#16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    sub         x9, x9, #32                 //32x32 prol/epil counter dec

kernel_copy:
    st1         {v20.16b}, [x2],#16
    st1         {v20.16b}, [x5],#16
    st1         {v20.16b}, [x8],#16
    st1         {v20.16b}, [x10],#16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    subs        x9, x9, #32                 //flags consumed by the bne at the loop end

    st1         {v20.16b}, [x2],#16
    st1         {v20.16b}, [x5],#16
    st1         {v20.16b}, [x8],#16
    st1         {v20.16b}, [x10],#16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    bne         kernel_copy

epilogue_copy:
    st1         {v20.16b}, [x2],#16
    st1         {v20.16b}, [x5],#16
    st1         {v20.16b}, [x8],#16
    st1         {v20.16b}, [x10],#16

    st1         {v20.16b}, [x2]
    st1         {v20.16b}, [x5]
    st1         {v20.16b}, [x8]
    st1         {v20.16b}, [x10]

    b           end_func

    //--------------------------------------------------------------------------
    // nt == 4: 8 bytes are loaded from each run but only the first 4 are used.
    //--------------------------------------------------------------------------
dc_4:
    ld1         {v0.8b},[x6],#8             //load from src[nt]
    ld1         {v1.8b},[x8],#8             //load from src[2nt+1]

    uaddlp      v2.4h,  v0.8b
    mov         x5, #0                      //zero for the upper word of d6 / d28
    mov         v6.s[0], w4
    mov         v6.s[1], w5                 //d6 = nt (rounding term of the average)
    uaddlp      v3.4h,  v1.8b

    add         v4.4h,  v2.4h ,  v3.4h


    uaddlp      v5.2s,  v4.4h
    movi        d30, #0x00000000ffffffff

    and         v5.8b,  v5.8b ,  v30.8b     //keep only the sum of the first 4 bytes of each run

    mov         v28.s[0], w14
    mov         v28.s[1], w5                //d28 = src[2nt+1] + src[2nt-1] + 2
    add         d6,  d6 ,  d5               //nt + sum of the 8 neighbours

    sshl        d18, d6, d7                 //dc_val = d6 >> (log2nt + 1)
    mov         x8, x7                      //&src[2nt+1]

    shl         d25, d18,#1                 //2*dc
    sub         x9, x9, #3                  //&src[2nt-4]

    dup         v16.8b, v18.b[0]            //dc_val in every byte lane
    add         d27,  d25 ,  d28            //src[2nt+1] + src[2nt-1] + 2 + 2*dc

    ushr        v29.4h, v27.4h,#2           //dst[0][0] value in v29.b[0]
    add         d23,  d25 ,  d18            //3*dc

    add         d23,  d23 ,  d17            //3*dc + 2

    dup         v24.8h, v23.h[0]            //3*dc + 2 in every halfword lane


    ld1         {v0.8b},[x8]                //src[2nt+1 ..]: columns 0..3 in bytes 0..3
    ld1         {v1.8b},[x9]                //src[2nt-4 ..]: rows 3..0 in bytes 0..3

    uxtl        v20.8h, v0.8b

    uxtl        v22.8h, v1.8b
    add         v20.8h,  v20.8h ,  v24.8h   //columns + 3dc + 2

    add         v22.8h,  v22.8h ,  v24.8h   //rows + 3dc + 2

    movi        d19, #0x00000000000000ff    //select mask: byte 0 only
    sqshrun     v2.8b, v20.8h,#2            //>> 2 and narrow: smoothed first row

    movi        d20, #0x00000000000000ff    //byte mask for row 1
    sqshrun     v3.8b, v22.8h,#2            //>> 2 and narrow: smoothed first column


    bsl         v19.8b,  v29.8b ,  v2.8b    //row 0: byte 0 = dst[0][0], rest from v2

    rev64       v3.8b,  v3.8b               //rows 0..3 now sit in bytes 4..7

    st1         {v19.s}[0],[x2], x3         //store row 0 (4 bytes)
    sshr        d3, d3,#40                  //shift 5 bytes: row 1 lands in byte 0

    movi        d21, #0x00000000000000ff    //byte mask for row 2

    bsl         v20.8b,  v3.8b ,  v16.8b    //row 1
    sshr        d3, d3,#8                   //advance v3 to row 2

    st1         {v20.s}[0],[x2], x3         //store row 1

    bsl         v21.8b,  v3.8b ,  v16.8b    //row 2

    movi        d20, #0x00000000000000ff    //byte mask for row 3

    sshr        d3, d3,#8                   //advance v3 to row 3
    st1         {v21.s}[0],[x2], x3         //store row 2

    bsl         v20.8b,  v3.8b ,  v16.8b    //row 3
    st1         {v20.s}[0],[x2]             //store row 3

epilogue_end:
end_func:
    ret
515
516
517
518
519
520