1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@*******************************************************************************
20@* @file
21@*  ihevc_intra_pred_filters_dc.s
22@*
23@* @brief
24@*  contains function definitions for intra prediction dc filtering.
25@* functions are coded in arm neon assembly and can be compiled using
26@* rvct
27@*
28@*
29@* @author
30@*  akshaya mukund
31@*
32@* @par list of functions:
33@*  - ihevc_intra_pred_luma_dc_a9q()
34@*
35@* @remarks
36@*  none
37@*
38@*******************************************************************************
39@*/
40@/**
41@*******************************************************************************
42@*
43@* @brief
44@*    luma intraprediction filter for dc input
45@*
46@* @par description:
47@*
48@* @param[in] pu1_ref
49@*  uword8 pointer to the source
50@*
51@* @param[out] pu1_dst
52@*  uword8 pointer to the destination
53@*
54@* @param[in] src_strd
55@*  integer source stride
56@*
57@* @param[in] dst_strd
58@*  integer destination stride
59@*
60@* @param[in] pi1_coeff
61@*  word8 pointer to the planar coefficients
62@*
63@* @param[in] nt
64@*  size of transform block
65@*
66@* @param[in] mode
67@*  type of filtering
68@*
69@* @returns
70@*
71@* @remarks
72@*  none
73@*
74@*******************************************************************************
75@*/
76
77@void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
78@                              word32 src_strd,
79@                              uword8 *pu1_dst,
80@                              word32 dst_strd,
81@                              word32 nt,
82@                              word32 mode)
83@
84@**************variables vs registers*****************************************
85@r0 => *pu1_ref
86@r1 => src_strd
87@r2 => *pu1_dst
88@r3 => dst_strd
89
90@stack contents from #104
91@   nt
92@   mode
93@   pi1_coeff
94
@ Byte offset from sp to the stack-passed argument nt, valid once the function
@ prologue has run: the stmfd of r4-r12,r14 pushes 10 words (40 bytes) and the
@ vpush of d8-d15 pushes 8 doublewords (64 bytes), so 40 + 64 = 104.
95.equ    nt_offset,      104
96
@ Code section, exported symbol and symbol type for the routine defined below.
97.text
98.align 4
99
100
101
102
103.globl ihevc_intra_pred_luma_dc_a9q
104
105.type ihevc_intra_pred_luma_dc_a9q, %function
106
@ ihevc_intra_pred_luma_dc_a9q
@
@ DC intra prediction of one nt x nt block of bytes.
@
@ Inputs as used by the code below ("ref" is pu1_ref, "dst" is pu1_dst):
@   r0                = pu1_ref
@   r2                = pu1_dst
@   r3                = dst_strd, added to the destination pointer between rows
@   [sp, #nt_offset]  = nt (read after the register saves in the prologue)
@   r1 (src_strd) and the stack argument 'mode' are never read by this routine.
@
@ Outline:
@   dc = (sum(ref[nt .. 2nt-1]) + sum(ref[2nt+1 .. 3nt]) + nt) >> (log2(nt) + 1)
@   nt == 32      : every destination byte is set to dc (prologue_cpy_32).
@   nt == 4       : handled by the straight-line path at dc_4.
@   otherwise     : 8 and 16 are handled by prologue_col / loop_again_col_row.
@   For the non-32 sizes the block is filled with dc except for row 0 / col 0:
@     dst[0][0] = (ref[2nt-1] + 2*dc + ref[2nt+1] + 2) >> 2
@     dst[0][c] = (ref[2nt+1+c] + 3*dc + 2) >> 2      for c = 1 .. nt-1
@     dst[r][0] = (ref[2nt-1-r] + 3*dc + 2) >> 2      for r = 1 .. nt-1
@   NOTE(review): presumably nt is always one of 4, 8, 16 or 32 - confirm with
@   the callers; no other value is checked for here.
@
@ r4-r12, r14 and d8-d15 are saved on entry and restored on exit; the function
@ returns by loading the saved lr value into pc.
107ihevc_intra_pred_luma_dc_a9q:
108
109    stmfd       sp!, {r4-r12, r14}          @save r4-r12 and lr (10 words = 40 bytes)
110    vpush       {d8 - d15}                  @save d8-d15 (64 bytes)
111    ldr         r4,[sp,#nt_offset]          @loads nt
112
113@********** testing
114    @mov        r6, #128
115    @b      prologue_cpy_32
116@********** testing
117
118    mov         r11, #2                     @mov #2 to r11 (to be used to add to 2dc_val & 3dc_val)
119    mov         r9, #0
120    vmov        d17, r11, r9                @d17 = 2 as a 64-bit value (low word 2, high word 0)
121
122    clz         r5, r4                      @leading zero count of nt
123
124    add         r6, r0, r4                  @&src[nt]
125    rsb         r5, r5, #32                 @32 - clz(nt) = log2(nt) + 1 for a power-of-two nt
126    add         r7, r0, r4, lsl #1          @&src[2nt]
127
128    add         r8, r7, #1                  @&src[2nt+1]
129    mvn         r5, r5                      @mvn followed by add #1 negates r5 ...
130    add         r5, r5, #1                  @... r5 = -(log2(nt) + 1)
131    vdup.32     d8, r5                      @negative shift count in both 32-bit lanes of d8
132
133    ldrb        r14, [r8]                   @src[2nt+1]
134    vshl.i64    d8, d8, #32                 @shift left by 32, then arithmetic right by 32 (below):
135
136    sub         r9, r7, #1                  @&src[2nt-1]
137    vshr.s64    d8, d8, #32                 @d8 = shift count sign-extended to 64 bits
138
139    mov         r7, r8                      @r7 keeps &src[2nt+1] (r8 is advanced by the loads below)
140
141    ldrb        r12, [r9]                   @src[2nt-1]
142    add         r14, r14, r12               @src[2nt+1] + src[2nt-1]
143    add         r14, r14, r11               @src[2nt+1] + src[2nt-1] + 2
144
145    cmp         r4, #4
146    beq         dc_4                        @nt == 4 has its own straight-line path
147
148    mov         r10, r4                     @nt (loop counter, decremented by 8 per 8 samples summed)
149
@ Sum the two nt-sample runs, 8 bytes from each per step. The loads run one
@ step ahead of the arithmetic, so the last pair of loads issued before leaving
@ the loop is never consumed.
150add_loop:
151    vld1.s8     d0, [r6]!                   @load from src[nt]
152    mov         r5, #0                      @r5 = 0, used as the high word when building 64-bit values
153    vld1.s8     d1, [r8]!                   @load from src[2nt+1]
154
155    vpaddl.u8   d2, d0                      @pairwise add bytes -> 4 x u16
156
157    vmov        d6, r4, r5                  @d6 = nt (64-bit), the accumulator starts at nt
158    vpaddl.u8   d3, d1                      @pairwise add bytes -> 4 x u16
159
160    vld1.s8     d0, [r6]!                   @load from src[nt] (next 8 bytes, loaded ahead)
161
162    vld1.s8     d1, [r8]!                   @load from src[2nt+1] (next 8 bytes, loaded ahead)
163    vadd.u16    d4, d2, d3                  @combine the partial sums of both runs
164
165
166    vpaddl.u16  d5, d4                      @pairwise add -> 2 x u32
167
168
169    vpadal.u32  d6, d5                      @accumulate all inp into d6 (end for nt==8)
170
171    subs        r10, #8
172    beq         epil_add_loop               @all nt samples summed (nt == 8)
173
174core_loop_add:
175    vpaddl.u8   d2, d0                      @same reduction as above on the data loaded ahead
176    subs        r10, #8                     @flags are consumed by the bne at the end of the loop
177    vpaddl.u8   d3, d1
178
179
180
181    vadd.u16    d4, d2, d3
182    vld1.s8     d0, [r6]!                   @load from src[nt] (loaded ahead for the next pass)
183
184    vpaddl.u16  d5, d4
185    vld1.s8     d1, [r8]!                   @load from src[2nt+1] (loaded ahead for the next pass)
186
187    vpadal.u32  d6, d5                      @accumulate all inp into d6
188    bne         core_loop_add
189
190epil_add_loop:
191
192    vshl.s64    d9, d6, d8                  @dc_val = d6 >> (log2nt+1) (negative count in d8 shifts right)
193    cmp         r4, #32                     @flags stay valid for the conditional instructions below
194
195    vmov        d28, r14, r5                @src[2nt+1]+2+src[2nt-1] moved to d28
196    moveq       r6, #128                    @NOTE(review): overwritten at prologue_cpy_32 before being read
197
198    vdup.8      d16, d9[0]                  @dc_val in all 8 bytes of d16
199    vshl.s64    d13, d9, #1                 @2*dc
200
201    beq         prologue_cpy_32             @nt == 32: plain dc fill
202
203    vadd.i64    d14, d13, d28               @src[2nt+1]+2+src[2nt-1]+2dc_val
204    movne       r6, #0                      @NOTE(review): r6 is not read again on this path
205
206    vshr.u16    d15, d14, #2                @final dst[0]'s value in d15[0]
207    movne       r10, r4                     @r10 = nt, counter for the 8-row passes below
208
209    vadd.i64    d11, d13, d9                @3*dc
210    sub         r12, r3, r3, lsl #3         @-7*strd
211
212    vadd.i64    d11, d11, d17               @3*dc + 2
213    add         r12, r12, #8                @offset after one 8x8 block (-7*strd + 8)
214
215    vdup.16     q12, d11[0]                 @3*dc + 2 (moved to all lanes)
216    sub         r0, r3, r4                  @strd - nt
217
218prologue_col:
219    @0th column and 0-7 rows done here
220    @r8 and r9 (2nt+1+col 2nt-1-row)
221
222    mov         r8, r7                      @&src[2nt+1]
223
224    add         r0, r0, #8                  @strd - nt + 8
225    vld1.s8     d0, [r8]!                   @src[2nt+1 .. 2nt+8]: neighbours for row 0, cols 0..7
226    sub         r9, r9, #7                  @&src[2nt-8]
227
228    vld1.s8     d1, [r9]                    @src[2nt-8 .. 2nt-1]: neighbours for col 0, rows 7..0
229    sub         r9, r9, #8                  @&src[2nt-16], neighbours for rows 15..8
230
231    vmovl.u8    q10, d0                     @widen to 16 bits
232
233    vld1.s8     d6, [r8]                    @col 8::15 load (prol extra)
234    vadd.i16    q10, q10, q12               @cols 0::7 add 3dc+2 (prol)
235
236    vmovl.u8    q11, d1
237    vqshrun.s16 d2, q10, #2                 @columns shr2 movn (prol)
238
239    vmovl.u8    q13, d6
240    vadd.i16    q11, q11, q12               @rows 0::7 add 3dc+2 (prol)
241
242    vmov.i64    d19, #0x00000000000000ff    @select mask: byte 0 only
243    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)
244
245    vbsl        d19, d15, d2                @row 0: byte 0 = dst[0] from d15, bytes 1..7 from d2
246    vadd.i16    q13, q13, q12               @col 8::15 add 3dc+2 (prol extra)
247
248    vrev64.8    d3, d3                      @reverse bytes so that byte r holds the value for row r
249
250    vst1.8      d19, [r2], r3               @store row 0 (prol)
251    vshr.s64    d3, d3, #8                  @drop the row 0 byte; byte 0 now holds row 1
252
253    vmov.i64    d20, #0x00000000000000ff    @byte mask row 1 (prol)
254
@ Stores 7 rows of 8 bytes: byte 0 comes from the filtered column held in d3
@ (shifted down one byte per row), bytes 1..7 are dc. Entered once from
@ prologue_col for rows 1..7 and, for nt == 16, a second time for rows 9..15.
255loop_again_col_row:
256
257    vbsl        d20, d3, d16                @row 1  (prol)
258
259    vmov.i64    d21, #0x00000000000000ff    @byte mask row 2 (prol)
260    vshr.s64    d3, d3, #8                  @row 1 shift (prol)
261
262    vst1.8      d20, [r2], r3               @store row 1 (prol)
263    vqshrun.s16 d4, q13, #2                 @columns shr2 movn (prol extra): row 0, cols 8..15
264
265
266    vbsl        d21, d3, d16                @row 2 (prol)
267
268    vmov.i64    d20, #0x00000000000000ff    @byte mask row 3 (prol)
269    vshr.s64    d3, d3, #8                  @row 2 shift (prol)
270
271    vst1.8      d21, [r2], r3               @store row 2 (prol)
272
273
274    vbsl        d20, d3, d16                @row 3  (prol)
275
276    vmov.i64    d21, #0x00000000000000ff    @byte mask row 4 (prol)
277    vshr.s64    d3, d3, #8                  @row 3 shift (prol)
278
279    vst1.8      d20, [r2], r3               @store row 3 (prol)
280
281
282    vbsl        d21, d3, d16                @row 4 (prol)
283
284    vmov.i64    d20, #0x00000000000000ff    @byte mask row 5 (prol)
285    vshr.s64    d3, d3, #8                  @row 4 shift (prol)
286
287    vst1.8      d21, [r2], r3               @store row 4 (prol)
288
289
290    vbsl        d20, d3, d16                @row 5 (prol)
291
292    vmov.i64    d21, #0x00000000000000ff    @byte mask row 6 (prol)
293    vshr.s64    d3, d3, #8                  @row 5 shift (prol)
294
295    vst1.8      d20, [r2], r3               @store row 5 (prol)
296
297    vld1.s8     d1, [r9]                    @src[2nt-16 .. 2nt-9]: neighbours for rows 15..8
298
299    vbsl        d21, d3, d16                @row 6 (prol)
300
301    vmovl.u8    q11, d1
302
303    vmov.i64    d20, #0x00000000000000ff    @byte mask row 7 (prol)
304    vshr.s64    d3, d3, #8                  @row 6 shift (prol)
305
306    vst1.8      d21, [r2], r3               @store row 6 (prol)
307
308    vbsl        d20, d3, d16                @row 7 (prol)
309    vadd.i16    q11, q11, q12               @row 8::15 add 3dc+2 (prol extra)
310
311    vshr.s64    d3, d3, #8                  @row 7 shift (prol)
312    vst1.8      d20, [r2], r12              @store row 7, then step back 7 rows and right 8 bytes
313
314    subs        r10, r10, #8                @counter for cols
315
316    beq         end_func                    @r10 == 0: nt == 8, block complete
317    blt         copy_16                     @r10 < 0: second pass of nt == 16 finished
318
319
@ nt == 16, after the first pass: finish the top-right 8x8 (filtered row 0,
@ then 7 rows of dc), store row 8 of the left half and re-enter the loop above.
320    vmov.i64    d20, #0x00000000000000ff    @byte mask row 8
321    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (rows 8::15)
322
323    vrev64.8    d3, d3                      @byte 0 = row 8 ... byte 7 = row 15
324
325    vst1.8      d4, [r2], r3                @store row 0, cols 8..15 (for 16x16)
326
327    vst1.8      d16, [r2], r3               @rows 1..7, cols 8..15: plain dc
328    vst1.8      d16, [r2], r3
329    vst1.8      d16, [r2], r3
330    vst1.8      d16, [r2], r3
331    vst1.8      d16, [r2], r3
332    vst1.8      d16, [r2], r3
333    vst1.8      d16, [r2], r0               @advance by strd - nt + 8: row 8, col 0
334
335
336    vbsl        d20, d3, d16                @row 8: byte 0 filtered, bytes 1..7 dc
337    subs        r10, r10, #8                @r10 = 0, so the next pass of the loop ends in copy_16
338
339    vst1.8      d20, [r2], r3               @store row 8
340    vshr.s64    d3, d3, #8                  @drop the row 8 byte; byte 0 now holds row 9
341
342    vmov.i64    d20, #0x00000000000000ff    @byte mask for the first row stored by the loop (row 9)
343
344    b           loop_again_col_row
345
346
@ nt == 16: remaining 8x8 quadrant (rows 8..15, cols 8..15) is plain dc.
347copy_16:
348    vst1.8      d16, [r2], r3
349    vst1.8      d16, [r2], r3
350    vst1.8      d16, [r2], r3
351    vst1.8      d16, [r2], r3
352    vst1.8      d16, [r2], r3
353    vst1.8      d16, [r2], r3
354    vst1.8      d16, [r2], r3
355    vst1.8      d16, [r2]
356
357    b           end_func
358
@ nt == 32: fill all 32 rows of 32 bytes with dc. Four row pointers (r2, r5,
@ r8, r10) are advanced together; each group of eight stores writes 4 rows.
@ The prologue writes one group, each kernel_copy pass writes two groups and
@ the epilogue writes the last one. r9 goes 128 -> 96 in the prologue and is
@ reduced by 32 per kernel_copy pass until it reaches 0 (three passes).
359prologue_cpy_32:
360    mov         r9, #128                    @loop counter
361    @sub        r7, r3, #-24
362    add         r5, r2, r3                  @row 1
363    add         r8, r5, r3                  @row 2
364    add         r10, r8, r3                 @row 3
365    vdup.8      q10, d16[0]                 @dc in all 16 bytes of q10
366    lsl         r6, r3, #2                  @4*strd
367    add         r6, r6, #0xfffffff0         @4*strd - 16: from byte 16 of a row to byte 0 four rows down
368
369    vst1.8      {d20,d21}, [r2]!            @bytes 0..15 of rows 0..3
370    vst1.8      {d20,d21}, [r5]!
371    vst1.8      {d20,d21}, [r8]!
372    vst1.8      {d20,d21}, [r10]!
373
374    vst1.8      {d20,d21}, [r2], r6         @bytes 16..31 of rows 0..3, then move down 4 rows
375    vst1.8      {d20,d21}, [r5], r6
376    vst1.8      {d20,d21}, [r8], r6
377    vst1.8      {d20,d21}, [r10], r6
378
379    sub         r9, r9, #32                 @32x32 prol/epil counter dec
380
381kernel_copy:
382    vst1.8      {d20,d21}, [r2]!
383    vst1.8      {d20,d21}, [r5]!
384    vst1.8      {d20,d21}, [r8]!
385    vst1.8      {d20,d21}, [r10]!
386
387    vst1.8      {d20,d21}, [r2], r6
388    vst1.8      {d20,d21}, [r5], r6
389    vst1.8      {d20,d21}, [r8], r6
390    vst1.8      {d20,d21}, [r10], r6
391
392    subs        r9, r9, #32                 @flags are consumed by the bne at the end of the pass
393
394    vst1.8      {d20,d21}, [r2]!
395    vst1.8      {d20,d21}, [r5]!
396    vst1.8      {d20,d21}, [r8]!
397    vst1.8      {d20,d21}, [r10]!
398
399    vst1.8      {d20,d21}, [r2], r6
400    vst1.8      {d20,d21}, [r5], r6
401    vst1.8      {d20,d21}, [r8], r6
402    vst1.8      {d20,d21}, [r10], r6
403
404    bne         kernel_copy
405
406epilogue_copy:
407    vst1.8      {d20,d21}, [r2]!            @last 4 rows, bytes 0..15
408    vst1.8      {d20,d21}, [r5]!
409    vst1.8      {d20,d21}, [r8]!
410    vst1.8      {d20,d21}, [r10]!
411
412    vst1.8      {d20,d21}, [r2]             @last 4 rows, bytes 16..31 (no pointer update needed)
413    vst1.8      {d20,d21}, [r5]
414    vst1.8      {d20,d21}, [r8]
415    vst1.8      {d20,d21}, [r10]
416
417    b           end_func
418
419
@ nt == 4: same computation as the 8/16 path, done once without loops. The
@ loads still fetch 8 bytes; only the low 32-bit lane of the reduction (the
@ first four bytes of each run) is kept for the sum.
420dc_4:
421    vld1.s8     d0, [r6]!                   @load from src[nt]
422    vld1.s8     d1, [r8]!                   @load from src[2nt+1]
423
424    vpaddl.u8   d2, d0
425    mov         r5, #0                      @r5 = 0, high word for the 64-bit values built below
426    vmov        d6, r4, r5                  @store nt to accumulate
427    vpaddl.u8   d3, d1
428
429    vadd.u16    d4, d2, d3
430
431
432    vpaddl.u16  d5, d4
433    vmov.i64    d30, #0x00000000ffffffff    @mask keeping the low 32-bit lane only
434
435    vand        d5, d5, d30                 @discard the sum of bytes 4..7 of each load
436
437    vmov        d28, r14, r5                @src[2nt+1]+2+src[2nt-1] moved to d28
438    vadd.i64    d6, d6, d5                  @accumulate all inp into d6 (nt + sum of the 8 samples)
439
440    vshl.s64    d9, d6, d8                  @dc_val = d6 >> (log2nt+1) (negative count in d8 shifts right)
441    mov         r8, r7                      @&src[2nt+1]
442
443    vshl.s64    d13, d9, #1                 @2*dc
444    sub         r9, r9, #3                  @&src[2nt-4]
445
446    vdup.8      d16, d9[0]                  @dc_val
447    vadd.i64    d14, d13, d28               @src[2nt+1]+2+src[2nt-1]+2dc_val
448
449    vshr.u16    d15, d14, #2                @final dst[0]'s value in d15[0]
450    sub         r12, r3, r3, lsl #2         @-3*strd (NOTE(review): r12 is not read on this path)
451    vadd.i64    d11, d13, d9                @3*dc
452
453    vadd.i64    d11, d11, d17               @3*dc + 2
454    add         r12, r12, #4                @offset after one 4x4 block (-3*strd + 4)
455
456    vdup.16     q12, d11[0]                 @3*dc + 2 (moved to all lanes)
457    sub         r0, r3, r4                  @strd - nt (NOTE(review): r0 is not read on this path)
458
459
460    vld1.s8     d0, [r8]                    @src[2nt+1 ..]: neighbours for row 0 (first 4 bytes used)
461    vld1.s8     d1, [r9]                    @src[2nt-4 ..]: first 4 bytes are neighbours for rows 3..0
462
463    vmovl.u8    q10, d0
464
465    vmovl.u8    q11, d1
466    vadd.i16    q10, q10, q12               @cols add 3dc+2 (prol)
467
468    vadd.i16    q11, q11, q12               @rows add 3dc+2 (prol)
469
470    vmov.i64    d19, #0x00000000000000ff    @select mask: byte 0 only
471    vqshrun.s16 d2, q10, #2                 @columns shr2 movn (prol)
472
473    vmov.i64    d20, #0x00000000000000ff    @byte mask row 1 (prol)
474    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)
475
476
477    vbsl        d19, d15, d2                @row 0: byte 0 = dst[0] from d15, rest from d2
478
479    vrev64.8    d3, d3                      @after the reversal bytes 4..7 hold rows 0..3
480
481    vst1.32     d19[0], [r2], r3            @store row 0 (4 bytes)
482    vshr.s64    d3, d3, #40                 @shift out 5 bytes (incl. row 0); byte 0 now holds row 1
483
484    vmov.i64    d21, #0x00000000000000ff    @byte mask row 2 (prol)
485
486    vbsl        d20, d3, d16                @row 1  (prol)
487    vshr.s64    d3, d3, #8                  @row 1 shift (prol)
488
489    vst1.32     d20[0], [r2], r3            @store row 1 (prol)
490
491    vbsl        d21, d3, d16                @row 2 (prol)
492
493    vmov.i64    d20, #0x00000000000000ff    @byte mask row 3 (prol)
494
495    vshr.s64    d3, d3, #8                  @row 2 shift (prol)
496    vst1.32     d21[0], [r2], r3            @store row 2 (prol)
497
498    vbsl        d20, d3, d16                @row 3  (prol)
499    vst1.32     d20[0], [r2]                @store row 3 (prol)
500
@ Common exit: restore the saved NEON and core registers and return.
501epilogue_end:
502end_func:
503    vpop        {d8 - d15}
504    ldmfd       sp!,{r4-r12,r15}            @restore r4-r12 and return (saved lr is loaded into pc)
505
506
507
508
509
510
511
512